diff --git a/doc/src/Manual.txt b/doc/src/Manual.txt
index a5a874fa8bcb4bcd57bc914273a3db4272ebb7a8..bceb1740177ee0ac737b6efcf33c9212bf17a95b 100644
--- a/doc/src/Manual.txt
+++ b/doc/src/Manual.txt
@@ -1,7 +1,7 @@
 <!-- HTML_ONLY -->
 <HEAD>
 <TITLE>LAMMPS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="11 Apr 2017 version">
+<META NAME="docnumber" CONTENT="4 May 2017 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation.  This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@@ -21,7 +21,7 @@
 <H1></H1>
 
 LAMMPS Documentation :c,h3
-11 Apr 2017 version :c,h4
+4 May 2017 version :c,h4
 
 Version info: :h4
 
@@ -158,12 +158,11 @@ END_RST -->
   2.1 "What's in the LAMMPS distribution"_start_1 :ulb,b
   2.2 "Making LAMMPS"_start_2 :b
   2.3 "Making LAMMPS with optional packages"_start_3 :b
-  2.4 "Building LAMMPS via the Make.py script"_start_4 :b
-  2.5 "Building LAMMPS as a library"_start_5 :b
-  2.6 "Running LAMMPS"_start_6 :b
-  2.7 "Command-line options"_start_7 :b
-  2.8 "Screen output"_start_8 :b
-  2.9 "Tips for users of previous versions"_start_9 :ule,b
+  2.4 "Building LAMMPS as a library"_start_4 :b
+  2.5 "Running LAMMPS"_start_5 :b
+  2.6 "Command-line options"_start_6 :b
+  2.7 "Screen output"_start_7 :b
+  2.8 "Tips for users of previous versions"_start_8 :ule,b
 "Commands"_Section_commands.html :l
   3.1 "LAMMPS input script"_cmd_1 :ulb,b
   3.2 "Parsing rules"_cmd_2 :b
diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt
index 3f1d6ff2039cd77b32b8bbb888422731c3e5c952..c71acfe06f2f081fbefacfb1d8bcf4564b74c8db 100644
--- a/doc/src/Section_commands.txt
+++ b/doc/src/Section_commands.txt
@@ -1052,7 +1052,7 @@ package"_Section_start.html#start_3.
 "oxdna2/excv"_pair_oxdna2.html,
 "oxdna2/stk"_pair_oxdna2.html,
 "quip"_pair_quip.html,
-"reax/c (k)"_pair_reax_c.html,
+"reax/c (k)"_pair_reaxc.html,
 "smd/hertz"_pair_smd_hertz.html,
 "smd/tlsph"_pair_smd_tlsph.html,
 "smd/triangulated/surface"_pair_smd_triangulated_surface.html,
diff --git a/doc/src/Section_errors.txt b/doc/src/Section_errors.txt
index 832c5718ab6bc738c82959da47c856f598a296ed..5e0574b390ff4b3da9c13a98ce7636ad349ffa1f 100644
--- a/doc/src/Section_errors.txt
+++ b/doc/src/Section_errors.txt
@@ -11171,6 +11171,12 @@ Self-explanatory. :dd
 If the fix changes the timestep, the dump dcd file will not
 reflect the change. :dd
 
+{Energy due to X extra global DOFs will be included in minimizer energies} :dt
+
+When using fixes like box/relax, the potential energy used by the minimizer
+is augmented by an additional energy provided by the fix. Thus the printed
+converged energy may be different from the total potential energy. :dd
+
 {Energy tally does not account for 'zero yes'} :dt
 
 The energy removed by using the 'zero yes' flag is not accounted
diff --git a/doc/src/Section_intro.txt b/doc/src/Section_intro.txt
index 33c3cf395facc5981bd02ab78487033f9c8c8d1f..bfb6ef390171f67cc4df16f5d1a4486d427cb469 100644
--- a/doc/src/Section_intro.txt
+++ b/doc/src/Section_intro.txt
@@ -249,8 +249,12 @@ Pizza.py WWW site"_pizza. :l
 
 Specialized features :h5
 
-These are LAMMPS capabilities which you may not think of as typical
-molecular dynamics options:
+LAMMPS can be built with optional packages which implement a variety
+of additional capabilities.  An overview of all the packages is "given
+here"_Section_packages.html.
+
+These are some LAMMPS capabilities which you may not think of as
+typical classical molecular dynamics options:
 
 "static"_balance.html and "dynamic load-balancing"_fix_balance.html
 "generalized aspherical particles"_body.html
@@ -515,7 +519,7 @@ the packages they have written are somewhat unique to LAMMPS and the
 code would not be as general-purpose as it is without their expertise
 and efforts.
 
-Axel Kohlmeyer (Temple U), akohlmey at gmail.com, SVN and Git repositories, indefatigable mail list responder, USER-CG-CMM and USER-OMP packages
+Axel Kohlmeyer (Temple U), akohlmey at gmail.com, SVN and Git repositories, indefatigable mail list responder, USER-CGSDK and USER-OMP packages
 Roy Pollock (LLNL), Ewald and PPPM solvers
 Mike Brown (ORNL), brownw at ornl.gov, GPU package
 Greg Wagner (Sandia), gjwagne at sandia.gov, MEAM package for MEAM potential
diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index b327b7b1ceccebbbd77111a5d239ba5cfb6ea4dd..2a0a8386e8886d18ff2c33161469de5c58682634 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -10,1895 +10,2592 @@ Section"_Section_accelerate.html :c
 
 4. Packages :h3
 
-This section gives an overview of the add-on optional packages that
-extend LAMMPS functionality.  Packages are groups of files that enable
-a specific set of features.  For example, force fields for molecular
-systems or granular systems are in packages.  You can see the list of
-all packages by typing "make package" from within the src directory of
-the LAMMPS distribution.
-
-Here are links for two tables below, which list standard and user
-packages.
-
-4.1 "Standard packages"_#pkg_1
-4.2 "User packages"_#pkg_2 :all(b)
-
-"Section 2.3"_Section_start.html#start_3 of the manual describes
-the difference between standard packages and user packages.  It also
-has general details on how to include/exclude specific packages as
-part of the LAMMPS build process, and on how to build auxiliary
-libraries or modify a machine Makefile if a package requires it.
-
-Following the two tables below, is a sub-section for each package.  It
-has a summary of what the package contains.  It has specific
-instructions on how to install it, build or obtain any auxiliary
-library it requires, and any Makefile.machine changes it requires.  It
-also lists pointers to examples of its use or documentation provided
-in the LAMMPS distribution.  If you want to know the complete list of
-commands that a package adds to LAMMPS, simply list the files in its
-directory, e.g. "ls src/GRANULAR".  Source files with names that start
-with compute, fix, pair, bond, etc correspond to command styles with
-the same names.
-
-NOTE: The USER package sub-sections below are still being filled in,
-as of March 2016.
-
-Unless otherwise noted below, every package is independent of all the
-others.  I.e. any package can be included or excluded in a LAMMPS
-build, independent of all other packages.  However, note that some
-packages include commands derived from commands in other packages.  If
-the other package is not installed, the derived command from the new
-package will also not be installed when you include the new one.
-E.g. the pair lj/cut/coul/long/omp command from the USER-OMP package
-will not be installed as part of the USER-OMP package if the KSPACE
-package is not also installed, since it contains the pair
-lj/cut/coul/long command.  If you later install the KSPACE package and
-the USER-OMP package is already installed, both the pair
-lj/cut/coul/long and lj/cut/coul/long/omp commands will be installed.
-
-:line
-
-4.1 Standard packages :h4,link(pkg_1)
-
-The current list of standard packages is as follows.  Each package
-name links to a sub-section below with more details.
-
-Package, Description, Author(s), Doc page, Example, Library
-"ASPHERE"_#ASPHERE, aspherical particles, -, "Section 6.6.14"_Section_howto.html#howto_14, ellipse, -
-"BODY"_#BODY, body-style particles, -, "body"_body.html, body, -
-"CLASS2"_#CLASS2, class 2 force fields, -, "pair_style lj/class2"_pair_class2.html, -, -
-"COLLOID"_#COLLOID, colloidal particles, Kumar (1), "atom_style colloid"_atom_style.html, colloid, -
-"COMPRESS"_#COMPRESS, I/O compression, Axel Kohlmeyer (Temple U), "dump */gz"_dump.html, -, -
-"CORESHELL"_#CORESHELL, adiabatic core/shell model, Hendrik Heenen (Technical U of Munich), "Section 6.6.25"_Section_howto.html#howto_25, coreshell, -
-"DIPOLE"_#DIPOLE, point dipole particles, -, "pair_style dipole/cut"_pair_dipole.html, dipole, -
-"GPU"_#GPU, GPU-enabled styles, Mike Brown (ORNL), "Section 5.3.1"_accelerate_gpu.html, gpu, lib/gpu
-"GRANULAR"_#GRANULAR, granular systems, -, "Section 6.6.6"_Section_howto.html#howto_6, pour, -
-"KIM"_#KIM, openKIM potentials, Smirichinski & Elliot & Tadmor (3), "pair_style kim"_pair_kim.html, kim, KIM
-"KOKKOS"_#KOKKOS, Kokkos-enabled styles, Trott & Moore (4), "Section 5.3.3"_accelerate_kokkos.html, kokkos, lib/kokkos
-"KSPACE"_#KSPACE, long-range Coulombic solvers, -, "kspace_style"_kspace_style.html, peptide, -
-"MANYBODY"_#MANYBODY, many-body potentials, -, "pair_style tersoff"_pair_tersoff.html, shear, -
-"MEAM"_#MEAM, modified EAM potential, Greg Wagner (Sandia), "pair_style meam"_pair_meam.html, meam, lib/meam
-"MC"_#MC, Monte Carlo options, -, "fix gcmc"_fix_gcmc.html, -, -
-"MOLECULE"_#MOLECULE, molecular system force fields, -, "Section 6.6.3"_Section_howto.html#howto_3, peptide, -
-"OPT"_#OPT, optimized pair styles, Fischer & Richie & Natoli (2), "Section 5.3.5"_accelerate_opt.html, -, -
-"PERI"_#PERI, Peridynamics models, Mike Parks (Sandia), "pair_style peri"_pair_peri.html, peri, -
-"POEMS"_#POEMS, coupled rigid body motion, Rudra Mukherjee (JPL), "fix poems"_fix_poems.html, rigid, lib/poems
-"PYTHON"_#PYTHON, embed Python code in an input script, -, "python"_python.html, python, lib/python
-"REAX"_#REAX, ReaxFF potential, Aidan Thompson (Sandia), "pair_style reax"_pair_reax.html, reax, lib/reax
-"REPLICA"_#REPLICA, multi-replica methods, -, "Section 6.6.5"_Section_howto.html#howto_5, tad, -
-"RIGID"_#RIGID, rigid bodies, -, "fix rigid"_fix_rigid.html, rigid, -
-"SHOCK"_#SHOCK, shock loading methods, -, "fix msst"_fix_msst.html, -, -
-"SNAP"_#SNAP, quantum-fit potential, Aidan Thompson (Sandia), "pair snap"_pair_snap.html, snap, -
-"SRD"_#SRD, stochastic rotation dynamics, -, "fix srd"_fix_srd.html, srd, -
-"VORONOI"_#VORONOI, Voronoi tesselations, Daniel Schwen (LANL), "compute voronoi/atom"_compute_voronoi_atom.html, -, Voro++
-:tb(ea=c)
-
-The "Authors" column lists a name(s) if a specific person is
-responsible for creating and maintaining the package.
-
-(1) The COLLOID package includes Fast Lubrication Dynamics pair styles
-which were created by Amit Kumar and Michael Bybee from Jonathan
-Higdon's group at UIUC.
-
-(2) The OPT package was created by James Fischer (High Performance
-Technologies), David Richie, and Vincent Natoli (Stone Ridge
-Technolgy).
-
-(3) The KIM package was created by Valeriu Smirichinski, Ryan Elliott,
-and Ellad Tadmor (U Minn).
-
-(4) The KOKKOS package was created primarily by Christian Trott and
-Stan Moore (Sandia).  It uses the Kokkos library which was developed
-by Carter Edwards, Christian Trott, and others at Sandia.
+This section gives an overview of the optional packages that extend
+LAMMPS functionality with instructions on how to build LAMMPS with
+each of them.  Packages are groups of files that enable a specific set
+of features.  For example, force fields for molecular systems or
+granular systems are in packages.  You can see the list of all
+packages and "make" commands to manage them by typing "make package"
+from within the src directory of the LAMMPS distribution.  "Section
+2.3"_Section_start.html#start_3 gives general info on how to install
+and un-install packages as part of the LAMMPS build process.
+
+There are two kinds of packages in LAMMPS, standard and user packages:
+
+"Table of standard packages"_#table_standard
+"Table of user packages"_#table_user :ul
+
+Standard packages are supported by the LAMMPS developers and are
+written in a syntax and style consistent with the rest of LAMMPS.
+This means the developers will answer questions about them, debug and
+fix them if necessary, and keep them compatible with future changes to
+LAMMPS.
+
+User packages have been contributed by users, and begin with the
+"user" prefix.  If they are a single command (single file), they are
+typically in the user-misc package.  User packages don't necessarily
+meet the requirements of the standard packages.  If you have problems
+using a feature provided in a user package, you may need to contact
+the contributor directly to get help.  Information on how to submit
+additions you make to LAMMPS as single files or as a standard or user
+package is given in "this section"_Section_modify.html#mod_15 of the
+manual.
+
+Following the next two tables is a sub-section for each package.  It
+lists authors (if applicable) and summarizes the package contents.  It
+has specific instructions on how to install the package, including (if
+necessary) downloading or building any extra library it requires. It
+also gives links to documentation, example scripts, and
+pictures/movies (if available) that illustrate use of the package.
+
+NOTE: To see the complete list of commands a package adds to LAMMPS,
+just look at the files in its src directory, e.g. "ls src/GRANULAR".
+Files with names that start with fix, compute, atom, pair, bond,
+angle, etc correspond to commands with the same style names.
+
+In these two tables, the "Example" column is a sub-directory in the
+examples directory of the distribution which has an input script that
+uses the package.  E.g. "peptide" refers to the examples/peptide
+directory; USER/atc refers to the examples/USER/atc directory.  The
+"Library" column indicates whether an extra library is needed to build
+and use the package:
+
+dash = no library
+sys = system library: you likely have it on your machine
+int = internal library: provided with LAMMPS, but you may need to build it
+ext = external library: you will need to download and install it on your machine :ul
 
-The "Doc page" column links to either a sub-section of the
-"Section 6"_Section_howto.html of the manual, or an input script
-command implemented as part of the package, or to additional
-documentation provided within the package.
-
-The "Example" column is a sub-directory in the examples directory of
-the distribution which has an input script that uses the package.
-E.g. "peptide" refers to the examples/peptide directory.
+:line
+:line
 
-The "Library" column lists an external library which must be built
-first and which LAMMPS links to when it is built.  If it is listed as
-lib/package, then the code for the library is under the lib directory
-of the LAMMPS distribution.  See the lib/package/README file for info
-on how to build the library.  If it is not listed as lib/package, then
-it is a third-party library not included in the LAMMPS distribution.
-See details on all of this below for individual packages.
+[Standard packages] :link(table_standard),p
+
+Package, Description, Doc page, Example, Library
+"ASPHERE"_#ASPHERE, aspherical particle models, "Section 6.6.14"_Section_howto.html#howto_14, ellipse, -
+"BODY"_#BODY, body-style particles, "body"_body.html, body, -
+"CLASS2"_#CLASS2, class 2 force fields, "pair_style lj/class2"_pair_class2.html, -, -
+"COLLOID"_#COLLOID, colloidal particles, "atom_style colloid"_atom_style.html, colloid, -
+"COMPRESS"_#COMPRESS, I/O compression, "dump */gz"_dump.html, -, sys
+"CORESHELL"_#CORESHELL, adiabatic core/shell model, "Section 6.6.25"_Section_howto.html#howto_25, coreshell, -
+"DIPOLE"_#DIPOLE, point dipole particles, "pair_style dipole/cut"_pair_dipole.html, dipole, -
+"GPU"_#GPU, GPU-enabled styles, "Section 5.3.1"_accelerate_gpu.html, WWW bench, int
+"GRANULAR"_#GRANULAR, granular systems, "Section 6.6.6"_Section_howto.html#howto_6, pour, -
+"KIM"_#KIM, openKIM wrapper, "pair_style kim"_pair_kim.html, kim, ext
+"KOKKOS"_#KOKKOS, Kokkos-enabled styles, "Section 5.3.3"_accelerate_kokkos.html, WWW bench, -
+"KSPACE"_#KSPACE, long-range Coulombic solvers, "kspace_style"_kspace_style.html, peptide, -
+"MANYBODY"_#MANYBODY, many-body potentials, "pair_style tersoff"_pair_tersoff.html, shear, -
+"MC"_#MC, Monte Carlo options, "fix gcmc"_fix_gcmc.html, -, -
+"MEAM"_#MEAM, modified EAM potential, "pair_style meam"_pair_meam.html, meam, int
+"MISC"_#MISC, miscellanous single-file commands, -, -, -
+"MOLECULE"_#MOLECULE, molecular system force fields, "Section 6.6.3"_Section_howto.html#howto_3, peptide, -
+"MPIIO"_#MPIIO, MPI parallel I/O dump and restart, "dump"_dump.html, -, -
+"MSCG"_#MSCG, multi-scale coarse-graining wrapper, "fix mscg"_fix_mscg.html, mscg, ext
+"OPT"_#OPT, optimized pair styles, "Section 5.3.5"_accelerate_opt.html, WWW bench, -
+"PERI"_#PERI, Peridynamics models, "pair_style peri"_pair_peri.html, peri, -
+"POEMS"_#POEMS, coupled rigid body motion, "fix poems"_fix_poems.html, rigid, int
+"PYTHON"_#PYTHON, embed Python code in an input script, "python"_python.html, python, sys
+"QEQ"_#QEQ, QEq charge equilibration, "fix qeq"_fix_qeq.html, qeq, -
+"REAX"_#REAX, ReaxFF potential (Fortran), "pair_style reax"_pair_reax.html, reax, int
+"REPLICA"_#REPLICA, multi-replica methods, "Section 6.6.5"_Section_howto.html#howto_5, tad, -
+"RIGID"_#RIGID, rigid bodies and constraints, "fix rigid"_fix_rigid.html, rigid, -
+"SHOCK"_#SHOCK, shock loading methods, "fix msst"_fix_msst.html, -, -
+"SNAP"_#SNAP, quantum-fitted potential, "pair snap"_pair_snap.html, snap, -
+"SRD"_#SRD, stochastic rotation dynamics, "fix srd"_fix_srd.html, srd, -
+"VORONOI"_#VORONOI, Voronoi tesselation, "compute voronoi/atom"_compute_voronoi_atom.html, -, ext
+:tb(ea=c,ca1=l)
+
+[USER packages] :link(table_user),p
+
+Package, Description, Doc page, Example, Library
+"USER-ATC"_#USER-ATC, atom-to-continuum coupling, "fix atc"_fix_atc.html, USER/atc, int
+"USER-AWPMD"_#USER-AWPMD, wave-packet MD, "pair_style awpmd/cut"_pair_awpmd.html, USER/awpmd, int
+"USER-CGDNA"_#USER-CGDNA, coarse-grained DNA force fields, src/USER-CGDNA/README, USER/cgdna, -
+"USER-CGSDK"_#USER-CGSDK, SDK coarse-graining model, "pair_style lj/sdk"_pair_sdk.html, USER/cgsdk, -
+"USER-COLVARS"_#USER-COLVARS, collective variables library, "fix colvars"_fix_colvars.html, USER/colvars, int
+"USER-DIFFRACTION"_#USER-DIFFRACTION, virtual x-ray and electron diffraction,"compute xrd"_compute_xrd.html, USER/diffraction, -
+"USER-DPD"_#USER-DPD, reactive dissipative particle dynamics, src/USER-DPD/README, USER/dpd, -
+"USER-DRUDE"_#USER-DRUDE, Drude oscillators, "tutorial"_tutorial_drude.html, USER/drude, -
+"USER-EFF"_#USER-EFF, electron force field,"pair_style eff/cut"_pair_eff.html, USER/eff, -
+"USER-FEP"_#USER-FEP, free energy perturbation,"compute fep"_compute_fep.html, USER/fep, -
+"USER-H5MD"_#USER-H5MD, dump output via HDF5,"dump h5md"_dump_h5md.html, -, ext
+"USER-INTEL"_#USER-INTEL, optimized Intel CPU and KNL styles,"Section 5.3.2"_accelerate_intel.html, WWW bench, -
+"USER-LB"_#USER-LB, Lattice Boltzmann fluid,"fix lb/fluid"_fix_lb_fluid.html, USER/lb, -
+"USER-MANIFOLD"_#USER-MANIFOLD, motion on 2d surfaces,"fix manifoldforce"_fix_manifoldforce.html, USER/manifold, -
+"USER-MGPT"_#USER-MGPT, fast MGPT multi-ion potentials, "pair_style mgpt"_pair_mgpt.html, USER/mgpt, -
+"USER-MISC"_#USER-MISC, single-file contributions, USER-MISC/README, USER/misc, -
+"USER-MOLFILE"_#USER-MOLFILE, "VMD"_VMD molfile plug-ins,"dump molfile"_dump_molfile.html, -, ext
+"USER-NETCDF"_#USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, -, ext
+"USER-OMP"_#USER-OMP, OpenMP-enabled styles,"Section 5.3.4"_accelerate_omp.html, WWW bench, -
+"USER-PHONON"_#USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, -
+"USER-QMMM"_#USER-QMMM, QM/MM coupling,"fix qmmm"_fix_qmmm.html, USER/qmmm, ext
+"USER-QTB"_#USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, -
+"USER-QUIP"_#USER-QUIP, QUIP/libatoms interface,"pair_style quip"_pair_quip.html, USER/quip, ext
+"USER-REAXC"_#USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, -
+"USER-SMD"_#USER-SMD, smoothed Mach dynamics,"SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, ext
+"USER-SMTBQ"_#USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, -
+"USER-SPH"_#USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, -
+"USER-TALLY"_#USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, -
+"USER-VTK"_#USER-VTK, dump output via VTK, "compute custom/vtk"_dump_custom_vtk.html, -, ext
+:tb(ea=c,ca1=l)
 
 :line
+:line
+
+ASPHERE package :link(ASPHERE),h4
 
-ASPHERE package :link(ASPHERE),h5
+[Contents:]
 
-Contents: Several computes, time-integration fixes, and pair styles
-for aspherical particle models: ellipsoids, 2d lines, 3d triangles.
+Computes, time-integration fixes, and pair styles for aspherical
+particle models including ellipsoids, 2d lines, and 3d triangles.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-asphere
 make machine :pre
 
-Make.py -p asphere -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-asphere
 make machine :pre
 
-Make.py -p ^asphere -a machine :pre
+[Supporting info:]
 
-Supporting info: "Section 6.14"_Section_howto.html#howto_14,
-"pair_style gayberne"_pair_gayberne.html, "pair_style
-resquared"_pair_resquared.html,
-"doc/PDF/pair_gayberne_extra.pdf"_PDF/pair_gayberne_extra.pdf,
-"doc/PDF/pair_resquared_extra.pdf"_PDF/pair_resquared_extra.pdf,
-examples/ASPHERE, examples/ellipse
+src/ASPHERE: filenames -> commands
+"Section 6.14"_Section_howto.html#howto_14
+"pair_style gayberne"_pair_gayberne.html
+"pair_style resquared"_pair_resquared.html
+"doc/PDF/pair_gayberne_extra.pdf"_PDF/pair_gayberne_extra.pdf
+"doc/PDF/pair_resquared_extra.pdf"_PDF/pair_resquared_extra.pdf
+examples/ASPHERE
+examples/ellipse
+http://lammps.sandia.gov/movies.html#line
+http://lammps.sandia.gov/movies.html#tri :ul
 
 :line
 
-BODY package :link(BODY),h5
+BODY package :link(BODY),h4
+
+[Contents:]
 
-Contents: Support for body-style particles.  Computes,
+Body-style particles with internal structure.  Computes,
 time-integration fixes, pair styles, as well as the body styles
 themselves.  See the "body"_body.html doc page for an overview.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-body
 make machine :pre
 
-Make.py -p body -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-body
 make machine :pre
 
-Make.py -p ^body -a machine :pre
+[Supporting info:]
 
-Supporting info: "atom_style body"_atom_style.html, "body"_body.html,
-"pair_style body"_pair_body.html, examples/body
+src/BODY: filenames -> commands
+"body"_body.html
+"atom_style body"_atom_style.html
+"fix nve/body"_fix_nve_body.html
+"pair_style body"_pair_body.html
+examples/body :ul
 
 :line
 
-CLASS2 package :link(CLASS2),h5
+CLASS2 package :link(CLASS2),h4
 
-Contents: Bond, angle, dihedral, improper, and pair styles for the
-COMPASS CLASS2 molecular force field.
+[Contents:]
 
-To install via make or Make.py:
+Bond, angle, dihedral, improper, and pair styles for the COMPASS
+CLASS2 molecular force field.
+
+[Install or un-install:]
 
 make yes-class2
 make machine :pre
 
-Make.py -p class2 -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-class2
 make machine :pre
 
-Make.py -p ^class2 -a machine :pre
+[Supporting info:]
 
-Supporting info: "bond_style class2"_bond_class2.html, "angle_style
-class2"_angle_class2.html, "dihedral_style
-class2"_dihedral_class2.html, "improper_style
-class2"_improper_class2.html, "pair_style lj/class2"_pair_class2.html
+src/CLASS2: filenames -> commands
+"bond_style class2"_bond_class2.html
+"angle_style class2"_angle_class2.html
+"dihedral_style class2"_dihedral_class2.html
+"improper_style class2"_improper_class2.html
+"pair_style lj/class2"_pair_class2.html :ul
 
 :line
 
-COLLOID package :link(COLLOID),h5
+COLLOID package :link(COLLOID),h4
 
-Contents: Support for coarse-grained colloidal particles.  Wall fix
-and pair styles that implement colloidal interaction models for
-finite-size particles.  This includes the Fast Lubrication Dynamics
-method for hydrodynamic interactions, which is a simplified
-approximation to Stokesian dynamics.
+[Contents:]
 
-To install via make or Make.py:
+Coarse-grained finite-size colloidal particles.  Pair styles and fix
+wall styles for colloidal interactions.  Includes the Fast Lubrication
+Dynamics (FLD) method for hydrodynamic interactions, which is a
+simplified approximation to Stokesian dynamics.
 
-make yes-colloid
-make machine :pre
+[Authors:] This package includes Fast Lubrication Dynamics pair styles
+which were created by Amit Kumar and Michael Bybee from Jonathan
+Higdon's group at UIUC.
 
-Make.py -p colloid -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+make yes-colloid
+make machine :pre
 
 make no-colloid
 make machine :pre
 
-Make.py -p ^colloid -a machine :pre
+[Supporting info:]
 
-Supporting info: "fix wall/colloid"_fix_wall.html, "pair_style
-colloid"_pair_colloid.html, "pair_style
-yukawa/colloid"_pair_yukawa_colloid.html, "pair_style
-brownian"_pair_brownian.html, "pair_style
-lubricate"_pair_lubricate.html, "pair_style
-lubricateU"_pair_lubricateU.html, examples/colloid, examples/srd
+src/COLLOID: filenames -> commands
+"fix wall/colloid"_fix_wall.html
+"pair_style colloid"_pair_colloid.html
+"pair_style yukawa/colloid"_pair_yukawa_colloid.html
+"pair_style brownian"_pair_brownian.html
+"pair_style lubricate"_pair_lubricate.html
+"pair_style lubricateU"_pair_lubricateU.html
+examples/colloid
+examples/srd :ul
 
 :line
 
-COMPRESS package :link(COMPRESS),h5
+COMPRESS package :link(COMPRESS),h4
 
-Contents: Support for compressed output of dump files via the zlib
-compression library, using dump styles with a "gz" in their style
-name.
+[Contents:]
 
-Building with the COMPRESS package assumes you have the zlib
-compression library available on your system.  The build uses the
-lib/compress/Makefile.lammps file in the compile/link process.  You
-should only need to edit this file if the LAMMPS build cannot find the
-zlib info it specifies.
+Compressed output of dump files via the zlib compression library,
+using dump styles with a "gz" in their style name.
 
-To install via make or Make.py:
+To use this package you must have the zlib compression library
+available on your system.
 
-make yes-compress
-make machine :pre
+[Author:] Axel Kohlmeyer (Temple U).
 
-Make.py -p compress -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+Note that building with this package assumes you have the zlib
+compression library available on your system.  The LAMMPS build uses
+the settings in the lib/compress/Makefile.lammps file in the
+compile/link process.  You should only need to edit this file if the
+LAMMPS build fails on your system.
+
+make yes-compress
+make machine :pre
 
 make no-compress
 make machine :pre
 
-Make.py -p ^compress -a machine :pre
+[Supporting info:]
 
-Supporting info: src/COMPRESS/README, lib/compress/README, "dump
-atom/gz"_dump.html, "dump cfg/gz"_dump.html, "dump
-custom/gz"_dump.html, "dump xyz/gz"_dump.html
+src/COMPRESS: filenames -> commands
+src/COMPRESS/README
+lib/compress/README
+"dump atom/gz"_dump.html
+"dump cfg/gz"_dump.html
+"dump custom/gz"_dump.html
+"dump xyz/gz"_dump.html :ul
 
 :line
 
-CORESHELL package :link(CORESHELL),h5
+CORESHELL package :link(CORESHELL),h4
 
-Contents: Compute and pair styles that implement the adiabatic
-core/shell model for polarizability.  The compute temp/cs command
-measures the temperature of a system with core/shell particles.  The
-pair styles augment Born, Buckingham, and Lennard-Jones styles with
-core/shell capabilities.  See "Section 6.26"_Section_howto.html#howto_26
-for an overview of how to use the package.
+[Contents:]
 
-To install via make or Make.py:
+Compute and pair styles that implement the adiabatic core/shell model
+for polarizability.  The pair styles augment Born, Buckingham, and
+Lennard-Jones styles with core/shell capabilities.  The "compute
+temp/cs"_compute_temp_cs.html command calculates the temperature of a
+system with core/shell particles.  See "Section
+6.26"_Section_howto.html#howto_26 for an overview of how to use this
+package.
 
-make yes-coreshell
-make machine :pre
+[Author:] Hendrik Heenen (Technical U of Munich).
 
-Make.py -p coreshell -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+make yes-coreshell
+make machine :pre
 
 make no-coreshell
 make machine :pre
 
-Make.py -p ^coreshell -a machine :pre
+[Supporting info:]
 
-Supporting info: "Section 6.26"_Section_howto.html#howto_26,
-"compute temp/cs"_compute_temp_cs.html,
-"pair_style born/coul/long/cs"_pair_cs.html, "pair_style
-buck/coul/long/cs"_pair_cs.html, pair_style
-lj/cut/coul/long/cs"_pair_lj.html, examples/coreshell
+src/CORESHELL: filenames -> commands
+"Section 6.26"_Section_howto.html#howto_26
+"Section 6.25"_Section_howto.html#howto_25
+"compute temp/cs"_compute_temp_cs.html
+"pair_style born/coul/long/cs"_pair_cs.html
+"pair_style buck/coul/long/cs"_pair_cs.html
+"pair_style lj/cut/coul/long/cs"_pair_lj.html
+examples/coreshell :ul
 
 :line
 
-DIPOLE package :link(DIPOLE),h5
+DIPOLE package :link(DIPOLE),h4
 
-Contents: An atom style and several pair styles to support point
-dipole models with short-range or long-range interactions.
+[Contents:]
 
-To install via make or Make.py:
+An atom style and several pair styles for point dipole models with
+short-range or long-range interactions.
+
+[Install or un-install:]
 
 make yes-dipole
 make machine :pre
 
-Make.py -p dipole -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-dipole
 make machine :pre
 
-Make.py -p ^dipole -a machine :pre
+[Supporting info:]
 
-Supporting info: "atom_style dipole"_atom_style.html, "pair_style
-lj/cut/dipole/cut"_pair_dipole.html, "pair_style
-lj/cut/dipole/long"_pair_dipole.html, "pair_style
-lj/long/dipole/long"_pair_dipole.html, examples/dipole
+src/DIPOLE: filenames -> commands
+"atom_style dipole"_atom_style.html
+"pair_style lj/cut/dipole/cut"_pair_dipole.html
+"pair_style lj/cut/dipole/long"_pair_dipole.html
+"pair_style lj/long/dipole/long"_pair_dipole.html
+examples/dipole :ul
 
 :line
 
-GPU package :link(GPU),h5
+GPU package :link(GPU),h4
+
+[Contents:]
 
-Contents: Dozens of pair styles and a version of the PPPM long-range
-Coulombic solver for NVIDIA GPUs.  All of them have a "gpu" in their
-style name.  "Section 5.3.1"_accelerate_gpu.html gives
+Dozens of pair styles and a version of the PPPM long-range Coulombic
+solver optimized for NVIDIA GPUs.  All such styles have a "gpu" as a
+suffix in their style name.  "Section 5.3.1"_accelerate_gpu.html gives
 details of what hardware and Cuda software is required on your system,
-and how to build and use this package.  See the KOKKOS package, which
-also has GPU-enabled styles.
-
-Building LAMMPS with the GPU package requires first building the GPU
-library itself, which is a set of C and Cuda files in lib/gpu.
-Details of how to do this are in lib/gpu/README.  As illustrated
-below, perform a "make" using one of the Makefile.machine files in
-lib/gpu which should create a lib/reax/libgpu.a file.
-Makefile.linux.* and Makefile.xk7 are examples for different
-platforms.  There are 3 important settings in the Makefile.machine you
-use:
+and details on how to build and use this package.  Its styles can be
+invoked at run time via the "-sf gpu" or "-suffix gpu" "command-line
+switches"_Section_start.html#start_7.  See also the "KOKKOS"_#KOKKOS
+package, which has GPU-enabled styles.
+
+[Authors:] Mike Brown (Intel) while at Sandia and ORNL and Trung Nguyen
+(Northwestern U) while at ORNL.
+
+[Install or un-install:]
+
+Before building LAMMPS with this package, you must first build the GPU
+library in lib/gpu from a set of provided C and Cuda files.  You can
+do this manually if you prefer; follow the instructions in
+lib/gpu/README.  You can also do it in one step from the lammps/src
+dir, using commands like these, which simply invoke the
+lib/gpu/Install.py script with the specified args:
+
+make lib-gpu                                # print help message
+make lib-gpu args="-m"                      # build GPU library with default Makefile.linux
+make lib-gpu args="-i xk7 -p single -o xk7.single"      # create new Makefile.xk7.single, altered for single-precision
+make lib-gpu args="-i xk7 -p single -o xk7.single -m"   # ditto, also build GPU library :pre
+
+Note that this procedure starts with one of the existing
+Makefile.machine files in lib/gpu.  It allows you to alter 4 important
+settings in that Makefile, via the -h, -a, -p, -e switches,
+and save the new Makefile, if desired:
 
 CUDA_HOME = where NVIDIA Cuda software is installed on your system
-CUDA_ARCH = appropriate to your GPU hardware
-CUDA_PREC = precision (double, mixed, single) you desire :ul
-
-See example Makefile.machine files in lib/gpu for the syntax of these
-settings.  See lib/gpu/Makefile.linux.double for ARCH settings for
-various NVIDIA GPUs.  The "make" also creates a
-lib/gpu/Makefile.lammps file.  This file has settings that enable
-LAMMPS to link with Cuda libraries.  If the settings in
-Makefile.lammps for your machine are not correct, the LAMMPS link will
-fail.  Note that the Make.py script has a "-gpu" option to allow the
-GPU library (with several of its options) and LAMMPS to be built in
-one step, with Type "python src/Make.py -h -gpu" to see the details.
-
-To install via make or Make.py:
-
-cd ~/lammps/lib/gpu
-make -f Makefile.linux.mixed     # for example
-cd ~/lammps/src
-make yes-gpu
-make machine :pre
+CUDA_ARCH = what GPU hardware you have (see help message for details)
+CUDA_PRECISION = precision (double, mixed, single)
+EXTRAMAKE = which Makefile.lammps.* file to copy to Makefile.lammps :ul
+
+If the library build is successful, 2 files should be created:
+lib/gpu/libgpu.a and lib/gpu/Makefile.lammps.  The latter has settings
+that enable LAMMPS to link with Cuda libraries.  If the settings in
+Makefile.lammps for your machine are not correct, the LAMMPS build
+will fail.
 
-Make.py -p gpu -gpu mode=mixed arch=35 -a machine :pre
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-To un-install via make or Make.py:
+make yes-gpu
+make machine :pre
 
 make no-gpu
 make machine :pre
 
-Make.py -p ^gpu -a machine :pre
+NOTE: If you re-build the GPU library in lib/gpu, you should always
+un-install the GPU package, then re-install it and re-build LAMMPS.
+This is because the compilation of files in the GPU package uses the
+library settings from the lib/gpu/Makefile.machine used to build the
+GPU library.
 
-Supporting info: src/GPU/README, lib/gpu/README,
-"Section 5.3"_Section_accelerate.html#acc_3,
-"Section 5.3.1"_accelerate_gpu.html,
-Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5
-for any pair style listed with a (g),
-"kspace_style"_kspace_style.html, "package gpu"_package.html,
-examples/accelerate, bench/FERMI, bench/KEPLER
+[Supporting info:]
+
+src/GPU: filenames -> commands
+src/GPU/README
+lib/gpu/README
+"Section 5.3"_Section_accelerate.html#acc_3
+"Section 5.3.1"_accelerate_gpu.html
+"Section 2.6 -sf gpu"_Section_start.html#start_6
+"Section 2.6 -pk gpu"_Section_start.html#start_6
+"package gpu"_package.html
+Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5 for pair styles followed by (g)
+"Benchmarks page"_http://lammps.sandia.gov/bench.html of web site :ul
 
 :line
 
-GRANULAR package :link(GRANULAR),h5
+GRANULAR package :link(GRANULAR),h4
 
-Contents: Fixes and pair styles that support models of finite-size
-granular particles, which interact with each other and boundaries via
-frictional and dissipative potentials.
+[Contents:]
 
-To install via make or Make.py:
+Pair styles and fixes for finite-size granular particles, which
+interact with each other and boundaries via frictional and dissipative
+potentials.
+
+[Install or un-install:]
 
 make yes-granular
 make machine :pre
 
-Make.py -p granular -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-granular
 make machine :pre
 
-Make.py -p ^granular -a machine :pre
-
-Supporting info: "Section 6.6"_Section_howto.html#howto_6, "fix
-pour"_fix_pour.html, "fix wall/gran"_fix_wall_gran.html, "pair_style
-gran/hooke"_pair_gran.html, "pair_style
-gran/hertz/history"_pair_gran.html, examples/pour, bench/in.chute
+[Supporting info:]
+
+src/GRANULAR: filenames -> commands
+"Section 6.6"_Section_howto.html#howto_6
+"fix pour"_fix_pour.html
+"fix wall/gran"_fix_wall_gran.html
+"pair_style gran/hooke"_pair_gran.html
+"pair_style gran/hertz/history"_pair_gran.html
+examples/granregion
+examples/pour
+bench/in.chute
+http://lammps.sandia.gov/pictures.html#jamming
+http://lammps.sandia.gov/movies.html#hopper
+http://lammps.sandia.gov/movies.html#dem
+http://lammps.sandia.gov/movies.html#brazil
+http://lammps.sandia.gov/movies.html#granregion :ul
 
 :line
 
-KIM package :link(KIM),h5
+KIM package :link(KIM),h4
 
-Contents: A pair style that interfaces to the Knowledge Base for
-Interatomic Models (KIM) repository of interatomic potentials, so that
-KIM potentials can be used in a LAMMPS simulation.
+[Contents:]
 
-To build LAMMPS with the KIM package you must have previously
-installed the KIM API (library) on your system.  The lib/kim/README
-file explains how to download and install KIM.  Building with the KIM
-package also uses the lib/kim/Makefile.lammps file in the compile/link
-process.  You should not need to edit this file.
+A "pair_style kim"_pair_kim.html command which is a wrapper on the
+Knowledge Base for Interatomic Models (KIM) repository of interatomic
+potentials, enabling any of them to be used in LAMMPS simulations.
 
-To install via make or Make.py:
+To use this package you must have the KIM library available on your
+system.
 
-make yes-kim
-make machine :pre
+Information about the KIM project can be found at its website:
+https://openkim.org.  The KIM project is led by Ellad Tadmor and Ryan
+Elliott (U Minnesota) and James Sethna (Cornell U).
+
+[Authors:] Ryan Elliott (U Minnesota) is the main developer for the KIM
+API which the "pair_style kim"_pair_kim.html command uses.  He
+developed the pair style in collaboration with Valeriu Smirichinski (U
+Minnesota).
+
+[Install or un-install:]
 
-Make.py -p kim -a machine :pre
+Using this package requires the KIM library and its models
+(interatomic potentials) to be downloaded and installed on your
+system.  The library can be downloaded and built in lib/kim or
+elsewhere on your system.  Details of the download, build, and install
+process for KIM are given in the lib/kim/README file.
 
-To un-install via make or Make.py:
+Once that process is complete, you can then install/un-install the
+package and build LAMMPS in the usual manner:
+
+make yes-kim
+make machine :pre
 
 make no-kim
 make machine :pre
 
-Make.py -p ^kim -a machine :pre
+[Supporting info:]
 
-Supporting info: src/KIM/README, lib/kim/README, "pair_style
-kim"_pair_kim.html, examples/kim
+src/KIM: filenames -> commands
+src/KIM/README
+lib/kim/README
+"pair_style kim"_pair_kim.html
+examples/kim :ul
 
 :line
 
-KOKKOS package :link(KOKKOS),h5
+KOKKOS package :link(KOKKOS),h4
 
-Contents: Dozens of atom, pair, bond, angle, dihedral, improper styles
-which run with the Kokkos library to provide optimization for
-multicore CPUs (via OpenMP), NVIDIA GPUs, or the Intel Xeon Phi (in
-native mode).  All of them have a "kk" in their style name.  "Section
-5.3.3"_accelerate_kokkos.html gives details of what
-hardware and software is required on your system, and how to build and
-use this package.  See the GPU, OPT, USER-INTEL, USER-OMP packages,
-which also provide optimizations for the same range of hardware.
+[Contents:]
 
-Building with the KOKKOS package requires choosing which of 3 hardware
-options you are optimizing for: CPU acceleration via OpenMP, GPU
-acceleration, or Intel Xeon Phi.  (You can build multiple times to
-create LAMMPS executables for different hardware.)  It also requires a
-C++11 compatible compiler.  For GPUs, the NVIDIA "nvcc" compiler is
-used, and an appropriate KOKKOS_ARCH setting should be made in your
-Makefile.machine for your GPU hardware and NVIDIA software.
+Dozens of atom, pair, bond, angle, dihedral, improper, fix, compute
+styles adapted to compile using the Kokkos library which can convert
+them to OpenMP or Cuda code so that they run efficiently on multicore
+CPUs, KNLs, or GPUs.  All the styles have a "kk" as a suffix in their
+style name.  "Section 5.3.3"_accelerate_kokkos.html gives details of
+what hardware and software is required on your system, and how to
+build and use this package.  Its styles can be invoked at run time via
+the "-sf kk" or "-suffix kk" "command-line
+switches"_Section_start.html#start_6.  Also see the "GPU"_#GPU,
+"OPT"_#OPT, "USER-INTEL"_#USER-INTEL, and "USER-OMP"_#USER-OMP
+packages, which have styles optimized for CPUs, KNLs, and GPUs.
 
-The simplest way to do this is to use Makefile.kokkos_cuda or
-Makefile.kokkos_omp or Makefile.kokkos_phi in src/MAKE/OPTIONS, via
-"make kokkos_cuda" or "make kokkos_omp" or "make kokkos_phi".  (Check
-the KOKKOS_ARCH setting in Makefile.kokkos_cuda), Or, as illustrated
-below, you can use the Make.py script with its "-kokkos" option to
-choose which hardware to build for.  Type "python src/Make.py -h
--kokkos" to see the details.  If these methods do not work on your
-system, you will need to read the "Section 5.3.3"_accelerate_kokkos.html
-doc page for details of what Makefile.machine settings are needed.
+You must have a C++11 compatible compiler to use this package.
 
-To install via make or Make.py for each of 3 hardware options:
+[Authors:] The KOKKOS package was created primarily by Christian Trott
+and Stan Moore (Sandia), with contributions from other folks as well.
+It uses the open-source "Kokkos library"_https://github.com/kokkos
+which was developed by Carter Edwards, Christian Trott, and others at
+Sandia, and which is included in the LAMMPS distribution in
+lib/kokkos.
 
-make yes-kokkos
-make kokkos_omp    # for CPUs with OpenMP
-make kokkos_cuda   # for GPUs, check the KOKKOS_ARCH setting in Makefile.kokkos_cuda
-make kokkos_phi    # for Xeon Phis :pre
+[Install or un-install:]
+
+For the KOKKOS package, you have 3 choices when building.  You can
+build with either CPU or KNL or GPU support.  Each choice requires
+additional settings in your Makefile.machine for the KOKKOS_DEVICES
+and KOKKOS_ARCH settings.  See the src/MAKE/OPTIONS/Makefile.kokkos*
+files for examples.
+
+For multicore CPUs using OpenMP:
+
+KOKKOS_DEVICES = OpenMP
+KOKKOS_ARCH = HSW           # HSW = Haswell, SNB = SandyBridge, BDW = Broadwell, etc :pre
+
+For Intel KNLs using OpenMP:
+
+KOKKOS_DEVICES = OpenMP
+KOKKOS_ARCH = KNL :pre
+
+For NVIDIA GPUs using Cuda:
+
+KOKKOS_DEVICES = Cuda
+KOKKOS_ARCH = Pascal60,Power8     # P100 hosted by an IBM Power8, etc
+KOKKOS_ARCH = Kepler37,Power8     # K80 hosted by an IBM Power8, etc :pre
+
+For GPUs, you also need these 2 lines in your Makefile.machine before
+the CC line is defined, in this case for use with OpenMPI mpicxx.  The
+2 lines define a nvcc wrapper compiler, which will use nvcc for
+compiling Cuda files or use a C++ compiler for non-Kokkos, non-Cuda
+files.
 
-Make.py -p kokkos -kokkos omp -a machine           # for CPUs with OpenMP
-Make.py -p kokkos -kokkos cuda arch=35 -a machine  # for GPUs of style arch
-Make.py -p kokkos -kokkos phi -a machine           # for Xeon Phis
+KOKKOS_ABSOLUTE_PATH = $(shell cd $(KOKKOS_PATH); pwd)
+export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/config/nvcc_wrapper
+CC =		mpicxx :pre
 
-To un-install via make or Make.py:
+Once you have an appropriate Makefile.machine, you can
+install/un-install the package and build LAMMPS in the usual manner.
+Note that you cannot build one executable to run on multiple hardware
+targets (CPU or KNL or GPU).  You need to build LAMMPS once for each
+hardware target, to produce a separate executable.  Also note that we
+do not recommend building with other acceleration packages installed
+(GPU, OPT, USER-INTEL, USER-OMP) when also building with KOKKOS.
 
+make yes-kokkos
+make machine :pre
+ 
 make no-kokkos
 make machine :pre
 
-Make.py -p ^kokkos -a machine :pre
+[Supporting info:]
 
-Supporting info: src/KOKKOS/README, lib/kokkos/README,
-"Section 5.3"_Section_accelerate.html#acc_3,
-"Section 5.3.3"_accelerate_kokkos.html,
-Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5
-for any pair style listed with a (k), "package kokkos"_package.html,
-examples/accelerate, bench/FERMI, bench/KEPLER
+src/KOKKOS: filenames -> commands
+src/KOKKOS/README
+lib/kokkos/README
+"Section 5.3"_Section_accelerate.html#acc_3
+"Section 5.3.3"_accelerate_kokkos.html
+"Section 2.6 -k on ..."_Section_start.html#start_6
+"Section 2.6 -sf kk"_Section_start.html#start_6
+"Section 2.6 -pk kokkos"_Section_start.html#start_6
+"package kokkos"_package.html
+Styles sections of "Section 3.5"_Section_commands.html#cmd_5 for styles followed by (k)
+"Benchmarks page"_http://lammps.sandia.gov/bench.html of web site :ul
 
 :line
 
-KSPACE package :link(KSPACE),h5
+KSPACE package :link(KSPACE),h4
 
-Contents: A variety of long-range Coulombic solvers, and pair styles
-which compute the corresponding short-range portion of the pairwise
-Coulombic interactions.  These include Ewald, particle-particle
-particle-mesh (PPPM), and multilevel summation method (MSM) solvers.
+[Contents:]
 
-Building with the KSPACE package requires a 1d FFT library be present
-on your system for use by the PPPM solvers.  This can be the KISS FFT
-library provided with LAMMPS, or 3rd party libraries like FFTW or a
-vendor-supplied FFT library.  See step 6 of "Section
-2.2.2"_Section_start.html#start_2_2 of the manual for details of how
-to select different FFT options in your machine Makefile.  The Make.py
-tool has an "-fft" option which can insert these settings into your
-machine Makefile automatically.  Type "python src/Make.py -h -fft" to
-see the details.
+A variety of long-range Coulombic solvers, as well as pair styles
+which compute the corresponding short-range pairwise Coulombic
+interactions.  These include Ewald, particle-particle particle-mesh
+(PPPM), and multilevel summation method (MSM) solvers.
 
-To install via make or Make.py:
+[Install or un-install:]
+
+Building with this package requires a 1d FFT library be present on
+your system for use by the PPPM solvers.  This can be the KISS FFT
+library provided with LAMMPS, 3rd party libraries like FFTW, or a
+vendor-supplied FFT library.  See step 6 of "Section
+2.2.2"_Section_start.html#start_2_2 of the manual for details on how
+to select different FFT options in your machine Makefile.
 
 make yes-kspace
 make machine :pre
 
-Make.py -p kspace -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-kspace
 make machine :pre
 
-Make.py -p ^kspace -a machine :pre
+[Supporting info:]
 
-Supporting info: "kspace_style"_kspace_style.html,
-"doc/PDF/kspace.pdf"_PDF/kspace.pdf,
-"Section 6.7"_Section_howto.html#howto_7,
-"Section 6.8"_Section_howto.html#howto_8,
-"Section 6.9"_Section_howto.html#howto_9,
-"pair_style coul"_pair_coul.html, other pair style command doc pages
-which have "long" or "msm" in their style name,
-examples/peptide, bench/in.rhodo
+src/KSPACE: filenames -> commands
+"kspace_style"_kspace_style.html
+"doc/PDF/kspace.pdf"_PDF/kspace.pdf
+"Section 6.7"_Section_howto.html#howto_7
+"Section 6.8"_Section_howto.html#howto_8
+"Section 6.9"_Section_howto.html#howto_9
+"pair_style coul"_pair_coul.html
+Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5 with "long" or "msm" in pair style name
+examples/peptide
+bench/in.rhodo :ul
 
 :line
 
-MANYBODY package :link(MANYBODY),h5
+MANYBODY package :link(MANYBODY),h4
+
+[Contents:]
 
-Contents: A variety of many-body and bond-order potentials.  These
-include (AI)REBO, EAM, EIM, BOP, Stillinger-Weber, and Tersoff
-potentials.  Do a directory listing, "ls src/MANYBODY", to see
-the full list.
+A variety of manybody and bond-order potentials.  These include
+(AI)REBO, BOP, EAM, EIM, Stillinger-Weber, and Tersoff potentials.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-manybody
 make machine :pre
 
-Make.py -p manybody -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-manybody
 make machine :pre
 
-Make.py -p ^manybody -a machine :pre
-
-Supporting info:
+[Supporting info:]
 
-Examples: Pair Styles section of "Section
-3.5"_Section_commands.html#cmd_5, examples/comb, examples/eim,
-examples/nb3d, examples/vashishta
+src/MANYBODY: filenames -> commands
+Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5
+examples/comb
+examples/eim
+examples/nb3d
+examples/shear
+examples/streitz
+examples/vashishta
+bench/in.eam :ul
 
 :line
 
-MC package :link(MC),h5
+MC package :link(MC),h4
+
+[Contents:]
 
-Contents: Several fixes and a pair style that have Monte Carlo (MC) or
-MC-like attributes.  These include fixes for creating, breaking, and
-swapping bonds, and for performing atomic swaps and grand-canonical MC
-in conjuction with dynamics.
+Several fixes and a pair style that have Monte Carlo (MC) or MC-like
+attributes.  These include fixes for creating, breaking, and swapping
+bonds, for performing atomic swaps, and for performing grand-canonical
+MC (GCMC) in conjunction with dynamics.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-mc
 make machine :pre
 
-Make.py -p mc -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-mc
 make machine :pre
 
-Make.py -p ^mc -a machine :pre
+[Supporting info:]
 
-Supporting info: "fix atom/swap"_fix_atom_swap.html, "fix
-bond/break"_fix_bond_break.html, "fix
-bond/create"_fix_bond_create.html, "fix bond/swap"_fix_bond_swap.html,
-"fix gcmc"_fix_gcmc.html, "pair_style dsmc"_pair_dsmc.html
+src/MC: filenames -> commands
+"fix atom/swap"_fix_atom_swap.html
+"fix bond/break"_fix_bond_break.html
+"fix bond/create"_fix_bond_create.html
+"fix bond/swap"_fix_bond_swap.html
+"fix gcmc"_fix_gcmc.html
+"pair_style dsmc"_pair_dsmc.html
+http://lammps.sandia.gov/movies.html#gcmc :ul
 
 :line
 
-MEAM package :link(MEAM),h5
+MEAM package :link(MEAM),h4
 
-Contents: A pair style for the modified embedded atom (MEAM)
-potential.
+[Contents:]
 
-Building LAMMPS with the MEAM package requires first building the MEAM
-library itself, which is a set of Fortran 95 files in lib/meam.
-Details of how to do this are in lib/meam/README.  As illustrated
-below, perform a "make" using one of the Makefile.machine files in
-lib/meam which should create a lib/meam/libmeam.a file.
-Makefile.gfortran and Makefile.ifort are examples for the GNU Fortran
-and Intel Fortran compilers.  The "make" also copies a
-lib/meam/Makefile.lammps.machine file to lib/meam/Makefile.lammps.
-This file has settings that enable the C++ compiler used to build
-LAMMPS to link with a Fortran library (typically the 2 compilers to be
-consistent e.g. both Intel compilers, or both GNU compilers).  If the
-settings in Makefile.lammps for your compilers and machine are not
-correct, the LAMMPS link will fail.  Note that the Make.py script has
-a "-meam" option to allow the MEAM library and LAMMPS to be built in
-one step.  Type "python src/Make.py -h -meam" to see the details.
+A pair style for the modified embedded atom (MEAM) potential.
 
-NOTE: The MEAM potential can run dramatically faster if built with the
-Intel Fortran compiler, rather than the GNU Fortran compiler.
+[Author:] Greg Wagner (Northwestern U) while at Sandia.
 
-To install via make or Make.py:
+[Install or un-install:]
 
-cd ~/lammps/lib/meam
-make -f Makefile.gfortran    # for example
-cd ~/lammps/src
-make yes-meam
-make machine :pre
+Before building LAMMPS with this package, you must first build the
+MEAM library in lib/meam.  You can do this manually if you prefer;
+follow the instructions in lib/meam/README.  You can also do it in one
+step from the lammps/src dir, using a command like these, which simply
+invoke the lib/meam/Install.py script with the specified args:
+
+make lib-meam                      # print help message
+make lib-meam args="-m gfortran"   # build with GNU Fortran compiler
+make lib-meam args="-m ifort"      # build with Intel ifort compiler :pre
 
-Make.py -p meam -meam make=gfortran -a machine :pre
+The build should produce two files: lib/meam/libmeam.a and
+lib/meam/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to link C++ (LAMMPS) with
+Fortran (MEAM library).  Typically the two compilers used for LAMMPS
+and the MEAM library need to be consistent (e.g. both Intel or both
+GNU compilers).  If necessary, you can edit/create a new
+lib/meam/Makefile.machine file for your system, which should define an
+EXTRAMAKE variable to specify a corresponding Makefile.lammps.machine
+file.
 
-To un-install via make or Make.py:
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-meam
+make machine :pre
 
 make no-meam
 make machine :pre
 
-Make.py -p ^meam -a machine :pre
+NOTE: You should test building the MEAM library with both the Intel
+and GNU compilers to see if a simulation runs faster with one versus
+the other on your system.
+
+[Supporting info:]
 
-Supporting info: lib/meam/README, "pair_style meam"_pair_meam.html,
-examples/meam
+src/MEAM: filenames -> commands
+src/MEAM/README
+lib/meam/README
+"pair_style meam"_pair_meam.html
+examples/meam :ul
 
 :line
 
-MISC package :link(MISC),h5
+MISC package :link(MISC),h4
 
-Contents: A variety of computes, fixes, and pair styles that are not
-commonly used, but don't align with other packages.  Do a directory
+[Contents:]
+
+A variety of compute, fix, pair, dump styles with specialized
+capabilities that don't align with other packages.  Do a directory
 listing, "ls src/MISC", to see the list of commands.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-misc
 make machine :pre
 
-Make.py -p misc -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-misc
 make machine :pre
 
-Make.py -p ^misc -a machine :pre
+[Supporting info:]
 
-Supporting info: "compute ti"_compute_ti.html, "fix
-evaporate"_fix_evaporate.html, "fix tmm"_fix_ttm.html, "fix
-viscosity"_fix_viscosity.html, examples/misc
+src/MISC: filenames -> commands
+"compute ti"_compute_ti.html
+"fix evaporate"_fix_evaporate.html
+"fix orient/fcc"_fix_orient.html
+"fix ttm"_fix_ttm.html
+"fix thermal/conductivity"_fix_thermal_conductivity.html
+"fix viscosity"_fix_viscosity.html
+examples/KAPPA
+examples/VISCOSITY
+http://lammps.sandia.gov/pictures.html#ttm
+http://lammps.sandia.gov/movies.html#evaporation :ul
 
 :line
 
-MOLECULE package :link(MOLECULE),h5
+MOLECULE package :link(MOLECULE),h4
 
-Contents: A large number of atom, pair, bond, angle, dihedral,
-improper styles that are used to model molecular systems with fixed
-covalent bonds.  The pair styles include terms for the Dreiding
-(hydrogen-bonding) and CHARMM force fields, and TIP4P water model.
+[Contents:]
 
-To install via make or Make.py:
+A large number of atom, pair, bond, angle, dihedral, improper styles
+that are used to model molecular systems with fixed covalent bonds.
+The pair styles include the Dreiding (hydrogen-bonding) and CHARMM
+force fields, and a TIP4P water model.
+
+[Install or un-install:]
 
 make yes-molecule
 make machine :pre
 
-Make.py -p molecule -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-molecule
 make machine :pre
 
-Make.py -p ^molecule -a machine :pre
-
-Supporting info:"atom_style"_atom_style.html,
-"bond_style"_bond_style.html, "angle_style"_angle_style.html,
-"dihedral_style"_dihedral_style.html,
-"improper_style"_improper_style.html, "pair_style
-hbond/dreiding/lj"_pair_hbond_dreiding.html, "pair_style
-lj/charmm/coul/charmm"_pair_charmm.html,
-"Section 6.3"_Section_howto.html#howto_3,
-examples/micelle, examples/peptide, bench/in.chain, bench/in.rhodo
+[Supporting info:]
+
+src/MOLECULE: filenames -> commands
+"atom_style"_atom_style.html
+"bond_style"_bond_style.html
+"angle_style"_angle_style.html
+"dihedral_style"_dihedral_style.html
+"improper_style"_improper_style.html
+"pair_style hbond/dreiding/lj"_pair_hbond_dreiding.html
+"pair_style lj/charmm/coul/charmm"_pair_charmm.html
+"Section 6.3"_Section_howto.html#howto_3
+examples/cmap
+examples/dreiding
+examples/micelle
+examples/peptide
+bench/in.chain
+bench/in.rhodo :ul
 
 :line
 
-MPIIO package :link(MPIIO),h5
+MPIIO package :link(MPIIO),h4
+
+[Contents:]
 
-Contents: Support for parallel output/input of dump and restart files
-via the MPIIO library, which is part of the standard message-passing
-interface (MPI) library.  It adds "dump styles"_dump.html with a
-"mpiio" in their style name.  Restart files with an ".mpiio" suffix
-are also written and read in parallel.
+Support for parallel output/input of dump and restart files via the
+MPIIO library.  It adds "dump styles"_dump.html with a "mpiio" in
+their style name.  Restart files with an ".mpiio" suffix are also
+written and read in parallel.
 
-To install via make or Make.py:
+[Install or un-install:]
 
+Note that MPIIO is part of the standard message-passing interface
+(MPI) library, so you should not need any additional compiler or link
+settings, beyond what LAMMPS normally uses for MPI on your system.
+ 
 make yes-mpiio
 make machine :pre
+ 
+make no-mpiio
+make machine :pre
+ 
+[Supporting info:]
 
-Make.py -p mpiio -a machine :pre
+src/MPIIO: filenames -> commands
+"dump"_dump.html
+"restart"_restart.html
+"write_restart"_write_restart.html
+"read_restart"_read_restart.html :ul
 
-To un-install via make or Make.py:
+:line
+ 
+MSCG package :link(MSCG),h4
 
-make no-mpiio
+[Contents:]
+
+A "fix mscg"_fix_mscg.html command which can parameterize a
+Multi-Scale Coarse-Graining (MSCG) model using the open-source "MS-CG
+library"_mscg.
+
+:link(mscg,https://github.com/uchicago-voth/MSCG-release)
+
+To use this package you must have the MS-CG library available on your
+system.
+
+[Authors:] The fix was written by Lauren Abbott (Sandia).  The MS-CG
+library was developed by Jacob Wagner in Greg Voth's group at the
+University of Chicago.
+
+[Install or un-install:]
+
+Before building LAMMPS with this package, you must first download and
+build the MS-CG library.  Building the MS-CG library and using it from
+LAMMPS requires a C++11 compatible compiler, and that LAPACK and GSL
+(GNU Scientific Library) libraries be installed on your machine.  See
+the lib/mscg/README and MSCG/Install files for more details.
+
+Assuming these libraries are in place, you can do the download and
+build of MS-CG manually if you prefer; follow the instructions in
+lib/mscg/README.  You can also do it in one step from the lammps/src
+dir, using a command like these, which simply invoke the
+lib/mscg/Install.py script with the specified args:
+
+make lib-mscg                                # print help message
+make lib-mscg args="-g -b -l"                # download and build in default lib/mscg/MSCG-release-master
+make lib-mscg args="-h . MSCG -g -b -l"      # download and build in lib/mscg/MSCG
+make lib-mscg args="-h ~ MSCG -g -b -l"      # download and build in ~/mscg :pre
+
+Note that the final -l switch is to create 2 symbolic (soft) links,
+"includelink" and "liblink", in lib/mscg to point to the MS-CG src
+dir.  When LAMMPS builds it will use these links.  You should not need
+to edit the lib/mscg/Makefile.lammps file.
+
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-mscg
+make machine :pre
+
+make no-mscg
 make machine :pre
 
-Make.py -p ^mpiio -a machine :pre
+[Supporting info:]
 
-Supporting info: "dump"_dump.html, "restart"_restart.html,
-"write_restart"_write_restart.html, "read_restart"_read_restart.html
+src/MSCG: filenames -> commands
+src/MSCG/README
+lib/mscg/README
+examples/mscg :ul
 
 :line
+ 
+OPT package :link(OPT),h4
 
-OPT package :link(OPT),h5
+[Contents:]
 
-Contents: A handful of pair styles with an "opt" in their style name
-which are optimized for improved CPU performance on single or multiple
-cores.  These include EAM, LJ, CHARMM, and Morse potentials.  "Section
-5.3.5"_accelerate_opt.html gives details of how to build and
-use this package.  See the KOKKOS, USER-INTEL, and USER-OMP packages,
-which also have styles optimized for CPU performance.
+A handful of pair styles which are optimized for improved CPU
+performance on single or multiple cores.  These include EAM, LJ,
+CHARMM, and Morse potentials.  The styles have an "opt" suffix in
+their style name.  "Section 5.3.5"_accelerate_opt.html gives details
+of how to build and use this package.  Its styles can be invoked at
+run time via the "-sf opt" or "-suffix opt" "command-line
+switches"_Section_start.html#start_6.  See also the "KOKKOS"_#KOKKOS,
+"USER-INTEL"_#USER-INTEL, and "USER-OMP"_#USER-OMP packages, which
+have styles optimized for CPU performance.
 
-Some C++ compilers, like the Intel compiler, require the compile flag
-"-restrict" to build LAMMPS with the OPT package.  It should be added
-to the CCFLAGS line of your Makefile.machine.  Or use Makefile.opt in
-src/MAKE/OPTIONS, via "make opt".  For compilers that use the flag,
-the Make.py command adds it automatically to the Makefile.auto file it
-creates and uses.
+[Authors:] James Fischer (High Performance Technologies), David Richie,
+and Vincent Natoli (Stone Ridge Technology).
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-opt
 make machine :pre
 
-Make.py -p opt -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-opt
 make machine :pre
 
-Make.py -p ^opt -a machine :pre
+NOTE: The compile flag "-restrict" must be used to build LAMMPS with
+the OPT package.  It should be added to the CCFLAGS line of your
+Makefile.machine.  See Makefile.opt in src/MAKE/OPTIONS for an
+example.
+
+CCFLAGS: add -restrict :ul
 
-Supporting info: "Section 5.3"_Section_accelerate.html#acc_3,
-"Section 5.3.5"_accelerate_opt.html, Pair Styles section of
-"Section 3.5"_Section_commands.html#cmd_5 for any pair style
-listed with an (t), examples/accelerate, bench/KEPLER
+[Supporting info:]
+
+src/OPT: filenames -> commands
+"Section 5.3"_Section_accelerate.html#acc_3
+"Section 5.3.5"_accelerate_opt.html
+"Section 2.6 -sf opt"_Section_start.html#start_6
+Pair Styles section of "Section 3.5"_Section_commands.html#cmd_5 for pair styles followed by (t)
+"Benchmarks page"_http://lammps.sandia.gov/bench.html of web site :ul
 
 :line
 
-PERI package :link(PERI),h5
+PERI package :link(PERI),h4
 
-Contents: Support for the Peridynamics method, a particle-based
-meshless continuum model.  The package includes an atom style, several
-computes which calculate diagnostics, and several Peridynamic pair
-styles which implement different materials models.
+[Contents:]
 
-To install via make or Make.py:
+An atom style, several pair styles which implement different
+Peridynamics materials models, and several computes which calculate
+diagnostics.  Peridynamics is a particle-based meshless continuum
+model.
 
-make yes-peri
-make machine :pre
+[Authors:] The original package was created by Mike Parks (Sandia).
+Additional Peridynamics models were added by Rezwanur Rahman and John
+Foster (UTSA).
 
-Make.py -p peri -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+make yes-peri
+make machine :pre
 
 make no-peri
 make machine :pre
 
-Make.py -p ^peri -a machine :pre
+[Supporting info:]
 
-Supporting info:
-"doc/PDF/PDLammps_overview.pdf"_PDF/PDLammps_overview.pdf,
-"doc/PDF/PDLammps_EPS.pdf"_PDF/PDLammps_EPS.pdf,
-"doc/PDF/PDLammps_VES.pdf"_PDF/PDLammps_VES.pdf, "atom_style
-peri"_atom_style.html, "compute damage/atom"_compute_damage_atom.html,
-"pair_style peri/pmb"_pair_peri.html, examples/peri
+src/PERI: filenames -> commands
+"doc/PDF/PDLammps_overview.pdf"_PDF/PDLammps_overview.pdf
+"doc/PDF/PDLammps_EPS.pdf"_PDF/PDLammps_EPS.pdf
+"doc/PDF/PDLammps_VES.pdf"_PDF/PDLammps_VES.pdf
+"atom_style peri"_atom_style.html
+"pair_style peri/*"_pair_peri.html
+"compute damage/atom"_compute_damage_atom.html
+"compute plasticity/atom"_compute_plasticity_atom.html
+examples/peri
+http://lammps.sandia.gov/movies.html#peri :ul
 
 :line
 
-POEMS package :link(POEMS),h5
+POEMS package :link(POEMS),h4
 
-Contents: A fix that wraps the Parallelizable Open source Efficient
-Multibody Software (POEMS) librar, which is able to simulate the
-dynamics of articulated body systems.  These are systems with multiple
-rigid bodies (collections of atoms or particles) whose motion is
-coupled by connections at hinge points.
+[Contents:]
 
-Building LAMMPS with the POEMS package requires first building the
-POEMS library itself, which is a set of C++ files in lib/poems.
-Details of how to do this are in lib/poems/README.  As illustrated
-below, perform a "make" using one of the Makefile.machine files in
-lib/poems which should create a lib/meam/libpoems.a file.
-Makefile.g++ and Makefile.icc are examples for the GNU and Intel C++
-compilers.  The "make" also creates a lib/poems/Makefile.lammps file
-which you should not need to change.  Note the Make.py script has a
-"-poems" option to allow the POEMS library and LAMMPS to be built in
-one step.  Type "python src/Make.py -h -poems" to see the details.
+A fix that wraps the Parallelizable Open source Efficient Multibody
+Software (POEMS) library, which is able to simulate the dynamics of
+articulated body systems.  These are systems with multiple rigid
+bodies (collections of particles) whose motion is coupled by
+connections at hinge points.
 
-To install via make or Make.py:
+[Author:] Rudra Mukherjee (JPL) while at RPI.
 
-cd ~/lammps/lib/poems
-make -f Makefile.g++    # for example
-cd ~/lammps/src
-make yes-poems
-make machine :pre
+[Install or un-install:]
+
+Before building LAMMPS with this package, you must first build the
+POEMS library in lib/poems.  You can do this manually if you prefer;
+follow the instructions in lib/poems/README.  You can also do it in
+one step from the lammps/src dir, using a command like these, which
+simply invoke the lib/poems/Install.py script with the specified args:
+
+make lib-poems                      # print help message
+make lib-poems args="-m g++"        # build with GNU g++ compiler
+make lib-poems args="-m icc"        # build with Intel icc compiler :pre
 
-Make.py -p poems -poems make=g++ -a machine :pre
+The build should produce two files: lib/poems/libpoems.a and
+lib/poems/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the
+POEMS library (though typically the settings are just blank).  If
+necessary, you can edit/create a new lib/poems/Makefile.machine file
+for your system, which should define an EXTRAMAKE variable to specify
+a corresponding Makefile.lammps.machine file.
 
-To un-install via make or Make.py:
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-poems
+make machine :pre
 
 make no-poems
 make machine :pre
 
-Make.py -p ^meam -a machine :pre
+[Supporting info:]
 
-Supporting info: src/POEMS/README, lib/poems/README,
-"fix poems"_fix_poems.html, examples/rigid
+src/POEMS: filenames -> commands
+src/POEMS/README
+lib/poems/README
+"fix poems"_fix_poems.html
+examples/rigid :ul
 
 :line
 
-PYTHON package :link(PYTHON),h5
+PYTHON package :link(PYTHON),h4
 
-Contents: A "python"_python.html command which allow you to execute
-Python code from a LAMMPS input script.  The code can be in a separate
-file or embedded in the input script itself.  See "Section
-11.2"_Section_python.html#py_2 for an overview of using Python from
-LAMMPS and for other ways to use LAMMPS and Python together.
+[Contents:]
 
-Building with the PYTHON package assumes you have a Python shared
-library available on your system, which needs to be a Python 2
-version, 2.6 or later.  Python 3 is not yet supported.  The build uses
-the contents of the lib/python/Makefile.lammps file to find all the Python
-files required in the build/link process.  See the lib/python/README
-file if the settings in that file do not work on your system.  Note
-that the Make.py script has a "-python" option to allow an alternate
-lib/python/Makefile.lammps file to be specified and LAMMPS to be built
-in one step.  Type "python src/Make.py -h -python" to see the details.
+A "python"_python.html command which allows you to execute Python code
+from a LAMMPS input script.  The code can be in a separate file or
+embedded in the input script itself.  See "Section
+11.2"_Section_python.html#py_2 for an overview of using Python from
+LAMMPS in this manner and the entire section for other ways to use
+LAMMPS and Python together.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-python
 make machine :pre
 
-Make.py -p python -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-python
 make machine :pre
 
-Make.py -p ^python -a machine :pre
+NOTE: Building with the PYTHON package assumes you have a Python
+shared library available on your system, which needs to be a Python 2
+version, 2.6 or later.  Python 3 is not yet supported.  See the
+lib/python/README for more details.  Note that the build uses the
+lib/python/Makefile.lammps file in the compile/link process.  You
+should only need to create a new Makefile.lammps.* file (and copy it
+to Makefile.lammps) if the LAMMPS build fails.
 
-Supporting info: examples/python
+[Supporting info:]
+
+src/PYTHON: filenames -> commands
+"Section 11"_Section_python.html
+lib/python/README
+examples/python :ul
 
 :line
 
-QEQ package :link(QEQ),h5
+QEQ package :link(QEQ),h4
+
+[Contents:]
 
-Contents: Several fixes for performing charge equilibration (QEq) via
-severeal different algorithms.  These can be used with pair styles
-that use QEq as part of their formulation.
+Several fixes for performing charge equilibration (QEq) via different
+algorithms.  These can be used with pair styles that perform QEq as
+part of their formulation.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-qeq
 make machine :pre
 
-Make.py -p qeq -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-qeq
 make machine :pre
 
-Make.py -p ^qeq -a machine :pre
+[Supporting info:]
 
-Supporting info: "fix qeq/*"_fix_qeq.html, examples/qeq
+src/QEQ: filenames -> commands
+"fix qeq/*"_fix_qeq.html
+examples/qeq
+examples/streitz :ul
 
 :line
 
-REAX package :link(REAX),h5
+REAX package :link(REAX),h4
 
-Contents: A pair style for the ReaxFF potential, a universal reactive
-force field, as well as a "fix reax/bonds"_fix_reax_bonds.html command
-for monitoring molecules as bonds are created and destroyed.
+[Contents:]
 
-Building LAMMPS with the REAX package requires first building the REAX
-library itself, which is a set of Fortran 95 files in lib/reax.
-Details of how to do this are in lib/reax/README.  As illustrated
-below, perform a "make" using one of the Makefile.machine files in
-lib/reax which should create a lib/reax/libreax.a file.
-Makefile.gfortran and Makefile.ifort are examples for the GNU Fortran
-and Intel Fortran compilers.  The "make" also copies a
-lib/reax/Makefile.lammps.machine file to lib/reax/Makefile.lammps.
-This file has settings that enable the C++ compiler used to build
-LAMMPS to link with a Fortran library (typically the 2 compilers to be
-consistent e.g. both Intel compilers, or both GNU compilers).  If the
-settings in Makefile.lammps for your compilers and machine are not
-correct, the LAMMPS link will fail.  Note that the Make.py script has
-a "-reax" option to allow the REAX library and LAMMPS to be built in
-one step.  Type "python src/Make.py -h -reax" to see the details.
-
-To install via make or Make.py:
-
-cd ~/lammps/lib/reax
-make -f Makefile.gfortran    # for example
-cd ~/lammps/src
-make yes-reax
-make machine :pre
+A pair style which wraps a Fortran library which implements the ReaxFF
+potential, which is a universal reactive force field.  See the
+"USER-REAXC package"_#USER-REAXC for an alternate implementation in
+C/C++.  Also a "fix reax/bonds"_fix_reax_bonds.html command for
+monitoring molecules as bonds are created and destroyed.
+
+[Author:] Aidan Thompson (Sandia).
 
-Make.py -p reax -reax make=gfortran -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+Before building LAMMPS with this package, you must first build the
+REAX library in lib/reax.  You can do this manually if you prefer;
+follow the instructions in lib/reax/README.  You can also do it in one
+step from the lammps/src dir, using a command like these, which simply
+invoke the lib/reax/Install.py script with the specified args:
+
+make lib-reax                      # print help message
+make lib-reax args="-m gfortran"   # build with GNU Fortran compiler
+make lib-reax args="-m ifort"      # build with Intel ifort compiler :pre
+
+The build should produce two files: lib/reax/libreax.a and
+lib/reax/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to link C++ (LAMMPS) with
+Fortran (REAX library).  Typically the two compilers used for LAMMPS
+and the REAX library need to be consistent (e.g. both Intel or both
+GNU compilers).  If necessary, you can edit/create a new
+lib/reax/Makefile.machine file for your system, which should define an
+EXTRAMAKE variable to specify a corresponding Makefile.lammps.machine
+file.
+
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-reax
+make machine :pre
 
 make no-reax
 make machine :pre
 
-Make.py -p ^reax -a machine :pre
+[Supporting info:]
 
-Supporting info: lib/reax/README, "pair_style reax"_pair_reax.html,
-"fix reax/bonds"_fix_reax_bonds.html, examples/reax
+src/REAX: filenames -> commands
+lib/reax/README
+"pair_style reax"_pair_reax.html
+"fix reax/bonds"_fix_reax_bonds.html
+examples/reax :ul
 
 :line
 
-REPLICA package :link(REPLICA),h5
+REPLICA package :link(REPLICA),h4
 
-Contents: A collection of multi-replica methods that are used by
-invoking multiple instances (replicas) of LAMMPS
-simulations. Communication between individual replicas is performed in
-different ways by the different methods.  See "Section
+[Contents:]
+
+A collection of multi-replica methods which can be used when running
+multiple LAMMPS simulations (replicas).  See "Section
 6.5"_Section_howto.html#howto_5 for an overview of how to run
-multi-replica simulations in LAMMPS.  Multi-replica methods included
-in the package are nudged elastic band (NEB), parallel replica
-dynamics (PRD), temperature accelerated dynamics (TAD), parallel
-tempering, and a verlet/split algorithm for performing long-range
-Coulombics on one set of processors, and the remainder of the force
-field calculation on another set.
+multi-replica simulations in LAMMPS.  Methods in the package include
+nudged elastic band (NEB), parallel replica dynamics (PRD),
+temperature accelerated dynamics (TAD), parallel tempering, and a
+verlet/split algorithm for performing long-range Coulombics on one set
+of processors, and the remainder of the force field calculation on
+another set.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-replica
 make machine :pre
 
-Make.py -p replica -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-replica
 make machine :pre
 
-Make.py -p ^replica -a machine :pre
+[Supporting info:]
 
-Supporting info: "Section 6.5"_Section_howto.html#howto_5,
-"neb"_neb.html, "prd"_prd.html, "tad"_tad.html, "temper"_temper.html,
-"run_style verlet/split"_run_style.html, examples/neb, examples/prd,
-examples/tad
+src/REPLICA: filenames -> commands
+"Section 6.5"_Section_howto.html#howto_5
+"neb"_neb.html
+"prd"_prd.html
+"tad"_tad.html
+"temper"_temper.html
+"run_style verlet/split"_run_style.html
+examples/neb
+examples/prd
+examples/tad :ul
 
 :line
 
-RIGID package :link(RIGID),h5
+RIGID package :link(RIGID),h4
+
+[Contents:]
 
-Contents: A collection of computes and fixes which enforce rigid
-constraints on collections of atoms or particles.  This includes SHAKE
-and RATTLE, as well as variants of rigid-body time integrators for a
-few large bodies or many small bodies.
+Fixes which enforce rigid constraints on collections of atoms or
+particles.  This includes SHAKE and RATTLE, as well as various
+rigid-body integrators for a few large bodies or many small bodies.
+Also several computes which calculate properties of rigid bodies.
 
-To install via make or Make.py:
+To install/build:
 
 make yes-rigid
 make machine :pre
 
-Make.py -p rigid -a machine :pre
-
-To un-install via make or Make.py:
+To un-install/re-build:
 
 make no-rigid
 make machine :pre
 
-Make.py -p ^rigid -a machine :pre
+[Supporting info:]
 
-Supporting info: "compute erotate/rigid"_compute_erotate_rigid.html,
-"fix shake"_fix_shake.html, "fix rattle"_fix_shake.html, "fix
-rigid/*"_fix_rigid.html, examples/ASPHERE, examples/rigid
+src/RIGID: filenames -> commands
+"compute erotate/rigid"_compute_erotate_rigid.html
+"fix shake"_fix_shake.html
+"fix rattle"_fix_shake.html
+"fix rigid/*"_fix_rigid.html
+examples/ASPHERE
+examples/rigid
+bench/in.rhodo
+http://lammps.sandia.gov/movies.html#box
+http://lammps.sandia.gov/movies.html#star :ul
 
 :line
 
-SHOCK package :link(SHOCK),h5
+SHOCK package :link(SHOCK),h4
+
+[Contents:]
 
-Contents: A small number of fixes useful for running impact
-simulations where a shock-wave passes through a material.
+Fixes for running impact simulations where a shock-wave passes through
+a material.
 
-To install via make or Make.py:
+[Install or un-install:]
 
 make yes-shock
 make machine :pre
 
-Make.py -p shock -a machine :pre
-
-To un-install via make or Make.py:
-
 make no-shock
 make machine :pre
 
-Make.py -p ^shock -a machine :pre
+[Supporting info:]
 
-Supporting info: "fix append/atoms"_fix_append_atoms.html, "fix
-msst"_fix_msst.html, "fix nphug"_fix_nphug.html, "fix
-wall/piston"_fix_wall_piston.html, examples/hugoniostat, examples/msst
+src/SHOCK: filenames -> commands
+"fix append/atoms"_fix_append_atoms.html
+"fix msst"_fix_msst.html
+"fix nphug"_fix_nphug.html
+"fix wall/piston"_fix_wall_piston.html
+examples/hugoniostat
+examples/msst :ul
 
 :line
 
-SNAP package :link(SNAP),h5
+SNAP package :link(SNAP),h4
 
-Contents: A pair style for the spectral neighbor analysis potential
-(SNAP), which is an empirical potential which can be quantum accurate
-when fit to an archive of DFT data.  Computes useful for analyzing
-properties of the potential are also included.
+[Contents:]
 
-To install via make or Make.py:
+A pair style for the spectral neighbor analysis potential (SNAP).
+SNAP is a methodology for deriving a highly accurate classical potential
+fit to a large archive of quantum mechanical (DFT) data. Also several
+computes which analyze attributes of the potential.
 
-make yes-snap
-make machine :pre
+[Author:] Aidan Thompson (Sandia).
 
-Make.py -p snap -a machine :pre
+[Install or un-install:]
 
-To un-install via make or Make.py:
+make yes-snap
+make machine :pre
 
 make no-snap
 make machine :pre
 
-Make.py -p ^snap -a machine :pre
+[Supporting info:]
 
-Supporting info: "pair snap"_pair_snap.html, "compute
-sna/atom"_compute_sna_atom.html, "compute snad/atom"_compute_sna_atom.html,
-"compute snav/atom"_compute_sna_atom.html, examples/snap
+src/SNAP: filenames -> commands
+"pair snap"_pair_snap.html
+"compute sna/atom"_compute_sna_atom.html
+"compute snad/atom"_compute_sna_atom.html
+"compute snav/atom"_compute_sna_atom.html
+examples/snap :ul
 
 :line
 
-SRD package :link(SRD),h5
+SRD package :link(SRD),h4
 
-Contents: Two fixes which implement the Stochastic Rotation Dynamics
-(SRD) method for coarse-graining of a solvent, typically around large
-colloidal-scale particles.
+[Contents:]
 
-To install via make or Make.py:
+A pair of fixes which implement the Stochastic Rotation Dynamics (SRD)
+method for coarse-graining of a solvent, typically around large
+colloidal particles.
+
+To install/build:
 
 make yes-srd
 make machine :pre
 
-Make.py -p srd -a machine :pre
-
-To un-install via make or Make.py:
+To un-install/re-build:
 
 make no-srd
 make machine :pre
 
-Make.py -p ^srd -a machine :pre
+[Supporting info:]
 
-Supporting info: "fix srd"_fix_srd.html, "fix
-wall/srd"_fix_wall_srd.html, examples/srd, examples/ASPHERE
+src/SRD: filenames -> commands
+"fix srd"_fix_srd.html
+"fix wall/srd"_fix_wall_srd.html
+examples/srd
+examples/ASPHERE
+http://lammps.sandia.gov/movies.html#tri
+http://lammps.sandia.gov/movies.html#line
+http://lammps.sandia.gov/movies.html#poly :ul
 
 :line
 
-VORONOI package :link(VORONOI),h5
+VORONOI package :link(VORONOI),h4
 
-Contents: A "compute voronoi/atom"_compute_voronoi_atom.html command
-which computes the Voronoi tesselation of a collection of atoms or
-particles by wrapping the Voro++ lib
+[Contents:]
 
-To build LAMMPS with the KIM package you must have previously
-installed the KIM API (library) on your system.  The lib/kim/README
-file explains how to download and install KIM.  Building with the KIM
-package also uses the lib/kim/Makefile.lammps file in the compile/link
-process.  You should not need to edit this file.
+A compute command which calculates the Voronoi tessellation of a
+collection of atoms by wrapping the "Voro++ library"_voronoi.  This
+can be used to calculate the local volume of each atom or its near
+neighbors.
 
+:link(voronoi,http://math.lbl.gov/voro++)
 
-To build LAMMPS with the VORONOI package you must have previously
-installed the Voro++ library on your system.  The lib/voronoi/README
-file explains how to download and install Voro++.  There is a
-lib/voronoi/install.py script which automates the process.  Type
-"python install.py" to see instructions.  The final step is to create
-soft links in the lib/voronoi directory for "includelink" and
-"liblink" which point to installed Voro++ directories.  Building with
-the VORONOI package uses the contents of the
-lib/voronoi/Makefile.lammps file in the compile/link process.  You
-should not need to edit this file.  Note that the Make.py script has a
-"-voronoi" option to allow the Voro++ library to be downloaded and/or
-installed and LAMMPS to be built in one step.  Type "python
-src/Make.py -h -voronoi" to see the details.
+To use this package you must have the Voro++ library available on your
+system.
 
-To install via make or Make.py:
+[Author:] Daniel Schwen (INL) while at LANL.  The open-source Voro++
+library was written by Chris Rycroft (Harvard U) while at UC Berkeley
+and LBNL.
 
-cd ~/lammps/lib/voronoi
-python install.py -g -b -l    # download Voro++, build in lib/voronoi, create links
-cd ~/lammps/src
-make yes-voronoi
-make machine :pre
+[Install or un-install:]
+
+Before building LAMMPS with this package, you must first download and
+build the Voro++ library.  You can do this manually if you prefer;
+follow the instructions in lib/voronoi/README.  You can also do it in
+one step from the lammps/src dir, using a command like these, which
+simply invoke the lib/voronoi/Install.py script with the specified
+args:
+
+make lib-voronoi                                # print help message
+make lib-voronoi args="-g -b -l"                # download and build in default lib/voronoi/voro++-0.4.6
+make lib-voronoi args="-h . voro++ -g -b -l"    # download and build in lib/voronoi/voro++
+make lib-voronoi args="-h ~ voro++ -g -b -l"    # download and build in ~/voro++ :pre
+
+Note that the final -l switch is to create 2 symbolic (soft) links,
+"includelink" and "liblink", in lib/voronoi to point to the Voro++ src
+dir.  When LAMMPS builds it will use these links.  You should not need
+to edit the lib/voronoi/Makefile.lammps file.
 
-Make.py -p voronoi -voronoi install="-g -b -l" -a machine :pre
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-To un-install via make or Make.py:
+make yes-voronoi
+make machine :pre
 
 make no-voronoi
 make machine :pre
 
-Make.py -p ^voronoi -a machine :pre
-
-Supporting info: src/VORONOI/README, lib/voronoi/README, "compute
-voronoi/atom"_compute_voronoi_atom.html, examples/voronoi
+[Supporting info:]
+
+src/VORONOI: filenames -> commands
+src/VORONOI/README
+lib/voronoi/README
+"compute voronoi/atom"_compute_voronoi_atom.html
+examples/voronoi :ul
 
 :line
+:line
+
+USER-ATC package :link(USER-ATC),h4
 
-4.2 User packages :h4,link(pkg_2)
+[Contents:]
 
-The current list of user-contributed packages is as follows:
+ATC stands for atoms-to-continuum.  This package implements a "fix
+atc"_fix_atc.html command to either couple molecular dynamics with
+continuum finite element equations or perform on-the-fly conversion of
+atomic information to continuum fields.
 
-Package, Description, Author(s), Doc page, Example, Pic/movie, Library
-"USER-ATC"_#USER-ATC, atom-to-continuum coupling, Jones & Templeton & Zimmerman (1), "fix atc"_fix_atc.html, USER/atc, "atc"_atc, lib/atc
-"USER-AWPMD"_#USER-AWPMD, wave-packet MD, Ilya Valuev (JIHT), "pair_style awpmd/cut"_pair_awpmd.html, USER/awpmd, -, lib/awpmd
-"USER-CG-CMM"_#USER-CG-CMM, coarse-graining model, Axel Kohlmeyer (Temple U), "pair_style lj/sdk"_pair_sdk.html, USER/cg-cmm, "cg"_cg, -
-"USER-CGDNA"_#USER-CGDNA, coarse-grained DNA force fields, Oliver Henrich (U Strathclyde Glasgow), src/USER-CGDNA/README, USER/cgdna, -, -
-"USER-COLVARS"_#USER-COLVARS, collective variables, Fiorin & Henin & Kohlmeyer (2), "fix colvars"_fix_colvars.html, USER/colvars, "colvars"_colvars, lib/colvars
-"USER-DIFFRACTION"_#USER-DIFFRACTION, virutal x-ray and electron diffraction, Shawn Coleman (ARL),"compute xrd"_compute_xrd.html, USER/diffraction, -, -
-"USER-DPD"_#USER-DPD, reactive dissipative particle dynamics (DPD), Larentzos & Mattox & Brennan (5), src/USER-DPD/README, USER/dpd, -, -
-"USER-DRUDE"_#USER-DRUDE, Drude oscillators, Dequidt & Devemy & Padua (3), "tutorial"_tutorial_drude.html, USER/drude, -, -
-"USER-EFF"_#USER-EFF, electron force field, Andres Jaramillo-Botero (Caltech), "pair_style eff/cut"_pair_eff.html, USER/eff, "eff"_eff, -
-"USER-FEP"_#USER-FEP, free energy perturbation, Agilio Padua (U Blaise Pascal Clermont-Ferrand), "compute fep"_compute_fep.html, USER/fep, -, -
-"USER-H5MD"_#USER-H5MD, dump output via HDF5, Pierre de Buyl (KU Leuven), "dump h5md"_dump_h5md.html, -, -, lib/h5md
-"USER-INTEL"_#USER-INTEL, Vectorized CPU and Intel(R) coprocessor styles, W. Michael Brown (Intel), "Section 5.3.2"_accelerate_intel.html, examples/intel, -, -
-"USER-LB"_#USER-LB, Lattice Boltzmann fluid, Colin Denniston (U Western Ontario), "fix lb/fluid"_fix_lb_fluid.html, USER/lb, -, -
-"USER-MGPT"_#USER-MGPT, fast MGPT multi-ion potentials, Tomas Oppelstrup & John Moriarty (LLNL), "pair_style mgpt"_pair_mgpt.html, USER/mgpt, -, -
-"USER-MISC"_#USER-MISC, single-file contributions, USER-MISC/README, USER-MISC/README, -, -, -
-"USER-MANIFOLD"_#USER-MANIFOLD, motion on 2d surface, Stefan Paquay (Eindhoven U of Technology), "fix manifoldforce"_fix_manifoldforce.html, USER/manifold, "manifold"_manifold, -
-"USER-MOLFILE"_#USER-MOLFILE, "VMD"_VMD molfile plug-ins, Axel Kohlmeyer (Temple U), "dump molfile"_dump_molfile.html, -, -, VMD-MOLFILE
-"USER-NC-DUMP"_#USER-NC-DUMP, dump output via NetCDF, Lars Pastewka (Karlsruhe Institute of Technology, KIT), "dump nc / dump nc/mpiio"_dump_nc.html, -, -, lib/netcdf
-"USER-OMP"_#USER-OMP, OpenMP threaded styles, Axel Kohlmeyer (Temple U), "Section 5.3.4"_accelerate_omp.html, -, -, -
-"USER-PHONON"_#USER-PHONON, phonon dynamical matrix, Ling-Ti Kong (Shanghai Jiao Tong U), "fix phonon"_fix_phonon.html, USER/phonon, -, -
-"USER-QMMM"_#USER-QMMM, QM/MM coupling, Axel Kohlmeyer (Temple U), "fix qmmm"_fix_qmmm.html, USER/qmmm, -, lib/qmmm
-"USER-QTB"_#USER-QTB, quantum nuclear effects, Yuan Shen (Stanford), "fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, -, -
-"USER-QUIP"_#USER-QUIP, QUIP/libatoms interface, Albert Bartok-Partay (U Cambridge), "pair_style quip"_pair_quip.html, USER/quip, -, lib/quip
-"USER-REAXC"_#USER-REAXC, C version of ReaxFF, Metin Aktulga (LBNL), "pair_style reaxc"_pair_reax_c.html, reax, -, -
-"USER-SMD"_#USER-SMD, smoothed Mach dynamics, Georg Ganzenmuller (EMI), "SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, -, -
-"USER-SMTBQ"_#USER-SMTBQ, Second Moment Tight Binding - QEq potential, Salles & Maras & Politano & Tetot (4), "pair_style smtbq"_pair_smtbq.html, USER/smtbq, -, -
-"USER-SPH"_#USER-SPH, smoothed particle hydrodynamics, Georg Ganzenmuller (EMI), "SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, "sph"_sph, -
-"USER-TALLY"_#USER-TALLY, Pairwise tallied computes, Axel Kohlmeyer (Temple U), "compute XXX/tally"_compute_tally.html, USER/tally, -, -
-"USER-VTK"_#USER-VTK, VTK-style dumps, Berger and Queteschiner (6), "compute custom/vtk"_dump_custom_vtk.html, -, -, lib/vtk
-:tb(ea=c)
+[Authors:] Reese Jones, Jeremy Templeton, Jon Zimmerman (Sandia).
 
-:link(atc,http://lammps.sandia.gov/pictures.html#atc)
-:link(cg,http://lammps.sandia.gov/pictures.html#cg)
-:link(eff,http://lammps.sandia.gov/movies.html#eff)
-:link(manifold,http://lammps.sandia.gov/movies.html#manifold)
-:link(sph,http://lammps.sandia.gov/movies.html#sph)
-:link(VMD,http://www.ks.uiuc.edu/Research/vmd)
+[Install or un-install:]
+  
+Before building LAMMPS with this package, you must first build the ATC
+library in lib/atc.  You can do this manually if you prefer; follow
+the instructions in lib/atc/README.  You can also do it in one step
+from the lammps/src dir, using a command like these, which simply
+invoke the lib/atc/Install.py script with the specified args:
 
-The "Authors" column lists a name(s) if a specific person is
-responsible for creating and maintaining the package.
+make lib-atc                      # print help message
+make lib-atc args="-m g++"        # build with GNU g++ compiler
+make lib-atc args="-m icc"        # build with Intel icc compiler :pre
 
-(1) The ATC package was created by Reese Jones, Jeremy Templeton, and
-Jon Zimmerman (Sandia).
+The build should produce two files: lib/atc/libatc.a and
+lib/atc/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the ATC
+library.  If necessary, you can edit/create a new
+lib/atc/Makefile.machine file for your system, which should define an
+EXTRAMAKE variable to specify a corresponding Makefile.lammps.machine
+file.
 
-(2) The COLVARS package was created by Axel Kohlmeyer (Temple U) using
-the colvars module library written by Giacomo Fiorin (Temple U) and
-Jerome Henin (LISM, Marseille, France).
+Note that the Makefile.lammps file has settings for the BLAS and
+LAPACK linear algebra libraries.  As explained in lib/atc/README these
+can either exist on your system, or you can use the files provided in
+lib/linalg.  In the latter case you also need to build the library
+in lib/linalg with a command like these:
 
-(3) The DRUDE package was created by Alain Dequidt (U Blaise Pascal
-Clermont-Ferrand) and co-authors Julien Devemy (CNRS) and Agilio Padua
-(U Blaise Pascal).
+make lib-linalg                      # print help message
+make lib-linalg args="-m gfortran"   # build with GNU Fortran compiler :pre
 
-(4) The SMTBQ package was created by Nicolas Salles, Emile Maras,
-Olivier Politano, and Robert Tetot (LAAS-CNRS, France).
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-(5) The USER-DPD package was created by James Larentzos (ARL), Timothy
-Mattox (Engility), and John Brennan (ARL).
+make yes-user-atc
+make machine :pre
+ 
+make no-user-atc
+make machine :pre
+ 
+[Supporting info:]
 
-(6) The USER-VTK package was created by Richard Berger (JKU) and
-Daniel Queteschiner (DCS Computing).
+src/USER-ATC: filenames -> commands
+src/USER-ATC/README
+"fix atc"_fix_atc.html
+examples/USER/atc
+http://lammps.sandia.gov/pictures.html#atc :ul
 
-The "Doc page" column links to either a sub-section of the
-"Section 6"_Section_howto.html of the manual, or an input script
-command implemented as part of the package, or to additional
-documentation provided within the package.
+:line
 
-The "Example" column is a sub-directory in the examples directory of
-the distribution which has an input script that uses the package.
-E.g. "peptide" refers to the examples/peptide directory.
+USER-AWPMD package :link(USER-AWPMD),h4
 
-The "Library" column lists an external library which must be built
-first and which LAMMPS links to when it is built.  If it is listed as
-lib/package, then the code for the library is under the lib directory
-of the LAMMPS distribution.  See the lib/package/README file for info
-on how to build the library.  If it is not listed as lib/package, then
-it is a third-party library not included in the LAMMPS distribution.
-See details on all of this below for individual packages.
+[Contents:]
 
-:line
+AWPMD stands for Antisymmetrized Wave Packet Molecular Dynamics.  This
+package implements an atom, pair, and fix style which allows electrons
+to be treated as explicit particles in a classical molecular dynamics
+model.
 
-USER-ATC package :link(USER-ATC),h5
+[Author:] Ilya Valuev (JIHT, Russia).
 
-Contents: ATC stands for atoms-to-continuum.  This package implements
-a "fix atc"_fix_atc.html command to either couple MD with continuum
-finite element equations or perform on-the-fly post-processing of
-atomic information to continuum fields.  See src/USER-ATC/README for
-more details.
-
-To build LAMMPS with this package ...
-
-To install via make or Make.py:
+[Install or un-install:]
+  
+Before building LAMMPS with this package, you must first build the
+AWPMD library in lib/awpmd.  You can do this manually if you prefer;
+follow the instructions in lib/awpmd/README.  You can also do it in
+one step from the lammps/src dir, using a command like these, which
+simply invoke the lib/awpmd/Install.py script with the specified args:
 
-make yes-user-atc
-make machine :pre
+make lib-awpmd                      # print help message
+make lib-awpmd args="-m g++"        # build with GNU g++ compiler
+make lib-awpmd args="-m icc"        # build with Intel icc compiler :pre
 
-Make.py -p atc -a machine :pre
+The build should produce two files: lib/awpmd/libawpmd.a and
+lib/awpmd/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the
+AWPMD library.  If necessary, you can edit/create a new
+lib/awpmd/Makefile.machine file for your system, which should define
+an EXTRAMAKE variable to specify a corresponding
+Makefile.lammps.machine file.
 
-To un-install via make or Make.py:
+Note that the Makefile.lammps file has settings for the BLAS and
+LAPACK linear algebra libraries.  As explained in lib/awpmd/README
+these can either exist on your system, or you can use the files
+provided in lib/linalg.  In the latter case you also need to build the
+library in lib/linalg with a command like these:
 
-make no-user-atc
-make machine :pre
+make lib-linalg                      # print help message
+make lib-linalg args="-m gfortran"   # build with GNU Fortran compiler :pre
 
-Make.py -p ^atc -a machine :pre
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-Supporting info:src/USER-ATC/README, "fix atc"_fix_atc.html,
-examples/USER/atc
+make yes-user-awpmd
+make machine :pre
+ 
+make no-user-awpmd
+make machine :pre
+ 
+[Supporting info:]
 
-Authors: Reese Jones (rjones at sandia.gov), Jeremy Templeton (jatempl
-at sandia.gov) and Jon Zimmerman (jzimmer at sandia.gov) at Sandia.
-Contact them directly if you have questions.
+src/USER-AWPMD: filenames -> commands
+src/USER-AWPMD/README
+"pair awpmd/cut"_pair_awpmd.html
+"fix nve/awpmd"_fix_nve_awpmd.html
+examples/USER/awpmd :ul
 
 :line
 
-USER-AWPMD package :link(USER-AWPMD),h5
+USER-CGDNA package :link(USER-CGDNA),h4
 
-Contents: AWPMD stands for Antisymmetrized Wave Packet Molecular
-Dynamics.  This package implements an atom, pair, and fix style which
-allows electrons to be treated as explicit particles in an MD
-calculation.  See src/USER-AWPMD/README for more details.
+[Contents:]
 
-To build LAMMPS with this package ...
+Several pair styles, a bond style, and integration fixes for
+coarse-grained models of single- and double-stranded DNA based on the
+oxDNA model of Doye, Louis and Ouldridge at the University of Oxford.
+This includes Langevin-type rigid-body integrators with improved
+stability.
 
-Supporting info: src/USER-AWPMD/README, "fix
-awpmd/cut"_pair_awpmd.html, examples/USER/awpmd
+[Author:] Oliver Henrich (University of Edinburgh).
 
-Author: Ilya Valuev at the JIHT in Russia (valuev at
-physik.hu-berlin.de).  Contact him directly if you have questions.
+[Install or un-install:]
+  
+make yes-user-cgdna
+make machine :pre
+ 
+make no-user-cgdna
+make machine :pre
+ 
+[Supporting info:]
+
+src/USER-CGDNA: filenames -> commands
+src/USER-CGDNA/README
+"pair_style oxdna/*"_pair_oxdna.html
+"pair_style oxdna2/*"_pair_oxdna2.html
+"bond_style oxdna/*"_bond_oxdna.html
+"bond_style oxdna2/*"_bond_oxdna.html
+"fix nve/dotc/langevin"_fix_nve_dotc_langevin.html :ul
 
 :line
 
-USER-CG-CMM package :link(USER-CG-CMM),h5
+USER-CGSDK package :link(USER-CGSDK),h4
 
-Contents: CG-CMM stands for coarse-grained ??.  This package
-implements several pair styles and an angle style using the coarse
-grained parametrization of Shinoda, DeVane, Klein, Mol Sim, 33, 27
-(2007) (SDK), with extensions to simulate ionic liquids, electrolytes,
-lipids and charged amino acids.  See src/USER-CG-CMM/README for more
-details.
+[Contents:]
 
-Supporting info: src/USER-CG-CMM/README, "pair lj/sdk"_pair_sdk.html,
-"pair lj/sdk/coul/long"_pair_sdk.html, "angle sdk"_angle_sdk.html,
-examples/USER/cg-cmm
+Several pair styles and an angle style which implement the
+coarse-grained SDK model of Shinoda, DeVane, and Klein which enables
+simulation of ionic liquids, electrolytes, lipids and charged amino
+acids.
 
-Author: Axel Kohlmeyer at Temple U (akohlmey at gmail.com).  Contact
-him directly if you have questions.
-
-:line
+[Author:] Axel Kohlmeyer (Temple U).
 
-USER-CGDNA package :link(USER-CGDNA),h5
-
-Contents: The CGDNA package implements coarse-grained force fields for
-single- and double-stranded DNA. These are at the moment mainly the
-oxDNA and oxDNA2 models, developed by Doye, Louis and Ouldridge at the University
-of Oxford.  The package also contains Langevin-type rigid-body
-integrators with improved stability.
+[Install or un-install:]
+  
+make yes-user-cgsdk
+make machine :pre
+ 
+make no-user-cgsdk
+make machine :pre
+ 
+[Supporting info:]
 
-See these doc pages to get started:
+src/USER-CGSDK: filenames -> commands
+src/USER-CGSDK/README
+"pair_style lj/sdk/*"_pair_sdk.html
+"angle_style sdk"_angle_sdk.html
+examples/USER/cgsdk
+http://lammps.sandia.gov/pictures.html#cg :ul
 
-"bond_style oxdna/fene"_bond_oxdna.html
-"bond_style oxdna2/fene"_bond_oxdna.html
-"pair_style oxdna/..."_pair_oxdna.html
-"pair_style oxdna2/..."_pair_oxdna2.html
-"fix nve/dotc/langevin"_fix_nve_dotc_langevin.html :ul
+:line
 
-Supporting info: /src/USER-CGDNA/README, "bond_style
-oxdna/fene"_bond_oxdna.html, "bond_style
-oxdna2/fene"_bond_oxdna.html, "pair_style
-oxdna/..."_pair_oxdna.html, "pair_style
-oxdna2/..."_pair_oxdna2.html, "fix
-nve/dotc/langevin"_fix_nve_dotc_langevin.html
+USER-COLVARS package :link(USER-COLVARS),h4
+
+[Contents:]
+
+COLVARS stands for collective variables, which can be used to
+implement various enhanced sampling methods, including Adaptive
+Biasing Force, Metadynamics, Steered MD, Umbrella Sampling and
+Restraints.  A "fix colvars"_fix_colvars.html command is implemented
+which wraps a COLVARS library, which implements these
+methods.
+
+[Authors:] Axel Kohlmeyer (Temple U).  The COLVARS library was written
+by Giacomo Fiorin (ICMS, Temple University, Philadelphia, PA, USA) and
+Jerome Henin (LISM, CNRS, Marseille, France).
+
+[Install or un-install:]
+  
+Before building LAMMPS with this package, you must first build the
+COLVARS library in lib/colvars.  You can do this manually if you
+prefer; follow the instructions in lib/colvars/README.  You can also
+do it in one step from the lammps/src dir, using a command like these,
+which simply invoke the lib/colvars/Install.py script with the
+specified args:
+
+make lib-colvars                      # print help message
+make lib-colvars args="-m g++"        # build with GNU g++ compiler :pre
+
+The build should produce two files: lib/colvars/libcolvars.a and
+lib/colvars/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the
+COLVARS library (though typically the settings are just blank).  If
+necessary, you can edit/create a new lib/colvars/Makefile.machine file
+for your system, which should define an EXTRAMAKE variable to specify
+a corresponding Makefile.lammps.machine file.
+
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-user-colvars
+make machine :pre
+ 
+make no-user-colvars
+make machine :pre
+ 
+[Supporting info:]
 
-Author: Oliver Henrich at the University of Strathclyde, Glasgow 
-(oliver.henrich at strath.ac.uk, also ohenrich at ph.ed.ac.uk). 
-Contact him directly if you have any questions.
+src/USER-COLVARS: filenames -> commands
+"doc/PDF/colvars-refman-lammps.pdf"_PDF/colvars-refman-lammps.pdf
+src/USER-COLVARS/README
+lib/colvars/README
+"fix colvars"_fix_colvars.html
+examples/USER/colvars :ul
 
 :line
 
-USER-COLVARS package :link(USER-COLVARS),h5
+USER-DIFFRACTION package :link(USER-DIFFRACTION),h4
 
-Contents: COLVARS stands for collective variables which can be used to
-implement Adaptive Biasing Force, Metadynamics, Steered MD, Umbrella
-Sampling and Restraints.  This package implements a "fix
-colvars"_fix_colvars.html command which wraps a COLVARS library which
-can perform those kinds of simulations.  See src/USER-COLVARS/README
-for more details.
+[Contents:]
 
-Supporting info:
-"doc/PDF/colvars-refman-lammps.pdf"_PDF/colvars-refman-lammps.pdf,
-src/USER-COLVARS/README, lib/colvars/README, "fix
-colvars"_fix_colvars.html, examples/USER/colvars
+Two computes and a fix for calculating x-ray and electron diffraction
+intensities based on kinematic diffraction theory.
 
-Authors: Axel Kohlmeyer at Temple U (akohlmey at gmail.com) wrote the
-fix.  The COLVARS library itself is written and maintained by Giacomo
-Fiorin (ICMS, Temple University, Philadelphia, PA, USA) and Jerome
-Henin (LISM, CNRS, Marseille, France).  Contact them directly if you
-have questions.
+[Author:] Shawn Coleman while at the U Arkansas.
 
-:line
+[Install or un-install:]
+  
+make yes-user-diffraction
+make machine :pre
+ 
+make no-user-diffraction
+make machine :pre
+ 
+[Supporting info:]
 
-USER-DIFFRACTION package :link(USER-DIFFRACTION),h5
+src/USER-DIFFRACTION: filenames -> commands
+"compute saed"_compute_saed.html
+"compute xrd"_compute_xrd.html
+"fix saed/vtk"_fix_saed_vtk.html
+examples/USER/diffraction :ul
 
-Contents: This packages implements two computes and a fix for
-calculating x-ray and electron diffraction intensities based on
-kinematic diffraction theory.  See src/USER-DIFFRACTION/README for
-more details.
+:line
 
-Supporting info: "compute saed"_compute_saed.html, "compute
-xrd"_compute_xrd.html, "fix saed/vtk"_fix_saed_vtk.html,
-examples/USER/diffraction
+USER-DPD package :link(USER-DPD),h4
 
-Author: Shawn P. Coleman (shawn.p.coleman8.ctr at mail.mil) while at
-the University of Arkansas.  Contact him directly if you have
-questions.
+[Contents:]
 
-:line
+DPD stands for dissipative particle dynamics.  This package implements
+coarse-grained DPD-based models for energetic, reactive molecular
+crystalline materials.  It includes many pair styles specific to these
+systems, including for reactive DPD, where each particle has internal
+state for multiple species and a coupled set of chemical reaction ODEs
+are integrated each timestep.  Highly accurate time integrators for
+isothermal, isoenergetic, isobaric and isenthalpic conditions are
+included.  These enable long timesteps via the Shardlow splitting
+algorithm.
 
-USER-DPD package :link(USER-DPD),h5
+[Authors:] Jim Larentzos (ARL), Tim Mattox (Engility Corp), and John
+Brennan (ARL).
 
-Contents: DPD stands for dissipative particle dynamics, This package
-implements DPD for isothermal, isoenergetic, isobaric and isenthalpic
-conditions.  It also has extensions for performing reactive DPD, where
-each particle has internal state for multiple species and a coupled
-set of chemical reaction ODEs are integrated each timestep.  The DPD
-equations of motion are integrated efficiently through the Shardlow
-splitting algorithm.  See src/USER-DPD/README for more details.
+[Install or un-install:]
+  
+make yes-user-dpd
+make machine :pre
+ 
+make no-user-dpd
+make machine :pre
+ 
+[Supporting info:]
 
-Supporting info: /src/USER-DPD/README, "compute dpd"_compute_dpd.html
+src/USER-DPD: filenames -> commands
+src/USER-DPD/README
+"compute dpd"_compute_dpd.html
 "compute dpd/atom"_compute_dpd_atom.html
-"fix eos/cv"_fix_eos_table.html "fix eos/table"_fix_eos_table.html
-"fix eos/table/rx"_fix_eos_table_rx.html "fix shardlow"_fix_shardlow.html
-"fix rx"_fix_rx.html "pair table/rx"_pair_table_rx.html
-"pair dpd/fdt"_pair_dpd_fdt.html "pair dpd/fdt/energy"_pair_dpd_fdt.html
-"pair exp6/rx"_pair_exp6_rx.html "pair multi/lucy"_pair_multi_lucy.html
-"pair multi/lucy/rx"_pair_multi_lucy_rx.html, examples/USER/dpd
-
-Authors: James Larentzos (ARL) (james.p.larentzos.civ at mail.mil),
-Timothy Mattox (Engility Corp) (Timothy.Mattox at engilitycorp.com)
-and John Brennan (ARL) (john.k.brennan.civ at mail.mil).  Contact them
-directly if you have questions.
+"fix eos/cv"_fix_eos_cv.html
+"fix eos/table"_fix_eos_table.html
+"fix eos/table/rx"_fix_eos_table_rx.html
+"fix shardlow"_fix_shardlow.html
+"fix rx"_fix_rx.html
+"pair table/rx"_pair_table_rx.html
+"pair dpd/fdt"_pair_dpd_fdt.html
+"pair dpd/fdt/energy"_pair_dpd_fdt.html
+"pair exp6/rx"_pair_exp6_rx.html
+"pair multi/lucy"_pair_multi_lucy.html
+"pair multi/lucy/rx"_pair_multi_lucy_rx.html
+examples/USER/dpd :ul
 
 :line
 
-USER-DRUDE package :link(USER-DRUDE),h5
+USER-DRUDE package :link(USER-DRUDE),h4
 
-Contents: This package contains methods for simulating polarizable
-systems using thermalized Drude oscillators.  It has computes, fixes,
-and pair styles for this purpose.  See "Section
+[Contents:]
+
+Fixes, pair styles, and a compute to simulate thermalized Drude
+oscillators as a model of polarization.  See "Section
 6.27"_Section_howto.html#howto_27 for an overview of how to use the
-package.  See src/USER-DRUDE/README for additional details.  There are
-auxiliary tools for using this package in tools/drude.
+package.  There are auxiliary tools for using this package in
+tools/drude.
 
-Supporting info: "Section 6.27"_Section_howto.html#howto_27,
-src/USER-DRUDE/README, "fix drude"_fix_drude.html, "fix
-drude/transform/*"_fix_drude_transform.html, "compute
-temp/drude"_compute_temp_drude.html, "pair thole"_pair_thole.html,
-"pair lj/cut/thole/long"_pair_thole.html, examples/USER/drude,
-tools/drude
+[Authors:] Alain Dequidt (U Blaise Pascal Clermont-Ferrand), Julien
+Devemy (CNRS), and Agilio Padua (U Blaise Pascal).
 
-Authors: Alain Dequidt at Universite Blaise Pascal Clermont-Ferrand
-(alain.dequidt at univ-bpclermont.fr); co-authors: Julien Devemy,
-Agilio Padua.  Contact them directly if you have questions.
+[Install or un-install:]
+  
+make yes-user-drude
+make machine :pre
+ 
+make no-user-drude
+make machine :pre
+ 
+[Supporting info:]
+
+src/USER-DRUDE: filenames -> commands
+"Section 6.27"_Section_howto.html#howto_27
+"Section 6.25"_Section_howto.html#howto_25
+src/USER-DRUDE/README
+"fix drude"_fix_drude.html
+"fix drude/transform/*"_fix_drude_transform.html
+"compute temp/drude"_compute_temp_drude.html
+"pair thole"_pair_thole.html
+"pair lj/cut/thole/long"_pair_thole.html
+examples/USER/drude
+tools/drude :ul
 
 :line
 
-USER-EFF package :link(USER-EFF),h5
+USER-EFF package :link(USER-EFF),h4
 
-Contents: EFF stands for electron force field.  This package contains
-atom, pair, fix and compute styles which implement the eFF as
+[Contents:]
+
+EFF stands for electron force field which allows a classical MD code
+to model electrons as particles of variable radius.  This package
+contains atom, pair, fix and compute styles which implement the eFF as
 described in A. Jaramillo-Botero, J. Su, Q. An, and W.A. Goddard III,
-JCC, 2010. The eFF potential was first introduced by Su and Goddard,
-in 2007.  See src/USER-EFF/README for more details.  There are
-auxiliary tools for using this package in tools/eff; see its README
-file.
+JCC, 2010.  The eFF potential was first introduced by Su and Goddard,
+in 2007.  There are auxiliary tools for using this package in
+tools/eff; see its README file.
 
-Supporting info:
+[Author:] Andres Jaramillo-Botero (CalTech).
 
-Author: Andres Jaramillo-Botero at CalTech (ajaramil at
-wag.caltech.edu).  Contact him directly if you have questions.
+[Install or un-install:]
+  
+make yes-user-eff
+make machine :pre
+ 
+make no-user-eff
+make machine :pre
+ 
+[Supporting info:]
+
+src/USER-EFF: filenames -> commands
+src/USER-EFF/README
+"atom_style electron"_atom_style.html
+"fix nve/eff"_fix_nve_eff.html
+"fix nvt/eff"_fix_nvt_eff.html
+"fix npt/eff"_fix_npt_eff.html
+"fix langevin/eff"_fix_langevin_eff.html
+"compute temp/eff"_compute_temp_eff.html
+"pair eff/cut"_pair_eff.html
+"pair eff/inline"_pair_eff.html
+examples/USER/eff
+tools/eff/README
+tools/eff
+http://lammps.sandia.gov/movies.html#eff :ul
 
 :line
 
-USER-FEP package :link(USER-FEP),h5
+USER-FEP package :link(USER-FEP),h4
+
+[Contents:]
 
-Contents: FEP stands for free energy perturbation.  This package
-provides methods for performing FEP simulations by using a "fix
+FEP stands for free energy perturbation.  This package provides
+methods for performing FEP simulations by using a "fix
 adapt/fep"_fix_adapt_fep.html command with soft-core pair potentials,
-which have a "soft" in their style name.  See src/USER-FEP/README for
-more details.  There are auxiliary tools for using this package in
-tools/fep; see its README file.
+which have a "soft" in their style name.  There are auxiliary tools
+for using this package in tools/fep; see its README file.
 
-Supporting info: src/USER-FEP/README, "fix
-adapt/fep"_fix_adapt_fep.html, "compute fep"_compute_fep.html,
-"pair_style */soft"_pair_lj_soft.html, examples/USER/fep
+[Author:] Agilio Padua (Universite Blaise Pascal Clermont-Ferrand).
 
-Author: Agilio Padua at Universite Blaise Pascal Clermont-Ferrand
-(agilio.padua at univ-bpclermont.fr). Contact him directly if you have
-questions.
+[Install or un-install:]
+  
+make yes-user-fep
+make machine :pre
+ 
+make no-user-fep
+make machine :pre
+ 
+[Supporting info:]
+
+src/USER-FEP: filenames -> commands
+src/USER-FEP/README
+"fix adapt/fep"_fix_adapt_fep.html
+"compute fep"_compute_fep.html
+"pair_style */soft"_pair_lj_soft.html
+examples/USER/fep
+tools/fep/README
+tools/fep :ul
 
 :line
 
-USER-H5MD package :link(USER-H5MD),h5
+USER-H5MD package :link(USER-H5MD),h4
 
-Contents: H5MD stands for HDF5 for MD.  "HDF5"_HDF5 is a binary,
-portable, self-describing file format, used by many scientific
-simulations.  H5MD is a format for molecular simulations, built on top
-of HDF5.  This package implements a "dump h5md"_dump_h5md.html command
-to output LAMMPS snapshots in this format.  See src/USER-H5MD/README
-for more details.
+[Contents:]
 
-:link(HDF5,http://www.hdfgroup.org/HDF5/)
+H5MD stands for HDF5 for MD.  "HDF5"_HDF5 is a portable, binary,
+self-describing file format, used by many scientific simulations.
+H5MD is a format for molecular simulations, built on top of HDF5.
+This package implements a "dump h5md"_dump_h5md.html command to output
+LAMMPS snapshots in this format.
 
-Supporting info: src/USER-H5MD/README, lib/h5md/README, "dump
-h5md"_dump_h5md.html
+:link(HDF5,http://www.hdfgroup.org/HDF5)
 
-Author: Pierre de Buyl at KU Leuven (see http://pdebuyl.be) created
-this package as well as the H5MD format and library.  Contact him
-directly if you have questions.
+To use this package you must have the HDF5 library available on your
+system.
 
-:line
-
-USER-INTEL package :link(USER-INTEL),h5
+[Author:] Pierre de Buyl (KU Leuven) created both the package and the
+H5MD format.
 
-Contents: Dozens of pair, bond, angle, dihedral, and improper styles
-that are optimized for Intel CPUs and the Intel Xeon Phi (in offload
-mode).  All of them have an "intel" in their style name.  "Section
-5.3.2"_accelerate_intel.html gives details of what hardware
-and compilers are required on your system, and how to build and use
-this package.  Also see src/USER-INTEL/README for more details. See
-the KOKKOS, OPT, and USER-OMP packages, which also have CPU and
-Phi-enabled styles.
+[Install or un-install:]
 
-Supporting info: examples/accelerate, src/USER-INTEL/TEST
+Note that to follow these steps to compile and link to the CH5MD
+library, you need the standard HDF5 software package installed on your
+system, which should include the h5cc compiler and the HDF5 library.
 
-"Section 5.3"_Section_accelerate.html#acc_3
+Before building LAMMPS with this package, you must first build the
+CH5MD library in lib/h5md.  You can do this manually if you prefer;
+follow the instructions in lib/h5md/README.  You can also do it in one
+step from the lammps/src dir, using a command like these, which simply
+invoke the lib/h5md/Install.py script with the specified args:
 
-Author: Mike Brown at Intel (michael.w.brown at intel.com).  Contact
-him directly if you have questions.
-
-For the USER-INTEL package, you have 2 choices when building.  You can
-build with CPU or Phi support.  The latter uses Xeon Phi chips in
-"offload" mode.  Each of these modes requires additional settings in
-your Makefile.machine for CCFLAGS and LINKFLAGS.
+make lib-h5md                     # print help message
+make lib-h5md args="-m h5cc"      # build with h5cc compiler :pre
 
-For CPU mode (if using an Intel compiler):
+The build should produce two files: lib/h5md/libch5md.a and
+lib/h5md/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the
+system HDF5 library.  If necessary, you can edit/create a new
+lib/h5md/Makefile.machine file for your system, which should define an
+EXTRAMAKE variable to specify a corresponding Makefile.lammps.machine
+file.
 
-CCFLAGS: add -fopenmp, -DLAMMPS_MEMALIGN=64, -restrict, -xHost, -fno-alias, -ansi-alias, -override-limits
-LINKFLAGS: add -fopenmp :ul
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+  
+make yes-user-h5md
+make machine :pre
+ 
+make no-user-h5md
+make machine :pre
+ 
+[Supporting info:]
 
-For Phi mode add the following in addition to the CPU mode flags:
+src/USER-H5MD: filenames -> commands
+src/USER-H5MD/README
+lib/h5md/README
+"dump h5md"_dump_h5md.html :ul
 
-CCFLAGS: add -DLMP_INTEL_OFFLOAD and
-LINKFLAGS: add -offload :ul
+:line
 
-And also add this to CCFLAGS:
+USER-INTEL package :link(USER-INTEL),h4
 
--offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\"" :pre
+[Contents:]
 
-Examples:
+Dozens of pair, fix, bond, angle, dihedral, improper, and kspace
+styles which are optimized for Intel CPUs and KNLs (Knights Landing).
+All of them have an "intel" in their style name.  "Section
+5.3.2"_accelerate_intel.html gives details of what hardware and
+compilers are required on your system, and how to build and use this
+package.  Its styles can be invoked at run time via the "-sf intel" or
+"-suffix intel" "command-line switches"_Section_start.html#start_6.
+Also see the "KOKKOS"_#KOKKOS, "OPT"_#OPT, and "USER-OMP"_#USER-OMP
+packages, which have styles optimized for CPUs and KNLs.
 
-:line
+You need to have an Intel compiler, version 14 or higher to take full
+advantage of this package.
 
-USER-LB package :link(USER-LB),h5
+[Author:] Mike Brown (Intel).
 
-Supporting info:
+[Install or un-install:]
 
-This package contains a LAMMPS implementation of a background
-Lattice-Boltzmann fluid, which can be used to model MD particles
-influenced by hydrodynamic forces.
+For the USER-INTEL package, you have 2 choices when building.  You can
+build with either CPU or KNL support.  Each choice requires additional
+settings in your Makefile.machine for CCFLAGS and LINKFLAGS and
+optimized malloc libraries.  See the
+src/MAKE/OPTIONS/Makefile.intel_cpu and src/MAKE/OPTIONS/Makefile.knl
+files for examples.
+
+For CPUs:
+
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc -ltbbmalloc_proxy :pre
+
+For KNLs:
+
+OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc :pre
+
+Once you have an appropriate Makefile.machine, you can
+install/un-install the package and build LAMMPS in the usual manner.
+Note that you cannot build one executable to run on multiple hardware
+targets (Intel CPUs or KNL).  You need to build LAMMPS once for each
+hardware target, to produce a separate executable.
+
+You should also typically install the USER-OMP package, as it can be
+used in tandem with the USER-INTEL package to good effect, as
+explained in "Section 5.3.2"_accelerate_intel.html.
+
+make yes-user-intel yes-user-omp
+make machine :pre
+ 
+make no-user-intel no-user-omp
+make machine :pre
 
-See this doc page and its related commands to get started:
+[Supporting info:]
 
-"fix lb/fluid"_fix_lb_fluid.html
+src/USER-INTEL: filenames -> commands
+src/USER-INTEL/README
+"Section 5.3"_Section_accelerate.html#acc_3
+"Section 5.3.2"_accelerate_intel.html
+"Section 2.6 -sf intel"_Section_start.html#start_6
+"Section 2.6 -pk intel"_Section_start.html#start_6
+"package intel"_package.html
+Styles sections of "Section 3.5"_Section_commands.html#cmd_5 for styles followed by (i)
+src/USER-INTEL/TEST
+"Benchmarks page"_http://lammps.sandia.gov/bench.html of web site :ul
 
-The people who created this package are Frances Mackay (fmackay at
-uwo.ca) and Colin (cdennist at uwo.ca) Denniston, University of
-Western Ontario.  Contact them directly if you have questions.
+:line
 
-Examples: examples/USER/lb
+USER-LB package :link(USER-LB),h4
 
-:line
+[Contents:]
 
-USER-MGPT package :link(USER-MGPT),h5
+Fixes which implement a background Lattice-Boltzmann (LB) fluid, which
+can be used to model MD particles influenced by hydrodynamic forces.
 
-Supporting info:
+[Authors:] Frances Mackay and Colin Denniston (University of Western
+Ontario).
 
-This package contains a fast implementation for LAMMPS of
-quantum-based MGPT multi-ion potentials.  The MGPT or model GPT method
-derives from first-principles DFT-based generalized pseudopotential
-theory (GPT) through a series of systematic approximations valid for
-mid-period transition metals with nearly half-filled d bands.  The
-MGPT method was originally developed by John Moriarty at Lawrence
-Livermore National Lab (LLNL).
+[Install or un-install:]
+  
+make yes-user-lb
+make machine :pre
+ 
+make no-user-lb
+make machine :pre
+ 
+[Supporting info:]
 
-In the general matrix representation of MGPT, which can also be
-applied to f-band actinide metals, the multi-ion potentials are
-evaluated on the fly during a simulation through d- or f-state matrix
-multiplication, and the forces that move the ions are determined
-analytically.  The {mgpt} pair style in this package calculates forces
-and energies using an optimized matrix-MGPT algorithm due to Tomas
-Oppelstrup at LLNL.
+src/USER-LB: filenames -> commands
+src/USER-LB/README
+"fix lb/fluid"_fix_lb_fluid.html
+"fix lb/momentum"_fix_lb_momentum.html
+"fix lb/viscous"_fix_lb_viscous.html
+examples/USER/lb :ul
 
-See this doc page to get started:
+:line
 
-"pair_style mgpt"_pair_mgpt.html
+USER-MGPT package :link(USER-MGPT),h4
 
-The persons who created the USER-MGPT package are Tomas Oppelstrup
-(oppelstrup2@llnl.gov) and John Moriarty (moriarty2@llnl.gov)
-Contact them directly if you have any questions.
+[Contents:]
 
-Examples: examples/USER/mgpt
+A pair style which provides a fast implementation of the quantum-based
+MGPT multi-ion potentials.  The MGPT or model GPT method derives from
+first-principles DFT-based generalized pseudopotential theory (GPT)
+through a series of systematic approximations valid for mid-period
+transition metals with nearly half-filled d bands.  The MGPT method
+was originally developed by John Moriarty at LLNL.  The pair style in
+this package calculates forces and energies using an optimized
+matrix-MGPT algorithm due to Tomas Oppelstrup at LLNL.
 
-:line
+[Authors:] Tomas Oppelstrup and John Moriarty (LLNL).
 
-USER-MISC package :link(USER-MISC),h5
+[Install or un-install:]
+  
+make yes-user-mgpt
+make machine :pre
+ 
+make no-user-mgpt
+make machine :pre
+ 
+[Supporting info:]
 
-Supporting info:
+src/USER-MGPT: filenames -> commands
+src/USER-MGPT/README
+"pair_style mgpt"_pair_mgpt.html
+examples/USER/mgpt :ul
 
-The files in this package are a potpourri of (mostly) unrelated
-features contributed to LAMMPS by users.  Each feature is a single
-pair of files (*.cpp and *.h).
+:line
 
-More information about each feature can be found by reading its doc
-page in the LAMMPS doc directory.  The doc page which lists all LAMMPS
-input script commands is as follows:
+USER-MISC package :link(USER-MISC),h4
 
-"Section 3.5"_Section_commands.html#cmd_5
+[Contents:]
 
-User-contributed features are listed at the bottom of the fix,
-compute, pair, etc sections.
+A potpourri of (mostly) unrelated features contributed to LAMMPS by
+users.  Each feature is a single fix, compute, pair, bond, angle,
+dihedral, improper, or command style.
 
-The list of features and author of each is given in the
+[Authors:] The author for each style in the package is listed in the
 src/USER-MISC/README file.
 
-You should contact the author directly if you have specific questions
-about the feature or its coding.
+[Install or un-install:]
+  
+make yes-user-misc
+make machine :pre
+ 
+make no-user-misc
+make machine :pre
+ 
+[Supporting info:]
 
-Examples: examples/USER/misc
+src/USER-MISC: filenames -> commands
+src/USER-MISC/README
+one doc page per individual command listed in src/USER-MISC/README
+examples/USER/misc :ul
 
 :line
 
-USER-MANIFOLD package :link(USER-MANIFOLD),h5
+USER-MANIFOLD package :link(USER-MANIFOLD),h4
 
-Supporting info:
+[Contents:]
 
-This package contains a dump molfile command which uses molfile
-plugins that are bundled with the
-"VMD"_http://www.ks.uiuc.edu/Research/vmd molecular visualization and
-analysis program, to enable LAMMPS to dump its information in formats
-compatible with various molecular simulation tools.
+Several fixes and a "manifold" class which enable simulations of
+particles constrained to a manifold (a 2D surface within the 3D
+simulation box).  This is done by applying the RATTLE constraint
+algorithm to formulate single-particle constraint functions
+g(xi,yi,zi) = 0 and their derivative (i.e. the normal of the manifold)
+n = grad(g).
 
-This package allows LAMMPS to perform MD simulations of particles
-constrained on a manifold (i.e., a 2D subspace of the 3D simulation
-box). It achieves this using the RATTLE constraint algorithm applied
-to single-particle constraint functions g(xi,yi,zi) = 0 and their
-derivative (i.e. the normal of the manifold) n = grad(g).
+[Author:] Stefan Paquay (Eindhoven University of Technology (TU/e), The
+Netherlands).
 
-See this doc page to get started:
+[Install or un-install:]
+  
+make yes-user-manifold
+make machine :pre
+ 
+make no-user-manifold
+make machine :pre
+ 
+[Supporting info:]
 
+src/USER-MANIFOLD: filenames -> commands
+src/USER-MANIFOLD/README
+"doc/manifolds"_manifolds.html
 "fix manifoldforce"_fix_manifoldforce.html
-
-The person who created this package is Stefan Paquay, at the Eindhoven
-University of Technology (TU/e), The Netherlands (s.paquay at tue.nl).
-Contact him directly if you have questions.
+"fix nve/manifold/rattle"_fix_nve_manifold_rattle.html
+"fix nvt/manifold/rattle"_fix_nvt_manifold_rattle.html
+examples/USER/manifold
+http://lammps.sandia.gov/movies.html#manifold :ul
 
 :line
 
-USER-MOLFILE package :link(USER-MOLFILE),h5
+USER-MOLFILE package :link(USER-MOLFILE),h4
+
+[Contents:]
+
+A "dump molfile"_dump_molfile.html command which uses molfile plugins
+that are bundled with the "VMD"_http://www.ks.uiuc.edu/Research/vmd
+molecular visualization and analysis program, to enable LAMMPS to dump
+snapshots in formats compatible with various molecular simulation
+tools.
+
+To use this package you must have the desired VMD plugins available on
+your system.
+
+Note that this package only provides the interface code, not the
+plugins themselves, which will be accessed when requesting a specific
+plugin via the "dump molfile"_dump_molfile.html command.  Plugins can
+be obtained from a VMD installation which has to match the platform
+that you are using to compile LAMMPS for. By adding plugins to VMD,
+support for new file formats can be added to LAMMPS (or VMD or other
+programs that use them) without having to recompile the application
+itself.  More information about the VMD molfile plugins can be found
+at
+"http://www.ks.uiuc.edu/Research/vmd/plugins/molfile"_http://www.ks.uiuc.edu/Research/vmd/plugins/molfile.
+
+[Author:] Axel Kohlmeyer (Temple U).
+
+[Install or un-install:]
+  
+Note that the lib/molfile/Makefile.lammps file has a setting for a
+dynamic loading library libdl.a that is typically present on
+all systems, which is required for LAMMPS to link with this package.
+If the setting is not valid for your system, you will need to edit the
+Makefile.lammps file.  See lib/molfile/README and
+lib/molfile/Makefile.lammps for details.
+
+make yes-user-molfile
+make machine :pre
+ 
+make no-user-molfile
+make machine :pre
+ 
+[Supporting info:]
+
+src/USER-MOLFILE: filenames -> commands
+src/USER-MOLFILE/README
+lib/molfile/README
+"dump molfile"_dump_molfile.html :ul
 
-Supporting info:
+:line
 
-This package contains a dump molfile command which uses molfile
-plugins that are bundled with the
-"VMD"_http://www.ks.uiuc.edu/Research/vmd molecular visualization and
-analysis program, to enable LAMMPS to dump its information in formats
-compatible with various molecular simulation tools.
+USER-NETCDF package :link(USER-NETCDF),h4
 
-The package only provides the interface code, not the plugins.  These
-can be obtained from a VMD installation which has to match the
-platform that you are using to compile LAMMPS for. By adding plugins
-to VMD, support for new file formats can be added to LAMMPS (or VMD or
-other programs that use them) without having to recompile the
-application itself.
+[Contents:]
 
-See this doc page to get started:
+Dump styles for writing NetCDF formatted dump files.  NetCDF is a
+portable, binary, self-describing file format developed on top of
+HDF5. The file contents follow the AMBER NetCDF trajectory conventions
+(http://ambermd.org/netcdf/nctraj.xhtml), but include extensions.
 
-"dump molfile"_dump_molfile.html
+To use this package you must have the NetCDF library available on your
+system.
 
-The person who created this package is Axel Kohlmeyer at Temple U
-(akohlmey at gmail.com).  Contact him directly if you have questions.
+Note that NetCDF files can be directly visualized with the following
+tools:
 
-:line
+"Ovito"_ovito (Ovito supports the AMBER convention and the extensions mentioned above)
+"VMD"_vmd
+"AtomEye"_atomeye (the libAtoms version of AtomEye contains a NetCDF reader not present in the standard distribution) :ul
+
+:link(ovito,http://www.ovito.org)
+:link(atomeye,http://www.libatoms.org)
 
-USER-NC-DUMP package :link(USER-NC-DUMP),h5
+[Author:] Lars Pastewka (Karlsruhe Institute of Technology).
 
-Contents: Dump styles for writing NetCDF format files.  NetCDF is a binary,
-portable, self-describing file format on top of HDF5. The file format
-contents follow the AMBER NetCDF trajectory conventions
-(http://ambermd.org/netcdf/nctraj.xhtml), but include extensions to this
-convention. This package implements a "dump nc"_dump_nc.html command
-and a "dump nc/mpiio"_dump_nc.html command to output LAMMPS snapshots
-in this format.  See src/USER-NC-DUMP/README for more details.
+[Install or un-install:]
+   
+Note that to follow these steps, you need the standard NetCDF software
+package installed on your system.  The lib/netcdf/Makefile.lammps file
+has settings for NetCDF include and library files that LAMMPS needs to
+compile and link with this package.  If the settings are not valid
+for your system, you will need to edit the Makefile.lammps file.  See
+lib/netcdf/README for details.
 
-NetCDF files can be directly visualized with the following tools:
+make yes-user-netcdf
+make machine :pre
+  
+make no-user-netcdf
+make machine :pre
 
-Ovito (http://www.ovito.org/). Ovito supports the AMBER convention
-and all of the above extensions. :ulb,l
-VMD (http://www.ks.uiuc.edu/Research/vmd/) :l
-AtomEye (http://www.libatoms.org/). The libAtoms version of AtomEye contains
-a NetCDF reader that is not present in the standard distribution of AtomEye :l,ule
+[Supporting info:]
 
-The person who created these files is Lars Pastewka at
-Karlsruhe Institute of Technology (lars.pastewka at kit.edu).
-Contact him directly if you have questions.
+src/USER-NETCDF: filenames -> commands
+src/USER-NETCDF/README
+lib/netcdf/README
+"dump netcdf"_dump_netcdf.html :ul
 
 :line
 
-USER-OMP package :link(USER-OMP),h5
+USER-OMP package :link(USER-OMP),h4
 
-Supporting info:
+[Contents:]
 
-This package provides OpenMP multi-threading support and
-other optimizations of various LAMMPS pair styles, dihedral
-styles, and fix styles.
+Hundreds of pair, fix, compute, bond, angle, dihedral, improper, and
+kspace styles which are altered to enable threading on many-core CPUs
+via OpenMP directives.  All of them have an "omp" in their style name.
+"Section 5.3.4"_accelerate_omp.html gives details of what hardware and
+compilers are required on your system, and how to build and use this
+package.  Its styles can be invoked at run time via the "-sf omp" or
+"-suffix omp" "command-line switches"_Section_start.html#start_6.
+Also see the "KOKKOS"_#KOKKOS, "OPT"_#OPT, and
+"USER-INTEL"_#USER-INTEL packages, which have styles optimized for
+CPUs.
 
-See this section of the manual to get started:
+[Author:] Axel Kohlmeyer (Temple U).
 
-"Section 5.3"_Section_accelerate.html#acc_3
+NOTE: The compile flags "-restrict" and "-fopenmp" must be used to
+build LAMMPS with the USER-OMP package, as well as the link flag
+"-fopenmp".  They should be added to the CCFLAGS and LINKFLAGS lines
+of your Makefile.machine.  See src/MAKE/OPTIONS/Makefile.omp for an
+example.
 
-The person who created this package is Axel Kohlmeyer at Temple U
-(akohlmey at gmail.com).  Contact him directly if you have questions.
+Once you have an appropriate Makefile.machine, you can
+install/un-install the package and build LAMMPS in the usual manner:
 
-For the USER-OMP package, your Makefile.machine needs additional
-settings for CCFLAGS and LINKFLAGS.
+[Install or un-install:]
+  
+make yes-user-omp
+make machine :pre
+ 
+make no-user-omp
+make machine :pre
 
 CCFLAGS: add -fopenmp and -restrict
 LINKFLAGS: add -fopenmp :ul
 
-Examples: examples/accelerate, bench/KEPLER
+[Supporting info:]
+
+src/USER-OMP: filenames -> commands
+src/USER-OMP/README
+"Section 5.3"_Section_accelerate.html#acc_3
+"Section 5.3.4"_accelerate_omp.html
+"Section 2.6 -sf omp"_Section_start.html#start_6
+"Section 2.6 -pk omp"_Section_start.html#start_6
+"package omp"_package.html
+Styles sections of "Section 3.5"_Section_commands.html#cmd_5 for styles followed by (o)
+"Benchmarks page"_http://lammps.sandia.gov/bench.html of web site :ul
 
 :line
 
-USER-PHONON package :link(USER-PHONON),h5
+USER-PHONON package :link(USER-PHONON),h4
+
+[Contents:]
 
-This package contains a fix phonon command that calculates dynamical
+A "fix phonon"_fix_phonon.html command that calculates dynamical
 matrices, which can then be used to compute phonon dispersion
 relations, directly from molecular dynamics simulations.
 
-See this doc page to get started:
-
-"fix phonon"_fix_phonon.html
+[Author:] Ling-Ti Kong (Shanghai Jiao Tong University).
 
-The person who created this package is Ling-Ti Kong (konglt at
-sjtu.edu.cn) at Shanghai Jiao Tong University.  Contact him directly
-if you have questions.
+[Install or un-install:]
+   
+make yes-user-phonon
+make machine :pre
+  
+make no-user-phonon
+make machine :pre
+  
+[Supporting info:]
 
-Examples: examples/USER/phonon
+src/USER-PHONON: filenames -> commands
+src/USER-PHONON/README
+"fix phonon"_fix_phonon.html
+examples/USER/phonon :ul
 
 :line
 
-USER-QMMM package :link(USER-QMMM),h5
+USER-QMMM package :link(USER-QMMM),h4
 
-Supporting info:
+[Contents:]
 
-This package provides a fix qmmm command which allows LAMMPS to be
-used in a QM/MM simulation, currently only in combination with pw.x
-code from the "Quantum ESPRESSO"_espresso package.
+A "fix qmmm"_fix_qmmm.html command which allows LAMMPS to be used in a
+QM/MM simulation, currently only in combination with the "Quantum
+ESPRESSO"_espresso package.  
 
 :link(espresso,http://www.quantum-espresso.org)
 
+To use this package you must have Quantum ESPRESSO available on your
+system.
+
 The current implementation only supports an ONIOM style mechanical
 coupling to the Quantum ESPRESSO plane wave DFT package.
 Electrostatic coupling is in preparation and the interface has been
 written in a manner that coupling to other QM codes should be possible
 without changes to LAMMPS itself.
 
-See this doc page to get started:
+[Author:] Axel Kohlmeyer (Temple U).
+
+[Install or un-install:]
 
-"fix qmmm"_fix_qmmm.html
+Before building LAMMPS with this package, you must first build the
+QMMM library in lib/qmmm.  You can do this manually if you prefer;
+follow the first two steps explained in lib/qmmm/README.  You can
+also do it in one step from the lammps/src dir, using a command like
+these, which simply invoke the lib/qmmm/Install.py script with the
+specified args:
 
-as well as the lib/qmmm/README file.
+make lib-qmmm                      # print help message
+make lib-qmmm args="-m gfortran"   # build with GNU Fortran compiler :pre
 
-The person who created this package is Axel Kohlmeyer at Temple U
-(akohlmey at gmail.com).  Contact him directly if you have questions.
+The build should produce two files: lib/qmmm/libqmmm.a and
+lib/qmmm/Makefile.lammps.  The latter is copied from an existing
+Makefile.lammps.* and has settings needed to build LAMMPS with the
+QMMM library (though typically the settings are just blank).  If
+necessary, you can edit/create a new lib/qmmm/Makefile.machine file
+for your system, which should define an EXTRAMAKE variable to specify
+a corresponding Makefile.lammps.machine file.
+
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-user-qmmm
+make machine :pre
+  
+make no-user-qmmm
+make machine :pre
+
+NOTE: The LAMMPS executable these steps produce is not yet functional
+for a QM/MM simulation.  You must also build Quantum ESPRESSO and
+create a new executable which links LAMMPS and Quantum ESPRESSO
+together.  These are steps 3 and 4 described in the lib/qmmm/README
+file.
+  
+[Supporting info:]
+
+src/USER-QMMM: filenames -> commands
+src/USER-QMMM/README
+lib/qmmm/README
+"fix qmmm"_fix_qmmm.html
+lib/qmmm/example-ec/README
+lib/qmmm/example-mc/README :ul
 
 :line
 
-USER-QTB package :link(USER-QTB),h5
+USER-QTB package :link(USER-QTB),h4
 
-Supporting info:
+[Contents:]
 
-This package provides a self-consistent quantum treatment of the
+Two fixes which provide a self-consistent quantum treatment of
 vibrational modes in a classical molecular dynamics simulation.  By
 coupling the MD simulation to a colored thermostat, it introduces zero
-point energy into the system, alter the energy power spectrum and the
-heat capacity towards their quantum nature. This package could be of
-interest if one wants to model systems at temperatures lower than
-their classical limits or when temperatures ramp up across the
-classical limits in the simulation.
+point energy into the system, altering the energy power spectrum and
+the heat capacity to account for their quantum nature. This is useful
+when modeling systems at temperatures lower than their classical
+limits or when temperatures ramp across the classical limits in a
+simulation.
 
-See these two doc pages to get started:
+[Author:] Yuan Shen (Stanford U).
+
+[Install or un-install:]
+   
+make yes-user-qtb
+make machine :pre
+  
+make no-user-qtb
+make machine :pre
+  
+[Supporting info:]
 
-"fix qtb"_fix_qtb.html provides quantum nulcear correction through a
-colored thermostat and can be used with other time integration schemes
-like "fix nve"_fix_nve.html or "fix nph"_fix_nh.html.
+src/USER-QTB: filenames -> commands
+src/USER-QTB/README
+"fix qtb"_fix_qtb.html
+"fix qbmsst"_fix_qbmsst.html
+examples/USER/qtb :ul
 
-"fix qbmsst"_fix_qbmsst.html enables quantum nuclear correction of a
-multi-scale shock technique simulation by coupling the quantum thermal
-bath with the shocked system.
+:line
 
-The person who created this package is Yuan Shen (sy0302 at
-stanford.edu) at Stanford University.  Contact him directly if you
-have questions.
+USER-QUIP package :link(USER-QUIP),h4
 
-Examples: examples/USER/qtb
+[Contents:]
 
-:line
+A "pair_style quip"_pair_quip.html command which wraps the "QUIP
+libAtoms library"_quip, which includes a variety of interatomic
+potentials, including Gaussian Approximation Potential (GAP) models
+developed by the Cambridge University group.
 
-USER-QUIP package :link(USER-QUIP),h5
+:link(quip,https://github.com/libAtoms/QUIP)
 
-Supporting info:
+To use this package you must have the QUIP libAtoms library available
+on your system.
 
-Examples: examples/USER/quip
+[Author:] Albert Bartok (Cambridge University).
 
-:line
+[Install or un-install:]
 
-USER-REAXC package :link(USER-REAXC),h5
+Note that to follow these steps to compile and link to the QUIP
+library, you must first download and build QUIP on your system.  It
+can be obtained from GitHub.  See step 1 and step 1.1 in the
+lib/quip/README file for details on how to do this.  Note that it
+requires setting two environment variables, QUIP_ROOT and QUIP_ARCH,
+which will be accessed by the lib/quip/Makefile.lammps file which is
+used when you compile and link LAMMPS with this package.  You should
+only need to edit this file if the LAMMPS build can not use its
+settings to successfully build on your system.
 
-Supporting info:
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-This package contains a implementation for LAMMPS of the ReaxFF force
-field.  ReaxFF uses distance-dependent bond-order functions to
-represent the contributions of chemical bonding to the potential
-energy.  It was originally developed by Adri van Duin and the Goddard
-group at CalTech.
+make yes-user-quip
+make machine :pre
+  
+make no-user-quip
+make machine :pre
+  
+[Supporting info:]
 
-The USER-REAXC version of ReaxFF (pair_style reax/c), implemented in
-C, should give identical or very similar results to pair_style reax,
-which is a ReaxFF implementation on top of a Fortran library, a
-version of which library was originally authored by Adri van Duin.
+src/USER-QUIP: filenames -> commands
+src/USER-QUIP/README
+"pair_style quip"_pair_quip.html
+examples/USER/quip :ul
 
-The reax/c version should be somewhat faster and more scalable,
-particularly with respect to the charge equilibration calculation.  It
-should also be easier to build and use since there are no complicating
-issues with Fortran memory allocation or linking to a Fortran library.
+:line
 
-For technical details about this implementation of ReaxFF, see
-this paper:
+USER-REAXC package :link(USER-REAXC),h4
 
-Parallel and Scalable Reactive Molecular Dynamics: Numerical Methods
-and Algorithmic Techniques, H. M. Aktulga, J. C. Fogarty,
-S. A. Pandit, A. Y. Grama, Parallel Computing, in press (2011).
+[Contents:]
 
-See the doc page for the pair_style reax/c command for details
-of how to use it in LAMMPS.
+A pair style which implements the ReaxFF potential in C/C++ (in
+contrast to the "REAX package"_#REAX and its Fortran library).  ReaxFF
+is a universal reactive force field.  See the src/USER-REAXC/README file
+for more info on differences between the two packages.  Also two fixes
+for monitoring molecules as bonds are created and destroyed.
 
-The person who created this package is Hasan Metin Aktulga (hmaktulga
-at lbl.gov), while at Purdue University.  Contact him directly, or
-Aidan Thompson at Sandia (athomps at sandia.gov), if you have
-questions.
+[Author:] Hasan Metin Aktulga (MSU) while at Purdue University.
 
-Examples: examples/reax
+[Install or un-install:]
+   
+make yes-user-reaxc
+make machine :pre
+  
+make no-user-reaxc
+make machine :pre
+  
+[Supporting info:]
+
+src/USER-REAXC: filenames -> commands
+src/USER-REAXC/README
+"pair_style reax/c"_pair_reaxc.html
+"fix reax/c/bonds"_fix_reax_bonds.html
+"fix reax/c/species"_fix_reaxc_species.html
+examples/reax :ul
 
 :line
 
-USER-SMD package :link(USER-SMD),h5
+USER-SMD package :link(USER-SMD),h4
 
-Supporting info:
+[Contents:]
 
-This package implements smoothed Mach dynamics (SMD) in
-LAMMPS.  Currently, the package has the following features:
+An atom style, fixes, computes, and several pair styles which
+implement smoothed Mach dynamics (SMD) for solids, which is a model
+related to smoothed particle hydrodynamics (SPH) for liquids (see the
+"USER-SPH package"_#USER-SPH).
 
-* Does liquids via traditional Smooth Particle Hydrodynamics (SPH)
+This package solves solid mechanics problems via a state of the art
+stabilized meshless method with hourglass control.  It can specify
+hydrostatic interactions independently from material strength models,
+i.e. pressure and deviatoric stresses are separated.  It provides many
+material models (Johnson-Cook, plasticity with hardening,
+Mie-Grueneisen, Polynomial EOS) and allows new material models to be
+added.  It implements rigid boundary conditions (walls) which can be
+specified as surface geometries from *.STL files.
 
-* Also solves solids mechanics problems via a state of the art
-  stabilized meshless method with hourglass control.
+[Author:] Georg Ganzenmuller (Fraunhofer-Institute for High-Speed
+Dynamics, Ernst Mach Institute, Germany).
 
-* Can specify hydrostatic interactions independently from material
-  strength models, i.e. pressure and deviatoric stresses are separated.
+[Install or un-install:]
 
-* Many material models available (Johnson-Cook, plasticity with
-  hardening, Mie-Grueneisen, Polynomial EOS).  Easy to add new
-  material models.
+Before building LAMMPS with this package, you must first download the
+Eigen library.  Eigen is a template library, so you do not need to
+build it, just download it.  You can do this manually if you prefer;
+follow the instructions in lib/smd/README.  You can also do it in one
+step from the lammps/src dir, using a command like these, which simply
+invoke the lib/smd/Install.py script with the specified args:
 
-* Rigid boundary conditions (walls) can be loaded as surface geometries
-  from *.STL files.
+make lib-smd                            # print help message
+make lib-smd args="-g -l"               # download in default lib/smd/eigen-eigen-*
+make lib-smd args="-h . eigen -g -l"    # download in lib/smd/eigen
+make lib-smd args="-h ~ eigen -g -l"    # download in ~/eigen :pre
 
-See the file doc/PDF/SMD_LAMMPS_userguide.pdf to get started.
+Note that the final -l switch is to create a symbolic (soft) link
+named "includelink" in lib/smd to point to the Eigen dir.  When LAMMPS
+builds it will use this link.  You should not need to edit the
+lib/smd/Makefile.lammps file.
 
-There are example scripts for using this package in examples/USER/smd.
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
 
-The person who created this package is Georg Ganzenmuller at the
-Fraunhofer-Institute for High-Speed Dynamics, Ernst Mach Institute in
-Germany (georg.ganzenmueller at emi.fhg.de).  Contact him directly if
-you have questions.
+make yes-user-smd
+make machine :pre
+  
+make no-user-smd
+make machine :pre
+  
+[Supporting info:]
 
-Examples: examples/USER/smd
+src/USER-SMD: filenames -> commands
+src/USER-SMD/README
+doc/PDF/SMD_LAMMPS_userguide.pdf
+examples/USER/smd
+http://lammps.sandia.gov/movies.html#smd :ul
 
 :line
 
-USER-SMTBQ package :link(USER-SMTBQ),h5
+USER-SMTBQ package :link(USER-SMTBQ),h4
 
-Supporting info:
+[Contents:]
 
-This package implements the Second Moment Tight Binding - QEq (SMTB-Q)
-potential for the description of ionocovalent bonds in oxides.
+A pair style which implements a Second Moment Tight Binding model with
+QEq charge equilibration (SMTBQ) potential for the description of
+ionocovalent bonds in oxides.
 
-There are example scripts for using this package in
-examples/USER/smtbq.
+[Authors:] Nicolas Salles, Emile Maras, Olivier Politano, and Robert
+Tetot (LAAS-CNRS, France).
 
-See this doc page to get started:
+[Install or un-install:]
+   
+make yes-user-smtbq
+make machine :pre
+  
+make no-user-smtbq
+make machine :pre
+  
+[Supporting info:]
 
+src/USER-SMTBQ: filenames -> commands
+src/USER-SMTBQ/README
 "pair_style smtbq"_pair_smtbq.html
+examples/USER/smtbq :ul
 
-The persons who created the USER-SMTBQ package are Nicolas Salles,
-Emile Maras, Olivier Politano, Robert Tetot, who can be contacted at
-these email addresses: lammps@u-bourgogne.fr, nsalles@laas.fr.  Contact
-them directly if you have any questions.
+:line
 
-Examples: examples/USER/smtbq
+USER-SPH package :link(USER-SPH),h4
 
-:line
+[Contents:]
 
-USER-SPH package :link(USER-SPH),h5
+An atom style, fixes, computes, and several pair styles which
+implement smoothed particle hydrodynamics (SPH) for liquids.  See the
+related "USER-SMD package"_#USER-SMD for smoothed Mach dynamics
+(SMD) for solids.
 
-Supporting info:
+This package contains ideal gas, Lennard-Jones, and Tait equations of
+state, and full support for complete (i.e. internal-energy dependent)
+equations of state.  It allows for plain or Monaghan's XSPH integration
+of the equations of motion.  It has options for density continuity or
+density summation to propagate the density field.  It has
+"set"_set.html command options to set the internal energy and density
+of particles from the input script and allows the same quantities to
+be output with thermodynamic output or to dump files via the "compute
+property/atom"_compute_property_atom.html command.
 
-This package implements smoothed particle hydrodynamics (SPH) in
-LAMMPS.  Currently, the package has the following features:
+[Author:] Georg Ganzenmuller (Fraunhofer-Institute for High-Speed
+Dynamics, Ernst Mach Institute, Germany).
 
-* Tait, ideal gas, Lennard-Jones equation of states, full support for
-  complete (i.e. internal-energy dependent) equations of state
+[Install or un-install:]
+   
+make yes-user-sph
+make machine :pre
+  
+make no-user-sph
+make machine :pre
+  
+[Supporting info:]
 
-* Plain or Monaghans XSPH integration of the equations of motion
+src/USER-SPH: filenames -> commands
+src/USER-SPH/README
+doc/PDF/SPH_LAMMPS_userguide.pdf
+examples/USER/sph
+http://lammps.sandia.gov/movies.html#sph :ul
 
-* Density continuity or density summation to propagate the density field
+:line
 
-* Commands to set internal energy and density of particles from the
-  input script
+USER-TALLY package :link(USER-TALLY),h4
 
-* Output commands to access internal energy and density for dumping and
-  thermo output
+[Contents:]
 
-See the file doc/PDF/SPH_LAMMPS_userguide.pdf to get started.
+Several compute styles that can be called when pairwise interactions
+are calculated to tally information (forces, heat flux, energy,
+stress, etc) about individual interactions.
 
-There are example scripts for using this package in examples/USER/sph.
+[Author:] Axel Kohlmeyer (Temple U).
 
-The person who created this package is Georg Ganzenmuller at the
-Fraunhofer-Institute for High-Speed Dynamics, Ernst Mach Institute in
-Germany (georg.ganzenmueller at emi.fhg.de).  Contact him directly if
-you have questions.
+[Install or un-install:]
+   
+make yes-user-tally
+make machine :pre
+  
+make no-user-tally
+make machine :pre
+  
+[Supporting info:]
 
-Examples: examples/USER/sph
+src/USER-TALLY: filenames -> commands
+src/USER-TALLY/README
+"compute */tally"_compute_tally.html
+examples/USER/tally :ul
 
 :line
 
-USER-TALLY package :link(USER-TALLY),h5
+USER-VTK package :link(USER-VTK),h4
 
-Supporting info:
+[Contents:]
 
-Examples: examples/USER/tally
+A "dump custom/vtk"_dump_custom_vtk.html command which outputs
+snapshot info in the "VTK format"_vtk, enabling visualization by
+"Paraview"_paraview or other visualization packages.
 
-:line
+:link(vtk,http://www.vtk.org)
+:link(paraview,http://www.paraview.org)
+
+To use this package you must have the VTK library available on your
+system.
+
+[Authors:] Richard Berger (JKU) and Daniel Queteschiner (DCS Computing).
 
-USER-VTK package :link(USER-VTK),h5
+[Install or un-install:]
+   
+The lib/vtk/Makefile.lammps file has settings for accessing VTK files
+and its library, which are required for LAMMPS to build and link with
+this package.  If the settings are not valid for your system, check if
+one of the other lib/vtk/Makefile.lammps.* files is compatible and
+copy it to Makefile.lammps.  If none of the provided files work, you
+will need to edit the Makefile.lammps file.
+
+You can then install/un-install the package and build LAMMPS in the
+usual manner:
+
+make yes-user-vtk
+make machine :pre
+  
+make no-user-vtk
+make machine :pre
+  
+[Supporting info:]
 
+src/USER-VTK: filenames -> commands
+src/USER-VTK/README
+lib/vtk/README
+"dump custom/vtk"_dump_custom_vtk.html :ul
diff --git a/doc/src/Section_start.txt b/doc/src/Section_start.txt
index 5a5de9ac9bb8d09d6a84ea42efab7d4059f4b21a..0a7209765e6f9bc2e62d14b020337bae35f72d4e 100644
--- a/doc/src/Section_start.txt
+++ b/doc/src/Section_start.txt
@@ -14,12 +14,11 @@ experienced users.
 2.1 "What's in the LAMMPS distribution"_#start_1
 2.2 "Making LAMMPS"_#start_2
 2.3 "Making LAMMPS with optional packages"_#start_3
-2.4 "Building LAMMPS via the Make.py script"_#start_4
-2.5 "Building LAMMPS as a library"_#start_5
-2.6 "Running LAMMPS"_#start_6
-2.7 "Command-line options"_#start_7
-2.8 "Screen output"_#start_8
-2.9 "Tips for users of previous versions"_#start_9 :all(b)
+2.4 "Building LAMMPS as a library"_#start_4
+2.5 "Running LAMMPS"_#start_5
+2.6 "Command-line options"_#start_6
+2.7 "Screen output"_#start_7
+2.8 "Tips for users of previous versions"_#start_8 :all(b)
 
 :line
 
@@ -80,7 +79,7 @@ This section has the following sub-sections:
 
 Read this first :h5,link(start_2_1)
 
-If you want to avoid building LAMMPS yourself, read the preceding
+If you want to avoid building LAMMPS yourself, read the preceding
 section about options available for downloading and installing
 executables.  Details are discussed on the "download"_download page.
 
@@ -96,7 +95,7 @@ make serial :pre
 Note that on a facility supercomputer, there are often "modules"
 loaded in your environment that provide the compilers and MPI you
 should use.  In this case, the "mpicxx" compile/link command in
-Makefile.mpi should just work by accessing those modules.
+Makefile.mpi should simply work by accessing those modules.
 
 It may be the case that one of the other Makefile.machine files in the
 src/MAKE sub-directories is a better match to your system (type "make"
@@ -107,33 +106,35 @@ make stampede :pre
 If any of these builds (with an existing Makefile.machine) works on
 your system, then you're done!
 
+If you need to install an optional package with a LAMMPS command you
+want to use, and the package does not depend on an extra library, you
+can simply type
+
+make yes-name :pre
+
+before invoking (or re-invoking) the above steps.  "Name" is the
+lower-case name of the package, e.g. replica or user-misc.
+
 If you want to do one of the following:
 
-use optional LAMMPS features that require additional libraries
-use optional packages that require additional libraries
-use optional accelerator packages that require special compiler/linker settings
-run on a specialized platform that has its own compilers, settings, or other libs to use :ul
+use a LAMMPS command that requires an extra library (e.g. "dump image"_dump_image.html)
+build with a package that requires an extra library
+build with an accelerator package that requires special compiler/linker settings
+run on a machine that has its own compilers, settings, or libraries :ul
 
 then building LAMMPS is more complicated.  You may need to find where
-auxiliary libraries exist on your machine or install them if they
-don't.  You may need to build additional libraries that are part of
-the LAMMPS package, before building LAMMPS.  You may need to edit a
+extra libraries exist on your machine or install them if they don't.
+You may need to build extra libraries that are included in the LAMMPS
+distribution, before building LAMMPS itself.  You may need to edit a
 Makefile.machine file to make it compatible with your system.
 
-Note that there is a Make.py tool in the src directory that automates
-several of these steps, but you still have to know what you are doing.
-"Section 2.4"_#start_4 below describes the tool.  It is a convenient
-way to work with installing/un-installing various packages, the
-Makefile.machine changes required by some packages, and the auxiliary
-libraries some of them use.
-
 Please read the following sections carefully.  If you are not
 comfortable with makefiles, or building codes on a Unix platform, or
 running an MPI job on your machine, please find a local expert to help
-you.  Many compilation, linking, and run problems that users have are
-often not really LAMMPS issues - they are peculiar to the user's
-system, compilers, libraries, etc.  Such questions are better answered
-by a local expert.
+you.  Many compilation, linking, and run problems users experience are
+often not LAMMPS issues - they are peculiar to the user's system,
+compilers, libraries, etc.  Such questions are better answered by a
+local expert.
 
 If you have a build problem that you are convinced is a LAMMPS issue
 (e.g. the compiler complains about a line of LAMMPS source code), then
@@ -251,7 +252,7 @@ re-compile, after typing "make clean" (which will describe different
 clean options).
 
 The LMP_INC variable is used to include options that turn on ifdefs
-within the LAMMPS code.  The options that are currently recognized are:
+within the LAMMPS code.  The options that are currently recognized are:
 
 -DLAMMPS_GZIP
 -DLAMMPS_JPEG
@@ -362,7 +363,7 @@ installed on your platform.  If MPI is installed on your system in the
 usual place (under /usr/local), you also may not need to specify these
 3 variables, assuming /usr/local is in your path.  On some large
 parallel machines which use "modules" for their compile/link
-environments, you may simply need to include the correct module in
+environments, you may simply need to include the correct module in
 your build environment, before building LAMMPS.  Or the parallel
 machine may have a vendor-provided MPI which the compiler has no
 trouble finding.
@@ -430,7 +431,7 @@ use the KISS library described above.
 You may also need to set the FFT_INC, FFT_PATH, and FFT_LIB variables,
 so the compiler and linker can find the needed FFT header and library
 files.  Note that on some large parallel machines which use "modules"
-for their compile/link environments, you may simply need to include
+for their compile/link environments, you may simply need to include
 the correct module in your build environment.  Or the parallel machine
 may have a vendor-provided FFT library which the compiler has no
 trouble finding.
@@ -450,12 +451,13 @@ you must also manually specify the correct library, namely -lsfftw or
 
 The FFT_INC variable also allows for a -DFFT_SINGLE setting that will
 use single-precision FFTs with PPPM, which can speed-up long-range
-calculations, particularly in parallel or on GPUs.  Fourier transform
+calculations, particularly in parallel or on GPUs.  Fourier transform
 and related PPPM operations are somewhat insensitive to floating point
 truncation errors and thus do not always need to be performed in
 double precision.  Using the -DFFT_SINGLE setting trades off a little
 accuracy for reduced memory use and parallel communication costs for
-transposing 3d FFT data.
+transposing 3d FFT data.  Note that single precision FFTs have only
+been tested with the FFTW3, FFTW2, MKL, and KISS FFT options.
 
 Step 7 :h6
 
@@ -507,13 +509,13 @@ You should get the executable lmp_foo when the build is complete.
 
 Errors that can occur when making LAMMPS: h5 :link(start_2_3)
 
-NOTE: If an error occurs when building LAMMPS, the compiler or linker
-will state very explicitly what the problem is.  The error message
-should give you a hint as to which of the steps above has failed, and
-what you need to do in order to fix it.  Building a code with a
-Makefile is a very logical process.  The compiler and linker need to
-find the appropriate files and those files need to be compatible with
-LAMMPS source files.  When a make fails, there is usually a very
+If an error occurs when building LAMMPS, the compiler or linker will
+state very explicitly what the problem is.  The error message should
+give you a hint as to which of the steps above has failed, and what
+you need to do in order to fix it.  Building a code with a Makefile is
+a very logical process.  The compiler and linker need to find the
+appropriate files and those files need to be compatible with LAMMPS
+settings and source files.  When a make fails, there is usually a very
 simple reason, which you or a local expert will need to fix.
 
 Here are two non-obvious errors that can occur:
@@ -556,7 +558,8 @@ Typing "make clean-all" or "make clean-machine" will delete *.o object
 files created when LAMMPS is built, for either all builds or for a
 particular machine.
 
-Changing the LAMMPS size limits via -DLAMMPS_SMALLBIG or -DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL :h6
+Changing the LAMMPS size limits via -DLAMMPS_SMALLBIG or
+-DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL :h6
 
 As explained above, any of these 3 settings can be specified on the
 LMP_INC line in your low-level src/MAKE/Makefile.foo.
@@ -655,11 +658,6 @@ This section has the following sub-sections:
 2.3.3 "Packages that require extra libraries"_#start_3_3
 2.3.4 "Packages that require Makefile.machine settings"_#start_3_4 :all(b)
 
-Note that the following "Section 2.4"_#start_4 describes the Make.py
-tool which can be used to install/un-install packages and build the
-auxiliary libraries which some of them use.  It can also auto-edit a
-Makefile.machine to add settings needed by some packages.
-
 :line
 
 Package basics: :h5,link(start_3_1)
@@ -669,235 +667,221 @@ are always included, plus optional packages.  Packages are groups of
 files that enable a specific set of features.  For example, force
 fields for molecular systems or granular systems are in packages.
 
-"Section 4"_Section_packages.html in the manual has details
-about all the packages, including specific instructions for building
-LAMMPS with each package, which are covered in a more general manner
+"Section 4"_Section_packages.html in the manual has details about all
+the packages, which come in two flavors: [standard] and [user]
+packages. It also has specific instructions for building LAMMPS with
+any package which requires an extra library.  General instructions are
 below.
 
 You can see the list of all packages by typing "make package" from
-within the src directory of the LAMMPS distribution.  This also lists
-various make commands that can be used to manipulate packages.
+within the src directory of the LAMMPS distribution.  It will also
+list various make commands that can be used to manage packages.
 
 If you use a command in a LAMMPS input script that is part of a
 package, you must have built LAMMPS with that package, else you will
 get an error that the style is invalid or the command is unknown.
-Every command's doc page specifies if it is part of a package.  You can
-also type
+Every command's doc page specifies if it is part of a package.  You can
+type
 
 lmp_machine -h :pre
 
 to run your executable with the optional "-h command-line
-switch"_#start_7 for "help", which will simply list the styles and
-commands known to your executable, and immediately exit.
-
-There are two kinds of packages in LAMMPS, standard and user packages.
-More information about the contents of standard and user packages is
-given in "Section 4"_Section_packages.html of the manual.  The
-difference between standard and user packages is as follows:
-
-Standard packages, such as molecule or kspace, are supported by the
-LAMMPS developers and are written in a syntax and style consistent
-with the rest of LAMMPS.  This means we will answer questions about
-them, debug and fix them if necessary, and keep them compatible with
-future changes to LAMMPS.
-
-User packages, such as user-atc or user-omp, have been contributed by
-users, and always begin with the user prefix.  If they are a single
-command (single file), they are typically in the user-misc package.
-Otherwise, they are a set of files grouped together which add a
-specific functionality to the code.
-
-User packages don't necessarily meet the requirements of the standard
-packages.  If you have problems using a feature provided in a user
-package, you may need to contact the contributor directly to get help.
-Information on how to submit additions you make to LAMMPS as single
-files or either a standard or user-contributed package are given in
-"this section"_Section_modify.html#mod_15 of the documentation.
+switch"_#start_6 for "help", which will list the styles and commands
+known to your executable, and immediately exit.
 
 :line
 
 Including/excluding packages :h5,link(start_3_2)
 
-To use (or not use) a package you must include it (or exclude it)
-before building LAMMPS.  From the src directory, this is typically as
-simple as:
+To use (or not use) a package you must install it (or un-install it)
+before building LAMMPS.  From the src directory, this is as simple as:
 
 make yes-colloid
 make mpi :pre
 
 or
 
-make no-manybody
+make no-user-omp
 make mpi :pre
 
-NOTE: You should NOT include/exclude packages and build LAMMPS in a
+NOTE: You should NOT install/un-install packages and build LAMMPS in a
 single make command using multiple targets, e.g. make yes-colloid mpi.
 This is because the make procedure creates a list of source files that
 will be out-of-date for the build if the package configuration changes
 within the same command.
 
-Some packages have individual files that depend on other packages
-being included.  LAMMPS checks for this and does the right thing.
-I.e. individual files are only included if their dependencies are
-already included.  Likewise, if a package is excluded, other files
+Any package can be installed or not in a LAMMPS build, independent of
+all other packages.  However, some packages include files derived from
+files in other packages.  LAMMPS checks for this and does the right
+thing.  I.e. individual files are only included if their dependencies
+are already included.  Likewise, if a package is excluded, other files
 dependent on that package are also excluded.
 
+NOTE: The one exception is that we do not recommend building with both
+the KOKKOS package installed and any of the other acceleration
+packages (GPU, OPT, USER-INTEL, USER-OMP) also installed.  This is
+because of how Kokkos sometimes builds using a wrapper compiler which
+can make it difficult to invoke all the compile/link flags correctly
+for both Kokkos and non-Kokkos files.
+
 If you will never run simulations that use the features in a
 particular packages, there is no reason to include it in your build.
-For some packages, this will keep you from having to build auxiliary
-libraries (see below), and will also produce a smaller executable
-which may run a bit faster.
-
-When you download a LAMMPS tarball, these packages are pre-installed
-in the src directory: KSPACE, MANYBODY,MOLECULE, because they are so
-commonly used.  When you download LAMMPS source files from the SVN or
-Git repositories, no packages are pre-installed.
-
-Packages are included or excluded by typing "make yes-name" or "make
-no-name", where "name" is the name of the package in lower-case, e.g.
-name = kspace for the KSPACE package or name = user-atc for the
-USER-ATC package.  You can also type "make yes-standard", "make
-no-standard", "make yes-std", "make no-std", "make yes-user", "make
-no-user", "make yes-lib", "make no-lib", "make yes-all", or "make
-no-all" to include/exclude various sets of packages.  Type "make
-package" to see all of the package-related make options.
-
-NOTE: Inclusion/exclusion of a package works by simply moving files
-back and forth between the main src directory and sub-directories with
-the package name (e.g. src/KSPACE, src/USER-ATC), so that the files
-are seen or not seen when LAMMPS is built.  After you have included or
-excluded a package, you must re-build LAMMPS.
-
-Additional package-related make options exist to help manage LAMMPS
-files that exist in both the src directory and in package
-sub-directories.  You do not normally need to use these commands
-unless you are editing LAMMPS files or have downloaded a patch from
-the LAMMPS WWW site.
-
-Typing "make package-update" or "make pu" will overwrite src files
-with files from the package sub-directories if the package has been
-included.  It should be used after a patch is installed, since patches
-only update the files in the package sub-directory, but not the src
-files.  Typing "make package-overwrite" will overwrite files in the
-package sub-directories with src files.
+For some packages, this will keep you from having to build extra
+libraries, and will also produce a smaller executable which may run a
+bit faster.
+
+When you download a LAMMPS tarball, three packages are pre-installed
+in the src directory -- KSPACE, MANYBODY, MOLECULE -- because they are
+so commonly used.  When you download LAMMPS source files from the SVN
+or Git repositories, no packages are pre-installed.
+
+Packages are installed or un-installed by typing
+
+make yes-name
+make no-name :pre
+
+where "name" is the name of the package in lower-case, e.g.  name =
+kspace for the KSPACE package or name = user-atc for the USER-ATC
+package.  You can also type any of these commands:
+
+make yes-all | install all packages
+make no-all | un-install all packages
+make yes-standard or make yes-std | install standard packages
+make no-standard or make no-std | un-install standard packages
+make yes-user | install user packages
+make no-user | un-install user packages
+make yes-lib | install packages that require extra libraries
+make no-lib | un-install packages that require extra libraries
+make yes-ext | install packages that require external libraries
+make no-ext | un-install packages that require external libraries :tb(s=|)
+
+which install/un-install various sets of packages.  Typing "make
+package" will list all of these commands.
+
+NOTE: Installing or un-installing a package works by simply moving
+files back and forth between the main src directory and
+sub-directories with the package name (e.g. src/KSPACE, src/USER-ATC),
+so that the files are included or excluded when LAMMPS is built.
+After you have installed or un-installed a package, you must re-build
+LAMMPS for the action to take effect.
+
+The following make commands help manage files that exist in both the
+src directory and in package sub-directories.  You do not normally
+need to use these commands unless you are editing LAMMPS files or have
+downloaded a patch from the LAMMPS web site.
 
 Typing "make package-status" or "make ps" will show which packages are
-currently included. For those that are included, it will list any
+currently installed. For those that are installed, it will list any
 files that are different in the src directory and package
-sub-directory.  Typing "make package-diff" lists all differences
-between these files.  Again, type "make package" to see all of the
-package-related make options.
+sub-directory.
 
-:line
+Typing "make package-update" or "make pu" will overwrite src files
+with files from the package sub-directories if the package is
+installed.  It should be used after a patch has been applied, since
+patches only update the files in the package sub-directory, but not
+the src files.
 
-Packages that require extra libraries :h5,link(start_3_3)
+Typing "make package-overwrite" will overwrite files in the package
+sub-directories with src files.
 
-A few of the standard and user packages require additional auxiliary
-libraries.  Many of them are provided with LAMMPS, in which case they
-must be compiled first, before LAMMPS is built, if you wish to include
-that package.  If you get a LAMMPS build error about a missing
-library, this is likely the reason.  See the
-"Section 4"_Section_packages.html doc page for a list of
-packages that have these kinds of auxiliary libraries.
-
-The lib directory in the distribution has sub-directories with package
-names that correspond to the needed auxiliary libs, e.g. lib/gpu.
-Each sub-directory has a README file that gives more details.  Code
-for most of the auxiliary libraries is included in that directory.
-Examples are the USER-ATC and MEAM packages.
-
-A few of the lib sub-directories do not include code, but do include
-instructions (and sometimes scripts) that automate the process of
-downloading the auxiliary library and installing it so LAMMPS can link
-to it.  Examples are the KIM, VORONOI, USER-MOLFILE, and USER-SMD
-packages.
-
-The lib/python directory (for the PYTHON package) contains only a
-choice of Makefile.lammps.* files.  This is because no auxiliary code
-or libraries are needed, only the Python library and other system libs
-that should already available on your system.  However, the
-Makefile.lammps file is needed to tell LAMMPS which libs to use and
-where to find them.
-
-For libraries with provided code, the sub-directory README file
-(e.g. lib/atc/README) has instructions on how to build that library.
-This information is also summarized in "Section
-4"_Section_packages.html.  Typically this is done by typing
-something like:
+Typing "make package-diff" lists all differences between these files.
 
-make -f Makefile.g++ :pre
-
-If one of the provided Makefiles is not appropriate for your system
-you will need to edit or add one.  Note that all the Makefiles have a
-setting for EXTRAMAKE at the top that specifies a Makefile.lammps.*
-file.
-
-If the library build is successful, it will produce 2 files in the lib
-directory:
-
-libpackage.a
-Makefile.lammps :pre
-
-The Makefile.lammps file will typically be a copy of one of the
-Makefile.lammps.* files in the library directory.
-
-Note that you must insure that the settings in Makefile.lammps are
-appropriate for your system.  If they are not, the LAMMPS build may
-fail.  To fix this, you can edit or create a new Makefile.lammps.*
-file for your system, and copy it to Makefile.lammps.
-
-As explained in the lib/package/README files, the settings in
-Makefile.lammps are used to specify additional system libraries and
-their locations so that LAMMPS can build with the auxiliary library.
-For example, if the MEAM package is used, the auxiliary library
-consists of F90 code, built with a Fortran complier.  To link that
-library with LAMMPS (a C++ code) via whatever C++ compiler LAMMPS is
-built with, typically requires additional Fortran-to-C libraries be
-included in the link.  Another example are the BLAS and LAPACK
-libraries needed to use the USER-ATC or USER-AWPMD packages.
-
-For libraries without provided code, the sub-directory README file has
-information on where to download the library and how to build it,
-e.g. lib/voronoi/README and lib/smd/README.  The README files also
-describe how you must either (a) create soft links, via the "ln"
-command, in those directories to point to where you built or installed
-the packages, or (b) check or edit the Makefile.lammps file in the
-same directory to provide that information.
-
-Some of the sub-directories, e.g. lib/voronoi, also have an install.py
-script which can be used to automate the process of
-downloading/building/installing the auxiliary library, and setting the
-needed soft links.  Type "python install.py" for further instructions.
-
-As with the sub-directories containing library code, if the soft links
-or settings in the lib/package/Makefile.lammps files are not correct,
-the LAMMPS build will typically fail.
+Again, just type "make package" to see all of the package-related make
+options.
 
 :line
 
-Packages that require Makefile.machine settings :h5,link(start_3_4)
-
-A few packages require specific settings in Makefile.machine, to
-either build or use the package effectively.  These are the
-USER-INTEL, KOKKOS, USER-OMP, and OPT packages, used for accelerating
-code performance on CPUs or other hardware, as discussed in "Section
-5.3"_Section_accelerate.html#acc_3.
+Packages that require extra libraries :h5,link(start_3_3)
 
-A summary of what Makefile.machine changes are needed for each of
-these packages is given in "Section 4"_Section_packages.html.
-The details are given on the doc pages that describe each of these
-accelerator packages in detail:
+A few of the standard and user packages require extra libraries.  See
+"Section 4"_Section_packages.html for two tables of packages which
+indicate which ones require libraries.  For each such package, the
+Section 4 doc page gives details on how to build the extra library,
+including how to download it if necessary.  The basic ideas are
+summarized here.
+
+[System libraries:]
+
+Packages in the tables "Section 4"_Section_packages.html with a "sys"
+in the last column link to system libraries that typically already
+exist on your machine.  E.g. the python package links to a system
+Python library.  If your machine does not have the required library,
+you will have to download and install it on your machine, in either
+the system or user space.
+
+[Internal libraries:]
+
+Packages in the tables "Section 4"_Section_packages.html with an "int"
+in the last column link to internal libraries whose source code is
+included with LAMMPS, in the lib/name directory where name is the
+package name.  You must first build the library in that directory
+before building LAMMPS with that package installed.  E.g. the gpu
+package links to a library you build in the lib/gpu dir.  You can
+often do the build in one step by typing "make lib-name args=..."
+from the src dir, with appropriate arguments.  You can leave off the
+args to see a help message.  See "Section 4"_Section_packages.html for
+details for each package.
+
+[External libraries:]
+
+Packages in the tables "Section 4"_Section_packages.html with an "ext"
+in the last column link to external libraries whose source code is not
+included with LAMMPS.  You must first download and install the library
+before building LAMMPS with that package installed.  E.g. the voronoi
+package links to the freely available "Voro++ library"_voronoi.  You
+can often do the download/build in one step by typing "make lib-name
+args=..." from the src dir, with appropriate arguments.  You can leave
+off the args to see a help message.  See "Section
+4"_Section_packages.html for details for each package.
+
+:link(voronoi,http://math.lbl.gov/voro++)
+
+[Possible errors:]
+
+There are various common errors which can occur when building extra
+libraries or when building LAMMPS with packages that require the extra
+libraries.
+
+If you cannot build the extra library itself successfully, you may
+need to edit or create an appropriate Makefile for your machine, e.g.
+with appropriate compiler or system settings.  Provided makefiles are
+typically in the lib/name directory.  E.g. see the Makefile.* files in
+lib/gpu.
+
+The LAMMPS build often uses settings in a lib/name/Makefile.lammps
+file which either exists in the LAMMPS distribution or is created or
+copied from a lib/name/Makefile.lammps.* file when the library is
+built.  If those settings are not correct for your machine you will
+need to edit or create an appropriate Makefile.lammps file.
+
+Package-specific details for these steps are given in "Section
+4"_Section_packages.html and in README files in the lib/name
+directories.
+
+[Compiler options needed for accelerator packages:]
+
+Several packages contain code that is optimized for specific hardware,
+e.g. CPU, KNL, or GPU.  These are the OPT, GPU, KOKKOS, USER-INTEL,
+and USER-OMP packages.  Compiling and linking the source files in
+these accelerator packages for optimal performance requires specific
+settings in the Makefile.machine file you use.
+
+A summary of the Makefile.machine settings needed for each of these
+packages is given in "Section 4"_Section_packages.html.  More info is
+given on the doc pages that describe each package in detail:
 
 5.3.1 "USER-INTEL package"_accelerate_intel.html
+5.3.2 "GPU package"_accelerate_gpu.html
 5.3.3 "KOKKOS package"_accelerate_kokkos.html
 5.3.4 "USER-OMP package"_accelerate_omp.html
 5.3.5 "OPT package"_accelerate_opt.html :all(b)
 
-You can also look at the following machine Makefiles in
-src/MAKE/OPTIONS, which include the changes.  Note that the USER-INTEL
-and KOKKOS packages allow for settings that build LAMMPS for different
-hardware.  The USER-INTEL package builds for CPU and the Xeon Phi, the
-KOKKOS package builds for OpenMP, GPUs (Cuda), and the Xeon Phi.
+You can also use or examine the following machine Makefiles in
+src/MAKE/OPTIONS, which include the settings.  Note that the
+USER-INTEL and KOKKOS packages can use settings that build LAMMPS for
+different hardware.  The USER-INTEL package can be compiled for Intel
+CPUs and KNLs; the KOKKOS package builds for CPUs (OpenMP), GPUs
+(Cuda), and Intel KNLs.
 
 Makefile.intel_cpu
 Makefile.intel_phi
@@ -907,127 +891,9 @@ Makefile.kokkos_phi
 Makefile.omp
 Makefile.opt :ul
 
-Also note that the Make.py tool, described in the next "Section
-2.4"_#start_4 can automatically add the needed info to an existing
-machine Makefile, using simple command-line arguments.
-
-:line
-
-2.4 Building LAMMPS via the Make.py tool :h4,link(start_4)
-
-The src directory includes a Make.py script, written in Python, which
-can be used to automate various steps of the build process.  It is
-particularly useful for working with the accelerator packages, as well
-as other packages which require auxiliary libraries to be built.
-
-The goal of the Make.py tool is to allow any complex multi-step LAMMPS
-build to be performed as a single Make.py command.  And you can
-archive the commands, so they can be re-invoked later via the -r
-(redo) switch.  If you find some LAMMPS build procedure that can't be
-done in a single Make.py command, let the developers know, and we'll
-see if we can augment the tool.
-
-You can run Make.py from the src directory by typing either:
-
-Make.py -h
-python Make.py -h :pre
-
-which will give you help info about the tool.  For the former to work,
-you may need to edit the first line of Make.py to point to your local
-Python.  And you may need to insure the script is executable:
-
-chmod +x Make.py :pre
-
-Here are examples of build tasks you can perform with Make.py:
-
-Install/uninstall packages: Make.py -p no-lib kokkos omp intel
-Build specific auxiliary libs: Make.py -a lib-atc lib-meam
-Build libs for all installed packages: Make.py -p cuda gpu -gpu mode=double arch=31 -a lib-all
-Create a Makefile from scratch with compiler and MPI settings: Make.py -m none -cc g++ -mpi mpich -a file
-Augment Makefile.serial with settings for installed packages: Make.py -p intel -intel cpu -m serial -a file
-Add JPG and FFTW support to Makefile.mpi: Make.py -m mpi -jpg -fft fftw -a file
-Build LAMMPS with a parallel make using Makefile.mpi: Make.py -j 16 -m mpi -a exe
-Build LAMMPS and libs it needs using Makefile.serial with accelerator settings: Make.py -p gpu intel -intel cpu -a lib-all file serial :tb(s=:)
-
-The bench and examples directories give Make.py commands that can be
-used to build LAMMPS with the various packages and options needed to
-run all the benchmark and example input scripts.  See these files for
-more details:
-
-bench/README
-bench/FERMI/README
-bench/KEPLER/README
-bench/PHI/README
-examples/README
-examples/accelerate/README
-examples/accelerate/make.list :ul
-
-All of the Make.py options and syntax help can be accessed by using
-the "-h" switch.
-
-E.g. typing "Make.py -h" gives
-
-Syntax: Make.py switch args ...
-  switches can be listed in any order
-  help switch:
-    -h prints help and syntax for all other specified switches
-  switch for actions:
-    -a lib-all, lib-dir, clean, file, exe or machine
-    list one or more actions, in any order
-    machine is a Makefile.machine suffix, must be last if used
-  one-letter switches:
-    -d (dir), -j (jmake), -m (makefile), -o (output),
-    -p (packages), -r (redo), -s (settings), -v (verbose)
-  switches for libs:
-    -atc, -awpmd, -colvars, -cuda
-    -gpu, -meam, -poems, -qmmm, -reax
-  switches for build and makefile options:
-    -intel, -kokkos, -cc, -mpi, -fft, -jpg, -png :pre
-
-Using the "-h" switch with other switches and actions gives additional
-info on all the other specified switches or actions.  The "-h" can be
-anywhere in the command-line and the other switches do not need their
-arguments.  E.g. type "Make.py -h -d -atc -intel" will print:
-
--d dir
-  dir = LAMMPS home dir
-  if -d not specified, working dir must be lammps/src :pre
-
--atc make=suffix lammps=suffix2
-  all args are optional and can be in any order
-  make = use Makefile.suffix (def = g++)
-  lammps = use Makefile.lammps.suffix2 (def = EXTRAMAKE in makefile) :pre
-
--intel mode
-  mode = cpu or phi (def = cpu)
-    build Intel package for CPU or Xeon Phi :pre
-
-Note that Make.py never overwrites an existing Makefile.machine.
-Instead, it creates src/MAKE/MINE/Makefile.auto, which you can save or
-rename if desired.  Likewise it creates an executable named
-src/lmp_auto, which you can rename using the -o switch if desired.
-
-The most recently executed Make.py command is saved in
-src/Make.py.last.  You can use the "-r" switch (for redo) to re-invoke
-the last command, or you can save a sequence of one or more Make.py
-commands to a file and invoke the file of commands using "-r".  You
-can also label the commands in the file and invoke one or more of them
-by name.
-
-A typical use of Make.py is to start with a valid Makefile.machine for
-your system, that works for a vanilla LAMMPS build, i.e. when optional
-packages are not installed.  You can then use Make.py to add various
-settings (FFT, JPG, PNG) to the Makefile.machine as well as change its
-compiler and MPI options.  You can also add additional packages to the
-build, as well as build the needed supporting libraries.
-
-You can also use Make.py to create a new Makefile.machine from
-scratch, using the "-m none" switch, if you also specify what compiler
-and MPI options to use, via the "-cc" and "-mpi" switches.
-
 :line
 
-2.5 Building LAMMPS as a library :h4,link(start_5)
+2.4 Building LAMMPS as a library :h4,link(start_4)
 
 LAMMPS can be built as either a static or shared library, which can
 then be called from another application or a scripting language.  See
@@ -1063,7 +929,7 @@ src/MAKE/Makefile.foo and perform the build in the directory
 Obj_shared_foo.  This is so that each file can be compiled with the
 -fPIC flag which is required for inclusion in a shared library.  The
 build will create the file liblammps_foo.so which another application
-can link to dynamically.  It will also create a soft link liblammps.so,
+can link to dynamically.  It will also create a soft link liblammps.so,
 which will point to the most recently built shared library.  This is
 the file the Python wrapper loads by default.
 
@@ -1149,7 +1015,7 @@ interface and how to extend it for your needs.
 
 :line
 
-2.6 Running LAMMPS :h4,link(start_6)
+2.5 Running LAMMPS :h4,link(start_5)
 
 By default, LAMMPS runs by reading commands from standard input.  Thus
 if you run the LAMMPS executable by itself, e.g.
@@ -1281,7 +1147,7 @@ more processors or setup a smaller problem.
 
 :line
 
-2.7 Command-line options :h4,link(start_7)
+2.6 Command-line options :h4,link(start_6)
 
 At run time, LAMMPS recognizes several optional command-line switches
 which may be used in any order.  Either the full word or a one-or-two
@@ -1415,8 +1281,8 @@ LAMMPS is compiled with CUDA=yes.
 numa Nm :pre
 
 This option is only relevant when using pthreads with hwloc support.
-In this case Nm defines the number of NUMA regions (typically sockets)
-on a node which will be utilized by a single MPI rank.  By default Nm
+In this case Nm defines the number of NUMA regions (typically sockets)
+on a node which will be utilized by a single MPI rank.  By default Nm
 = 1.  If this option is used the total number of worker-threads per
 MPI rank is threads*numa.  Currently it is always almost better to
 assign at least one MPI rank per NUMA region, and leave numa set to
@@ -1480,7 +1346,7 @@ replica runs on on one or a few processors.  Note that with MPI
 installed on a machine (e.g. your desktop), you can run on more
 (virtual) processors than you have physical processors.
 
-To run multiple independent simulations from one input script, using
+To run multiple independent simulations from one input script, using
 multiple partitions, see "Section 6.4"_Section_howto.html#howto_4
 of the manual.  World- and universe-style "variables"_variable.html
 are useful in this context.
@@ -1711,7 +1577,7 @@ negative numeric value.  It is OK if the first value1 starts with a
 
 :line
 
-2.8 LAMMPS screen output :h4,link(start_8)
+2.7 LAMMPS screen output :h4,link(start_7)
 
 As LAMMPS reads an input script, it prints information to both the
 screen and a log file about significant actions it takes to setup a
@@ -1759,7 +1625,7 @@ The first section provides a global loop timing summary. The {loop time}
 is the total wall time for the section.  The {Performance} line is
 provided for convenience to help predicting the number of loop
 continuations required and for comparing performance with other,
-similar MD codes.  The {CPU use} line provides the CPU utilization per
+similar MD codes.  The {CPU use} line provides the CPU utilization per
 MPI task; it should be close to 100% times the number of OpenMP
 threads (or 1 of no OpenMP). Lower numbers correspond to delays due
 to file I/O or insufficient thread utilization.
@@ -1867,7 +1733,7 @@ communication, roughly 75% in the example above.
 
 :line
 
-2.9 Tips for users of previous LAMMPS versions :h4,link(start_9)
+2.8 Tips for users of previous LAMMPS versions :h4,link(start_8)
 
 The current C++ began with a complete rewrite of LAMMPS 2001, which
 was written in F90.  Features of earlier versions of LAMMPS are listed
diff --git a/doc/src/Section_tools.txt b/doc/src/Section_tools.txt
index 03611c7cdb2db0a8cf57183b34e68aaf920fca33..d95c4f0cd40555a18a1c0a315d54cff6f851e53d 100644
--- a/doc/src/Section_tools.txt
+++ b/doc/src/Section_tools.txt
@@ -369,15 +369,18 @@ supports it.  It has its own WWW page at
 
 msi2lmp tool :h4,link(msi)
 
-The msi2lmp sub-directory contains a tool for creating LAMMPS input
-data files from BIOVIA's Materias Studio files (formerly Accelrys'
+The msi2lmp sub-directory contains a tool for creating LAMMPS template
+input and data files from BIOVIA's Materials Studio files (formerly Accelrys'
 Insight MD code, formerly MSI/Biosym and its Discover MD code).
 
 This tool was written by John Carpenter (Cray), Michael Peachey
 (Cray), and Steve Lustig (Dupont). Several people contributed changes
 to remove bugs and adapt its output to changes in LAMMPS.
 
-See the README file for more information.
+This tool has several known limitations and is no longer under active
+development, so there are no changes except for the occasional bugfix.
+
+See the README file in the tools/msi2lmp folder for more information.
 
 :line
 
diff --git a/doc/src/angle_sdk.txt b/doc/src/angle_sdk.txt
index 785585f840fc75ba75f37f1f2bec2c896dcfe24b..0cc535e543f878bba02b38b9a5fa18e65da0aea2 100644
--- a/doc/src/angle_sdk.txt
+++ b/doc/src/angle_sdk.txt
@@ -46,7 +46,7 @@ from the pair_style.
 [Restrictions:]
 
 This angle style can only be used if LAMMPS was built with the
-USER-CG-CMM package.  See the "Making
+USER-CGSDK package.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info on packages.
 
 [Related commands:]
diff --git a/doc/src/bonds.txt b/doc/src/bonds.txt
index 3b50f6482f9ed4bab849048c4f15f7f436aabe58..169d56ecbe4e7c75fce57abd58078dff9404a0cb 100644
--- a/doc/src/bonds.txt
+++ b/doc/src/bonds.txt
@@ -16,7 +16,6 @@ Bond Styles :h1
    bond_none
    bond_nonlinear
    bond_oxdna
-   bond_oxdna2
    bond_quartic
    bond_table
    bond_zero
diff --git a/doc/src/compute_sna_atom.txt b/doc/src/compute_sna_atom.txt
index e2df706473e178202b1fbb10985cade08dce831f..f82df0d81601eb16d6f0ba578ab6783cc735f7a5 100644
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@@ -24,7 +24,7 @@ twojmax = band limit for bispectrum components (non-negative integer) :l
 R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
 w_1, w_2,... = list of neighbor weights, one for each type  :l
 zero or more keyword/value pairs may be appended :l
-keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} :l
+keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
   {diagonal} value = {0} or {1} or {2} or {3}
      {0} = all j1, j2, j <= twojmax, j2 <= j1
      {1} = subset satisfying j1 == j2
@@ -36,7 +36,10 @@ keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} :l
      {1} = use switching function
   {bzeroflag} value = {0} or {1}
      {0} = do not subtract B0
-     {1} = subtract B0 :pre
+     {1} = subtract B0
+  {quadraticflag} value = {0} or {1}
+     {0} = do not generate quadratic terms
+     {1} = generate quadratic terms :pre
 :ule
 
 [Examples:]
@@ -151,7 +154,7 @@ linear mapping from radial distance to polar angle {theta0} on the
 The argument {twojmax} and the keyword {diagonal} define which
 bispectrum components are generated. See section below on output for a
 detailed explanation of the number of bispectrum components and the
-ordered in which they are listed
+order in which they are listed.
 
 The keyword {switchflag} can be used to turn off the switching
 function.
@@ -162,6 +165,14 @@ the calculated bispectrum components. This optional keyword is only
 available for compute {sna/atom}, as {snad/atom} and {snav/atom}
 are unaffected by the removal of constant terms.
 
+The keyword {quadraticflag} determines whether or not the
+quadratic analogs to the bispectrum quantities are generated.
+These are formed by taking the outer product of the vector
+of bispectrum components with itself.
+See section below on output for a
+detailed explanation of the number of quadratic terms and the
+order in which they are listed.
+
 NOTE: If you have a bonded system, then the settings of
 "special_bonds"_special_bonds.html command can remove pairwise
 interactions between atoms in the same bond, angle, or dihedral.  This
@@ -180,7 +191,7 @@ command that includes all pairs in the neighbor list.
 
 Compute {sna/atom} calculates a per-atom array, each column
 corresponding to a particular bispectrum component.  The total number
-of columns and the identities of the bispectrum component contained in
+of columns and the identity of the bispectrum component contained in
 each column depend on the values of {twojmax} and {diagonal}, as
 described by the following piece of python code:
 
@@ -213,6 +224,19 @@ block contains six sub-blocks corresponding to the {xx}, {yy}, {zz},
 notation.  Each of these sub-blocks contains one column for each
 bispectrum component, the same as for compute {sna/atom}
 
+For example, if {K}=30 and ntypes=1, the numbers of columns in the per-atom
+arrays generated by {sna/atom}, {snad/atom}, and {snav/atom}
+are 30, 90, and 180, respectively. With {quadraticflag} value=1,
+the numbers of columns are 930, 2790, and 5580, respectively.
+
+If the {quadraticflag} keyword value is set to 1, then additional
+columns are appended to each per-atom array, corresponding to
+a matrix of quantities that are products of two bispectrum components. If the
+number of bispectrum components is {K}, then the number of matrix elements
+is {K}^2. These are output in subblocks of {K}^2 columns, using the same
+ordering of columns and sub-blocks as was used for the bispectrum
+components.
+
 These values can be accessed by any command that uses per-atom values
 from a compute as input.  See "Section
 6.15"_Section_howto.html#howto_15 for an overview of LAMMPS output
@@ -231,7 +255,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 [Default:]
 
 The optional keyword defaults are {diagonal} = 0, {rmin0} = 0,
-{switchflag} = 1, {bzeroflag} = 0.
+{switchflag} = 1, {bzeroflag} = 1, {quadraticflag} = 0.
 
 :line
 
diff --git a/doc/src/dump.txt b/doc/src/dump.txt
index cb9a5ba74112849fb220344a720ea5e66015fe75..69a00eb473735f9115bae72efb682095675ff4e6 100644
--- a/doc/src/dump.txt
+++ b/doc/src/dump.txt
@@ -7,12 +7,12 @@
 :line
 
 dump command :h3
-"dump custom/vtk"_dump_custom_vtk.html command :h3
+"dump vtk"_dump_vtk.html command :h3
 "dump h5md"_dump_h5md.html command :h3
+"dump molfile"_dump_molfile.html command :h3
+"dump netcdf"_dump_netcdf.html command :h3
 "dump image"_dump_image.html command :h3
 "dump movie"_dump_image.html command :h3
-"dump molfile"_dump_molfile.html command :h3
-"dump nc"_dump_nc.html command :h3
 
 [Syntax:]
 
@@ -20,7 +20,7 @@ dump ID group-ID style N file args :pre
 
 ID = user-assigned name for the dump :ulb,l
 group-ID = ID of the group of atoms to be dumped :l
-style = {atom} or {atom/gz} or {atom/mpiio} or {cfg} or {cfg/gz} or {cfg/mpiio} or {dcd} or {xtc} or {xyz} or {xyz/gz} or {xyz/mpiio} or {h5md} or {image} or {movie} or {molfile} or {local} or {custom} or {custom/gz} or {custom/mpiio} :l
+style = {atom} or {atom/gz} or {atom/mpiio} or {cfg} or {cfg/gz} or {cfg/mpiio} or {custom} or {custom/gz} or {custom/mpiio} or {dcd} or {h5md} or {image} or {local} or {molfile} or {movie} or {netcdf} or {netcdf/mpiio} or {vtk} or {xtc} or {xyz} or {xyz/gz} or {xyz/mpiio} :l
 N = dump every this many timesteps :l
 file = name of file to write dump info to :l
 args = list of arguments for a particular style :l
@@ -30,33 +30,22 @@ args = list of arguments for a particular style :l
   {cfg} args = same as {custom} args, see below
   {cfg/gz} args = same as {custom} args, see below
   {cfg/mpiio} args = same as {custom} args, see below
+  {custom}, {custom/gz}, {custom/mpiio} args = see below
   {dcd} args = none
+  {h5md} args = discussed on "dump h5md"_dump_h5md.html doc page
+  {image} args = discussed on "dump image"_dump_image.html doc page
+  {local} args = see below
+  {molfile} args = discussed on "dump molfile"_dump_molfile.html doc page
+  {movie} args = discussed on "dump image"_dump_image.html doc page
+  {netcdf} args = discussed on "dump netcdf"_dump_netcdf.html doc page
+  {netcdf/mpiio} args = discussed on "dump netcdf"_dump_netcdf.html doc page
+  {vtk} args = same as {custom} args, see below, also "dump vtk"_dump_vtk.html doc page
   {xtc} args = none
-  {xyz} args = none :pre
-  {xyz/gz} args = none :pre
+  {xyz} args = none
+  {xyz/gz} args = none
   {xyz/mpiio} args = none :pre
 
-  {custom/vtk} args = similar to custom args below, discussed on "dump custom/vtk"_dump_custom_vtk.html doc page :pre
-
-  {h5md} args = discussed on "dump h5md"_dump_h5md.html doc page :pre
-
-  {image} args = discussed on "dump image"_dump_image.html doc page :pre
-
-  {movie} args = discussed on "dump image"_dump_image.html doc page :pre
-
-  {molfile} args = discussed on "dump molfile"_dump_molfile.html doc page
-
-  {nc} args = discussed on "dump nc"_dump_nc.html doc page :pre
-
-  {local} args = list of local attributes
-    possible attributes = index, c_ID, c_ID\[I\], f_ID, f_ID\[I\]
-      index = enumeration of local values
-      c_ID = local vector calculated by a compute with ID
-      c_ID\[I\] = Ith column of local array calculated by a compute with ID, I can include wildcard (see below)
-      f_ID = local vector calculated by a fix with ID
-      f_ID\[I\] = Ith column of local array calculated by a fix with ID, I can include wildcard (see below) :pre
-
-  {custom} or {custom/gz} or {custom/mpiio} args = list of atom attributes
+{custom} or {custom/gz} or {custom/mpiio} args = list of atom attributes :l
     possible attributes = id, mol, proc, procp1, type, element, mass,
                           x, y, z, xs, ys, zs, xu, yu, zu,
                           xsu, ysu, zsu, ix, iy, iz,
@@ -94,6 +83,15 @@ args = list of arguments for a particular style :l
       v_name = per-atom vector calculated by an atom-style variable with name
       d_name = per-atom floating point vector with name, managed by fix property/atom
       i_name = per-atom integer vector with name, managed by fix property/atom :pre
+
+{local} args = list of local attributes :l
+    possible attributes = index, c_ID, c_ID\[I\], f_ID, f_ID\[I\]
+      index = enumeration of local values
+      c_ID = local vector calculated by a compute with ID
+      c_ID\[I\] = Ith column of local array calculated by a compute with ID, I can include wildcard (see below)
+      f_ID = local vector calculated by a fix with ID
+      f_ID\[I\] = Ith column of local array calculated by a fix with ID, I can include wildcard (see below) :pre
+
 :ule
 
 [Examples:]
diff --git a/doc/src/dump_custom_vtk.txt b/doc/src/dump_custom_vtk.txt
deleted file mode 100644
index d4c16193d89f22eb956f0bdcc99b64c6b72ba8b5..0000000000000000000000000000000000000000
--- a/doc/src/dump_custom_vtk.txt
+++ /dev/null
@@ -1,347 +0,0 @@
- "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
-
-:link(lws,http://lammps.sandia.gov)
-:link(ld,Manual.html)
-:link(lc,Section_commands.html#comm)
-
-:line
-
-dump custom/vtk command :h3
-
-[Syntax:]
-
-dump ID group-ID style N file args :pre
-
-ID = user-assigned name for the dump :ulb,l
-group-ID = ID of the group of atoms to be dumped :l
-style = {custom/vtk} :l
-N = dump every this many timesteps :l
-file = name of file to write dump info to :l
-args = list of arguments for a particular style :l
-  {custom/vtk} args = list of atom attributes
-    possible attributes = id, mol, proc, procp1, type, element, mass,
-                          x, y, z, xs, ys, zs, xu, yu, zu,
-                          xsu, ysu, zsu, ix, iy, iz,
-                          vx, vy, vz, fx, fy, fz,
-                          q, mux, muy, muz, mu,
-                          radius, diameter, omegax, omegay, omegaz,
-                          angmomx, angmomy, angmomz, tqx, tqy, tqz,
-                          c_ID, c_ID\[N\], f_ID, f_ID\[N\], v_name :pre
-
-      id = atom ID
-      mol = molecule ID
-      proc = ID of processor that owns atom
-      procp1 = ID+1 of processor that owns atom
-      type = atom type
-      element = name of atom element, as defined by "dump_modify"_dump_modify.html command
-      mass = atom mass
-      x,y,z = unscaled atom coordinates
-      xs,ys,zs = scaled atom coordinates
-      xu,yu,zu = unwrapped atom coordinates
-      xsu,ysu,zsu = scaled unwrapped atom coordinates
-      ix,iy,iz = box image that the atom is in
-      vx,vy,vz = atom velocities
-      fx,fy,fz = forces on atoms
-      q = atom charge
-      mux,muy,muz = orientation of dipole moment of atom
-      mu = magnitude of dipole moment of atom
-      radius,diameter = radius,diameter of spherical particle
-      omegax,omegay,omegaz = angular velocity of spherical particle
-      angmomx,angmomy,angmomz = angular momentum of aspherical particle
-      tqx,tqy,tqz = torque on finite-size particles
-      c_ID = per-atom vector calculated by a compute with ID
-      c_ID\[I\] = Ith column of per-atom array calculated by a compute with ID, I can include wildcard (see below)
-      f_ID = per-atom vector calculated by a fix with ID
-      f_ID\[I\] = Ith column of per-atom array calculated by a fix with ID, I can include wildcard (see below)
-      v_name = per-atom vector calculated by an atom-style variable with name
-      d_name = per-atom floating point vector with name, managed by fix property/atom
-      i_name = per-atom integer vector with name, managed by fix property/atom :pre
-:ule
-
-[Examples:]
-
-dump dmpvtk all custom/vtk 100 dump*.myforce.vtk id type vx fx
-dump dmpvtp flow custom/vtk 100 dump*.%.displace.vtp id type c_myD\[1\] c_myD\[2\] c_myD\[3\] v_ke :pre
-
-The style {custom/vtk} is similar to the "custom"_dump.html style but
-uses the VTK library to write data to VTK simple legacy or XML format
-depending on the filename extension specified. This can be either
-{*.vtk} for the legacy format or {*.vtp} and {*.vtu}, respectively,
-for the XML format; see the "VTK
-homepage"_http://www.vtk.org/VTK/img/file-formats.pdf for a detailed
-description of these formats.  Since this naming convention conflicts
-with the way binary output is usually specified (see below),
-"dump_modify binary"_dump_modify.html allows to set the binary
-flag for this dump style explicitly.
-
-[Description:]
-
-Dump a snapshot of atom quantities to one or more files every N
-timesteps in a format readable by the "VTK visualization
-toolkit"_http://www.vtk.org or other visualization tools that use it,
-e.g. "ParaView"_http://www.paraview.org.  The timesteps on which dump
-output is written can also be controlled by a variable; see the
-"dump_modify every"_dump_modify.html command for details.
-
-Only information for atoms in the specified group is dumped.  The
-"dump_modify thresh and region"_dump_modify.html commands can also
-alter what atoms are included; see details below.
-
-As described below, special characters ("*", "%") in the filename
-determine the kind of output.
-
-IMPORTANT NOTE: Because periodic boundary conditions are enforced only
-on timesteps when neighbor lists are rebuilt, the coordinates of an
-atom written to a dump file may be slightly outside the simulation
-box.
-
-IMPORTANT NOTE: Unless the "dump_modify sort"_dump_modify.html
-option is invoked, the lines of atom information written to dump files
-will be in an indeterminate order for each snapshot.  This is even
-true when running on a single processor, if the "atom_modify
-sort"_atom_modify.html option is on, which it is by default.  In this
-case atoms are re-ordered periodically during a simulation, due to
-spatial sorting.  It is also true when running in parallel, because
-data for a single snapshot is collected from multiple processors, each
-of which owns a subset of the atoms.
-
-For the {custom/vtk} style, sorting is off by default. See the
-"dump_modify"_dump_modify.html doc page for details.
-
-:line
-
-The dimensions of the simulation box are written to a separate file
-for each snapshot (either in legacy VTK or XML format depending on
-the format of the main dump file) with the suffix {_boundingBox}
-appended to the given dump filename.
-
-For an orthogonal simulation box this information is saved as a
-rectilinear grid (legacy .vtk or .vtr XML format).
-
-Triclinic simulation boxes (non-orthogonal) are saved as
-hexahedrons in either legacy .vtk or .vtu XML format.
-
-Style {custom/vtk} allows you to specify a list of atom attributes
-to be written to the dump file for each atom.  Possible attributes
-are listed above.  In contrast to the {custom} style, the attributes
-are rearranged to ensure correct ordering of vector components
-(except for computes and fixes - these have to be given in the right
-order) and duplicate entries are removed.
-
-You cannot specify a quantity that is not defined for a particular
-simulation - such as {q} for atom style {bond}, since that atom style
-doesn't assign charges.  Dumps occur at the very end of a timestep,
-so atom attributes will include effects due to fixes that are applied
-during the timestep.  An explanation of the possible dump custom/vtk attributes
-is given below. Since position data is required to write VTK files "x y z"
-do not have to be specified explicitly.
-
-The VTK format uses a single snapshot of the system per file, thus
-a wildcard "*" must be included in the filename, as discussed below.
-Otherwise the dump files will get overwritten with the new snapshot
-each time.
-
-:line
-
-Dumps are performed on timesteps that are a multiple of N (including
-timestep 0) and on the last timestep of a minimization if the
-minimization converges.  Note that this means a dump will not be
-performed on the initial timestep after the dump command is invoked,
-if the current timestep is not a multiple of N.  This behavior can be
-changed via the "dump_modify first"_dump_modify.html command, which
-can also be useful if the dump command is invoked after a minimization
-ended on an arbitrary timestep.  N can be changed between runs by
-using the "dump_modify every"_dump_modify.html command.
-The "dump_modify every"_dump_modify.html command
-also allows a variable to be used to determine the sequence of
-timesteps on which dump files are written.  In this mode a dump on the
-first timestep of a run will also not be written unless the
-"dump_modify first"_dump_modify.html command is used.
-
-Dump filenames can contain two wildcard characters.  If a "*"
-character appears in the filename, then one file per snapshot is
-written and the "*" character is replaced with the timestep value.
-For example, tmp.dump*.vtk becomes tmp.dump0.vtk, tmp.dump10000.vtk,
-tmp.dump20000.vtk, etc.  Note that the "dump_modify pad"_dump_modify.html
-command can be used to insure all timestep numbers are the same length
-(e.g. 00010), which can make it easier to read a series of dump files
-in order with some post-processing tools.
-
-If a "%" character appears in the filename, then each of P processors
-writes a portion of the dump file, and the "%" character is replaced
-with the processor ID from 0 to P-1 preceded by an underscore character.
-For example, tmp.dump%.vtp becomes tmp.dump_0.vtp, tmp.dump_1.vtp, ...
-tmp.dump_P-1.vtp, etc.  This creates smaller files and can be a fast
-mode of output on parallel machines that support parallel I/O for output.
-
-By default, P = the number of processors meaning one file per
-processor, but P can be set to a smaller value via the {nfile} or
-{fileper} keywords of the "dump_modify"_dump_modify.html command.
-These options can be the most efficient way of writing out dump files
-when running on large numbers of processors.
-
-For the legacy VTK format "%" is ignored and P = 1, i.e., only
-processor 0 does write files.
-
-Note that using the "*" and "%" characters together can produce a
-large number of small dump files!
-
-If {dump_modify binary} is used, the dump file (or files, if "*" or
-"%" is also used) is written in binary format.  A binary dump file
-will be about the same size as a text version, but will typically
-write out much faster.
-
-:line
-
-This section explains the atom attributes that can be specified as
-part of the {custom/vtk} style.
-
-The {id}, {mol}, {proc}, {procp1}, {type}, {element}, {mass}, {vx},
-{vy}, {vz}, {fx}, {fy}, {fz}, {q} attributes are self-explanatory.
-
-{Id} is the atom ID.  {Mol} is the molecule ID, included in the data
-file for molecular systems.  {Proc} is the ID of the processor (0 to
-Nprocs-1) that currently owns the atom.  {Procp1} is the proc ID+1,
-which can be convenient in place of a {type} attribute (1 to Ntypes)
-for coloring atoms in a visualization program.  {Type} is the atom
-type (1 to Ntypes).  {Element} is typically the chemical name of an
-element, which you must assign to each type via the "dump_modify
-element"_dump_modify.html command.  More generally, it can be any
-string you wish to associated with an atom type.  {Mass} is the atom
-mass.  {Vx}, {vy}, {vz}, {fx}, {fy}, {fz}, and {q} are components of
-atom velocity and force and atomic charge.
-
-There are several options for outputting atom coordinates.  The {x},
-{y}, {z} attributes write atom coordinates "unscaled", in the
-appropriate distance "units"_units.html (Angstroms, sigma, etc).  Use
-{xs}, {ys}, {zs} if you want the coordinates "scaled" to the box size,
-so that each value is 0.0 to 1.0.  If the simulation box is triclinic
-(tilted), then all atom coords will still be between 0.0 and 1.0.
-I.e. actual unscaled (x,y,z) = xs*A + ys*B + zs*C, where (A,B,C) are
-the non-orthogonal vectors of the simulation box edges, as discussed
-in "Section 6.12"_Section_howto.html#howto_12.
-
-Use {xu}, {yu}, {zu} if you want the coordinates "unwrapped" by the
-image flags for each atom.  Unwrapped means that if the atom has
-passed thru a periodic boundary one or more times, the value is
-printed for what the coordinate would be if it had not been wrapped
-back into the periodic box.  Note that using {xu}, {yu}, {zu} means
-that the coordinate values may be far outside the box bounds printed
-with the snapshot.  Using {xsu}, {ysu}, {zsu} is similar to using
-{xu}, {yu}, {zu}, except that the unwrapped coordinates are scaled by
-the box size. Atoms that have passed through a periodic boundary will
-have the corresponding coordinate increased or decreased by 1.0.
-
-The image flags can be printed directly using the {ix}, {iy}, {iz}
-attributes.  For periodic dimensions, they specify which image of the
-simulation box the atom is considered to be in.  An image of 0 means
-it is inside the box as defined.  A value of 2 means add 2 box lengths
-to get the true value.  A value of -1 means subtract 1 box length to
-get the true value.  LAMMPS updates these flags as atoms cross
-periodic boundaries during the simulation.
-
-The {mux}, {muy}, {muz} attributes are specific to dipolar systems
-defined with an atom style of {dipole}.  They give the orientation of
-the atom's point dipole moment.  The {mu} attribute gives the
-magnitude of the atom's dipole moment.
-
-The {radius} and {diameter} attributes are specific to spherical
-particles that have a finite size, such as those defined with an atom
-style of {sphere}.
-
-The {omegax}, {omegay}, and {omegaz} attributes are specific to
-finite-size spherical particles that have an angular velocity.  Only
-certain atom styles, such as {sphere} define this quantity.
-
-The {angmomx}, {angmomy}, and {angmomz} attributes are specific to
-finite-size aspherical particles that have an angular momentum.  Only
-the {ellipsoid} atom style defines this quantity.
-
-The {tqx}, {tqy}, {tqz} attributes are for finite-size particles that
-can sustain a rotational torque due to interactions with other
-particles.
-
-The {c_ID} and {c_ID\[I\]} attributes allow per-atom vectors or arrays
-calculated by a "compute"_compute.html to be output.  The ID in the
-attribute should be replaced by the actual ID of the compute that has
-been defined previously in the input script.  See the
-"compute"_compute.html command for details.  There are computes for
-calculating the per-atom energy, stress, centro-symmetry parameter,
-and coordination number of individual atoms.
-
-Note that computes which calculate global or local quantities, as
-opposed to per-atom quantities, cannot be output in a dump custom/vtk
-command.  Instead, global quantities can be output by the
-"thermo_style custom"_thermo_style.html command, and local quantities
-can be output by the dump local command.
-
-If {c_ID} is used as a attribute, then the per-atom vector calculated
-by the compute is printed.  If {c_ID\[I\]} is used, then I must be in
-the range from 1-M, which will print the Ith column of the per-atom
-array with M columns calculated by the compute.  See the discussion
-above for how I can be specified with a wildcard asterisk to
-effectively specify multiple values.
-
-The {f_ID} and {f_ID\[I\]} attributes allow vector or array per-atom
-quantities calculated by a "fix"_fix.html to be output.  The ID in the
-attribute should be replaced by the actual ID of the fix that has been
-defined previously in the input script.  The "fix
-ave/atom"_fix_ave_atom.html command is one that calculates per-atom
-quantities.  Since it can time-average per-atom quantities produced by
-any "compute"_compute.html, "fix"_fix.html, or atom-style
-"variable"_variable.html, this allows those time-averaged results to
-be written to a dump file.
-
-If {f_ID} is used as a attribute, then the per-atom vector calculated
-by the fix is printed.  If {f_ID\[I\]} is used, then I must be in the
-range from 1-M, which will print the Ith column of the per-atom array
-with M columns calculated by the fix.  See the discussion above for
-how I can be specified with a wildcard asterisk to effectively specify
-multiple values.
-
-The {v_name} attribute allows per-atom vectors calculated by a
-"variable"_variable.html to be output.  The name in the attribute
-should be replaced by the actual name of the variable that has been
-defined previously in the input script.  Only an atom-style variable
-can be referenced, since it is the only style that generates per-atom
-values.  Variables of style {atom} can reference individual atom
-attributes, per-atom atom attributes, thermodynamic keywords, or
-invoke other computes, fixes, or variables when they are evaluated, so
-this is a very general means of creating quantities to output to a
-dump file.
-
-The {d_name} and {i_name} attributes allow to output custom per atom
-floating point or integer properties that are managed by
-"fix property/atom"_fix_property_atom.html.
-
-See "Section 10"_Section_modify.html of the manual for information
-on how to add new compute and fix styles to LAMMPS to calculate
-per-atom quantities which could then be output into dump files.
-
-:line
-
-[Restrictions:]
-
-The {custom/vtk} style does not support writing of gzipped dump files.
-
-The {custom/vtk} dump style is part of the USER-VTK package. It is
-only enabled if LAMMPS was built with that package. See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.
-
-To use this dump style, you also must link to the VTK library.  See
-the info in lib/vtk/README and insure the Makefile.lammps file in that
-directory is appropriate for your machine.
-
-The {custom/vtk} dump style neither supports buffering nor custom
-format strings.
-
-[Related commands:]
-
-"dump"_dump.html, "dump image"_dump_image.html,
-"dump_modify"_dump_modify.html, "undump"_undump.html
-
-[Default:]
-
-By default, files are written in ASCII format. If the file extension
-is not one of .vtk, .vtp or .vtu, the legacy VTK file format is used.
-
diff --git a/doc/src/dump_h5md.txt b/doc/src/dump_h5md.txt
index d797e633e63adb884ada2147bfdf2e00ac081a92..93c87d85b7a6064140b6cf320199d3cd99114e10 100644
--- a/doc/src/dump_h5md.txt
+++ b/doc/src/dump_h5md.txt
@@ -17,9 +17,7 @@ group-ID = ID of the group of atoms to be imaged :l
 h5md = style of dump command (other styles {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {local} or {custom} are discussed on the "dump"_dump.html doc page) :l
 N = dump every this many timesteps :l
 file.h5 = name of file to write to :l
-args = list of data elements to dump, with their dump "subintervals".
-At least one element must be given and image may only be present if
-position is specified first. :l
+args = list of data elements to dump, with their dump "subintervals"
   position options
   image
   velocity options
@@ -29,15 +27,17 @@ position is specified first. :l
   box value = {yes} or {no}
   create_group value = {yes} or {no}
   author value = quoted string :pre
+:ule
 
-For the elements {position}, {velocity}, {force} and {species}, one
-may specify a sub-interval to write the data only every N_element
-iterations of the dump (i.e. every N*N_element time steps). This is
-specified by the option
+Note that at least one element must be specified and image may only be
+present if position is specified first.
 
-  every N_element :pre
+For the elements {position}, {velocity}, {force} and {species}, a
+sub-interval may be specified to write the data only every N_element
+iterations of the dump (i.e. every N*N_element time steps). This is
+specified by this option directly following the element declaration:
 
-that follows directly the element declaration.
+every N_element :pre
 
 :ule
 
diff --git a/doc/src/dump_nc.txt b/doc/src/dump_nc.txt
deleted file mode 100644
index 0b81ee6a32bdd884ccc15ee4e6d1624bee72ad13..0000000000000000000000000000000000000000
--- a/doc/src/dump_nc.txt
+++ /dev/null
@@ -1,66 +0,0 @@
-"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
-
-:link(lws,http://lammps.sandia.gov)
-:link(ld,Manual.html)
-:link(lc,Section_commands.html#comm)
-
-:line
-
-dump nc command :h3
-dump nc/mpiio command :h3
-
-[Syntax:]
-
-dump ID group-ID nc N file.nc args
-dump ID group-ID nc/mpiio N file.nc args :pre
-
-ID = user-assigned name for the dump :ulb,l
-group-ID = ID of the group of atoms to be imaged :l
-{nc} or {nc/mpiio}  = style of dump command (other styles {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {local} or {custom} are discussed on the "dump"_dump.html doc page) :l
-N = dump every this many timesteps :l
-file.nc = name of file to write to :l
-args = list of per atom data elements to dump, same as for the 'custom' dump style. :l,ule
-
-[Examples:]
-
-dump 1 all nc 100 traj.nc type x y z vx vy vz
-dump_modify 1 append yes at -1 global c_thermo_pe c_thermo_temp c_thermo_press :pre
-
-dump 1 all nc/mpiio 1000 traj.nc id type x y z :pre
-
-[Description:]
-
-Dump a snapshot of atom coordinates every N timesteps in Amber-style
-NetCDF file format. NetCDF files are binary, portable and
-self-describing.  This dump style will write only one file on the root
-node.  The dump style {nc} uses the "standard NetCDF
-library"_netcdf-home all data is collected on one processor and then
-written to the dump file. Dump style {nc/mpiio} used the "parallel
-NetCDF library"_pnetcdf-home and MPI-IO; it has better performance on
-a larger number of processors. Note that 'nc' outputs all atoms sorted
-by atom tag while 'nc/mpiio' outputs in order of the MPI rank.
-
-In addition to per-atom data, also global (i.e. not per atom, but per
-frame) quantities can be included in the dump file. This can be
-variables, output from computes or fixes data prefixed with v_, c_ and
-f_, respectively.  These properties are included via
-"dump_modify"_dump_modify.html {global}.
-
-:link(netcdf-home,http://www.unidata.ucar.edu/software/netcdf/)
-:link(pnetcdf-home,http://trac.mcs.anl.gov/projects/parallel-netcdf/)
-
-:line
-
-[Restrictions:]
-
-The {nc} and {nc/mpiio} dump styles are part of the USER-NC-DUMP
-package.  It is only enabled if LAMMPS was built with that
-package. See the "Making LAMMPS"_Section_start.html#start_3 section
-for more info.
-
-:line
-
-[Related commands:]
-
-"dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
-
diff --git a/doc/src/dump_netcdf.txt b/doc/src/dump_netcdf.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e82656698a61860a9b376e2eac19e11f05ec1c8
--- /dev/null
+++ b/doc/src/dump_netcdf.txt
@@ -0,0 +1,82 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+dump netcdf command :h3
+dump netcdf/mpiio command :h3
+
+[Syntax:]
+
+dump ID group-ID netcdf N file args
+dump ID group-ID netcdf/mpiio N file args :pre
+
+ID = user-assigned name for the dump :ulb,l
+group-ID = ID of the group of atoms to be dumped :l
+{netcdf} or {netcdf/mpiio}  = style of dump command (other styles {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {local} or {custom} are discussed on the "dump"_dump.html doc page) :l
+N = dump every this many timesteps :l
+file = name of file to write dump info to :l
+args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule
+
+[Examples:]
+
+dump 1 all netcdf 100 traj.nc type x y z vx vy vz
+dump_modify 1 append yes at -1 global c_thermo_pe c_thermo_temp c_thermo_press
+dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
+
+[Description:]
+
+Dump a snapshot of atom coordinates every N timesteps in Amber-style
+NetCDF file format.  NetCDF files are binary, portable and
+self-describing.  This dump style will write only one file on the root
+node.  The dump style {netcdf} uses the "standard NetCDF
+library"_netcdf-home.  All data is collected on one processor and then
+written to the dump file.  Dump style {netcdf/mpiio} uses the
+"parallel NetCDF library"_pnetcdf-home and MPI-IO to write to the dump
+file in parallel; it has better performance on a larger number of
+processors.  Note that style {netcdf} outputs all atoms sorted by atom
+tag while style {netcdf/mpiio} outputs atoms in order of their MPI
+rank.
+
+NetCDF files can be directly visualized via the following tools:
+
+Ovito (http://www.ovito.org/). Ovito supports the AMBER convention and
+all of the above extensions. :ulb,l
+
+VMD (http://www.ks.uiuc.edu/Research/vmd/). :l
+
+AtomEye (http://www.libatoms.org/). The libAtoms version of AtomEye
+contains a NetCDF reader that is not present in the standard
+distribution of AtomEye. :l,ule
+
+In addition to per-atom data, global data can be included in the dump
+file, which are the kinds of values output by the
+"thermo_style"_thermo_style.html command.  See "Section howto
+6.15"_Section_howto.html#howto_15 for an explanation of per-atom
+versus global data.  The global output written into the dump file can
+be from computes, fixes, or variables, by prefixing the compute/fix ID
+or variable name with "c_" or "f_" or "v_" respectively, as in the
+example above.  These global values are specified via the "dump_modify
+global"_dump_modify.html command.
+
+:link(netcdf-home,http://www.unidata.ucar.edu/software/netcdf/)
+:link(pnetcdf-home,http://trac.mcs.anl.gov/projects/parallel-netcdf/)
+
+:line
+
+[Restrictions:]
+
+The {netcdf} and {netcdf/mpiio} dump styles are part of the
+USER-NETCDF package.  They are only enabled if LAMMPS was built with
+that package. See the "Making LAMMPS"_Section_start.html#start_3
+section for more info.
+
+:line
+
+[Related commands:]
+
+"dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
+
diff --git a/doc/src/dump_vtk.txt b/doc/src/dump_vtk.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21502e7f491dd32a774509644ee4b6267f14d259
--- /dev/null
+++ b/doc/src/dump_vtk.txt
@@ -0,0 +1,179 @@
+ "LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+dump vtk command :h3
+
+[Syntax:]
+
+dump ID group-ID vtk N file args :pre
+
+ID = user-assigned name for the dump
+group-ID = ID of the group of atoms to be dumped
+vtk = style of dump command (other styles {atom} or {cfg} or {dcd} or {xtc} or {xyz} or {local} or {custom} are discussed on the "dump"_dump.html doc page)
+N = dump every this many timesteps
+file = name of file to write dump info to 
+args = same as arguments for "dump_style custom"_dump.html :ul
+
+[Examples:]
+
+dump dmpvtk all vtk 100 dump*.myforce.vtk id type vx fx
+dump dmpvtp flow vtk 100 dump*.%.displace.vtp id type c_myD\[1\] c_myD\[2\] c_myD\[3\] v_ke :pre
+
+[Description:]
+
+Dump a snapshot of atom quantities to one or more files every N
+timesteps in a format readable by the "VTK visualization
+toolkit"_http://www.vtk.org or other visualization tools that use it,
+e.g. "ParaView"_http://www.paraview.org.  The timesteps on which dump
+output is written can also be controlled by a variable; see the
+"dump_modify every"_dump_modify.html command for details.
+
+This dump style is similar to "dump_style custom"_dump.html but uses
+the VTK library to write data to VTK simple legacy or XML format
+depending on the filename extension specified for the dump file.  This
+can be either {*.vtk} for the legacy format or {*.vtp} and {*.vtu},
+respectively, for XML format; see the "VTK
+homepage"_http://www.vtk.org/VTK/img/file-formats.pdf for a detailed
+description of these formats.  Since this naming convention conflicts
+with the way binary output is usually specified (see below), the
+"dump_modify binary"_dump_modify.html command allows setting of a
+binary option for this dump style explicitly.
+
+Only information for atoms in the specified group is dumped.  The
+"dump_modify thresh and region"_dump_modify.html commands can also
+alter what atoms are included; see details below.
+
+As described below, special characters ("*", "%") in the filename
+determine the kind of output.
+
+IMPORTANT NOTE: Because periodic boundary conditions are enforced only
+on timesteps when neighbor lists are rebuilt, the coordinates of an
+atom written to a dump file may be slightly outside the simulation
+box.
+
+IMPORTANT NOTE: Unless the "dump_modify sort"_dump_modify.html option
+is invoked, the lines of atom information written to dump files will
+be in an indeterminate order for each snapshot.  This is even true
+when running on a single processor, if the "atom_modify
+sort"_atom_modify.html option is on, which it is by default.  In this
+case atoms are re-ordered periodically during a simulation, due to
+spatial sorting.  It is also true when running in parallel, because
+data for a single snapshot is collected from multiple processors, each
+of which owns a subset of the atoms.
+
+For the {vtk} style, sorting is off by default. See the
+"dump_modify"_dump_modify.html doc page for details.
+
+:line
+
+The dimensions of the simulation box are written to a separate file
+for each snapshot (either in legacy VTK or XML format depending on the
+format of the main dump file) with the suffix {_boundingBox} appended
+to the given dump filename.
+
+For an orthogonal simulation box this information is saved as a
+rectilinear grid (legacy .vtk or .vtr XML format).
+
+Triclinic simulation boxes (non-orthogonal) are saved as
+hexahedrons in either legacy .vtk or .vtu XML format.
+
+Style {vtk} allows you to specify a list of atom attributes to be
+written to the dump file for each atom.  The list of possible attributes 
+is the same as for the "dump_style custom"_dump.html command; see
+its doc page for a listing and an explanation of each attribute.
+
+NOTE: Since position data is required to write VTK files the atom
+attributes "x y z" do not have to be specified explicitly; they will
+be included in the dump file regardless.  Also, in contrast to the
+{custom} style, the specified {vtk} attributes are rearranged to
+ensure correct ordering of vector components (except for computes and
+fixes - these have to be given in the right order) and duplicate
+entries are removed.
+
+The VTK format uses a single snapshot of the system per file, thus
+a wildcard "*" must be included in the filename, as discussed below.
+Otherwise the dump files will get overwritten with the new snapshot
+each time.
+
+:line
+
+Dumps are performed on timesteps that are a multiple of N (including
+timestep 0) and on the last timestep of a minimization if the
+minimization converges.  Note that this means a dump will not be
+performed on the initial timestep after the dump command is invoked,
+if the current timestep is not a multiple of N.  This behavior can be
+changed via the "dump_modify first"_dump_modify.html command, which
+can also be useful if the dump command is invoked after a minimization
+ended on an arbitrary timestep.  N can be changed between runs by
+using the "dump_modify every"_dump_modify.html command.
+The "dump_modify every"_dump_modify.html command
+also allows a variable to be used to determine the sequence of
+timesteps on which dump files are written.  In this mode a dump on the
+first timestep of a run will also not be written unless the
+"dump_modify first"_dump_modify.html command is used.
+
+Dump filenames can contain two wildcard characters.  If a "*"
+character appears in the filename, then one file per snapshot is
+written and the "*" character is replaced with the timestep value.
+For example, tmp.dump*.vtk becomes tmp.dump0.vtk, tmp.dump10000.vtk,
+tmp.dump20000.vtk, etc.  Note that the "dump_modify pad"_dump_modify.html
+command can be used to insure all timestep numbers are the same length
+(e.g. 00010), which can make it easier to read a series of dump files
+in order with some post-processing tools.
+
+If a "%" character appears in the filename, then each of P processors
+writes a portion of the dump file, and the "%" character is replaced
+with the processor ID from 0 to P-1 preceded by an underscore character.
+For example, tmp.dump%.vtp becomes tmp.dump_0.vtp, tmp.dump_1.vtp, ...
+tmp.dump_P-1.vtp, etc.  This creates smaller files and can be a fast
+mode of output on parallel machines that support parallel I/O for output.
+
+By default, P = the number of processors meaning one file per
+processor, but P can be set to a smaller value via the {nfile} or
+{fileper} keywords of the "dump_modify"_dump_modify.html command.
+These options can be the most efficient way of writing out dump files
+when running on large numbers of processors.
+
+For the legacy VTK format "%" is ignored and P = 1, i.e., only
+processor 0 writes files.
+
+Note that using the "*" and "%" characters together can produce a
+large number of small dump files!
+
+If {dump_modify binary} is used, the dump file (or files, if "*" or
+"%" is also used) is written in binary format.  A binary dump file
+will be about the same size as a text version, but will typically
+write out much faster.
+
+:line
+
+[Restrictions:]
+
+The {vtk} style does not support writing of gzipped dump files.
+
+The {vtk} dump style is part of the USER-VTK package. It is
+only enabled if LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+To use this dump style, you also must link to the VTK library.  See
+the info in lib/vtk/README and insure the Makefile.lammps file in that
+directory is appropriate for your machine.
+
+The {vtk} dump style supports neither buffering nor custom format
+strings.
+
+[Related commands:]
+
+"dump"_dump.html, "dump image"_dump_image.html,
+"dump_modify"_dump_modify.html, "undump"_undump.html
+
+[Default:]
+
+By default, files are written in ASCII format. If the file extension
+is not one of .vtk, .vtp or .vtu, the legacy VTK file format is used.
+
diff --git a/doc/src/fix_cmap.txt b/doc/src/fix_cmap.txt
index 5fcac589bef767f74140feb3bcd0a3503aad9f37..2b14a20c1d062b01fab0d267ac04a6e1524b5e6d 100644
--- a/doc/src/fix_cmap.txt
+++ b/doc/src/fix_cmap.txt
@@ -87,8 +87,11 @@ the note below about how to include the CMAP energy when performing an
 
 [Restart, fix_modify, output, run start/stop, minimize info:]
 
-No information about this fix is written to "binary restart
-files"_restart.html.
+This fix writes the list of CMAP crossterms to "binary restart
+files"_restart.html.  See the "read_restart"_read_restart.html command
+for info on how to re-specify a fix in an input script that reads a
+restart file, so that the operation of the fix continues in an
+uninterrupted fashion.
 
 The "fix_modify"_fix_modify.html {energy} option is supported by this
 fix to add the potential "energy" of the CMAP interactions system's
diff --git a/doc/src/fix_gcmc.txt b/doc/src/fix_gcmc.txt
index 53973cdfb816b78bc86e4c16d6bf5d2ff4fa7074..7ac607a2f1e146e1f24e68cf69797402dbda4044 100644
--- a/doc/src/fix_gcmc.txt
+++ b/doc/src/fix_gcmc.txt
@@ -317,7 +317,7 @@ solution is to start a new simulation after the equilibrium density
 has been reached.
 
 With some pair_styles, such as "Buckingham"_pair_buck.html,
-"Born-Mayer-Huggins"_pair_born.html and "ReaxFF"_pair_reax_c.html, two
+"Born-Mayer-Huggins"_pair_born.html and "ReaxFF"_pair_reaxc.html, two
 atoms placed close to each other may have an arbitrary large, negative
 potential energy due to the functional form of the potential.  While
 these unphysical configurations are inaccessible to typical dynamical
diff --git a/doc/src/fix_gle.txt b/doc/src/fix_gle.txt
index ca7625e2d0aa97882d9880d6b2ed1ed82dd238d0..b8d3cc9b34bae16b446ec32901759a5cc1454724 100644
--- a/doc/src/fix_gle.txt
+++ b/doc/src/fix_gle.txt
@@ -67,9 +67,10 @@ target value as the {Tstart} and {Tstop} arguments, so that the diffusion
 matrix that gives canonical sampling for a given A is computed automatically.
 However, the GLE framework also allow for non-equilibrium sampling, that
 can be used for instance to model inexpensively zero-point energy
-effects "(Ceriotti2)"_#Ceriotti2. This is achieved specifying the
-{noneq} keyword followed by the name of the file that contains the
-static covariance matrix for the non-equilibrium dynamics.
+effects "(Ceriotti2)"_#Ceriotti2. This is achieved by specifying the {noneq}
+keyword followed by the name of the file that contains the static covariance
+matrix for the non-equilibrium dynamics.  Please note that the covariance
+matrix is expected to be given in [temperature units].
 
 Since integrating GLE dynamics can be costly when used together with
 simple potentials, one can use the {every} optional keyword to
@@ -148,7 +149,7 @@ dpd/tstat"_pair_dpd.html, "fix gld"_fix_gld.html
 1170-80 (2010)
 
 :link(GLE4MD)
-[(GLE4MD)] "http://epfl-cosmo.github.io/gle4md/"_http://epfl-cosmo.github.io/gle4md/
+[(GLE4MD)] "http://gle4md.org/"_http://gle4md.org/
 
 :link(Ceriotti2)
 [(Ceriotti2)] Ceriotti, Bussi and Parrinello, Phys Rev Lett 103,
diff --git a/doc/src/fix_qeq.txt b/doc/src/fix_qeq.txt
index f9c8ecde63f64dd3c7ef566445b28c3569edfcf9..22f47668965c0958a3c23172bd09b90ededc0c3c 100644
--- a/doc/src/fix_qeq.txt
+++ b/doc/src/fix_qeq.txt
@@ -74,7 +74,7 @@ NOTE: The "fix qeq/comb"_fix_qeq_comb.html command must still be used
 to perform charge equilibration with the "COMB
 potential"_pair_comb.html.  The "fix qeq/reax"_fix_qeq_reax.html
 command can be used to perform charge equilibration with the "ReaxFF
-force field"_pair_reax_c.html, although fix qeq/shielded yields the
+force field"_pair_reaxc.html, although fix qeq/shielded yields the
 same results as fix qeq/reax if {Nevery}, {cutoff}, and {tolerance}
 are the same.  Eventually the fix qeq/reax command will be deprecated.
 
@@ -116,7 +116,7 @@ the shielded Coulomb is given by equation (13) of the "ReaxFF force
 field"_#vanDuin paper.  The shielding accounts for charge overlap
 between charged particles at small separation.  This style is the same
 as "fix qeq/reax"_fix_qeq_reax.html, and can be used with "pair_style
-reax/c"_pair_reax_c.html.  Only the {chi}, {eta}, and {gamma}
+reax/c"_pair_reaxc.html.  Only the {chi}, {eta}, and {gamma}
 parameters from the {qfile} file are used.  This style solves partial
 charges on atoms via the matrix inversion method.  A tolerance of
 1.0e-6 is usually a good number.
diff --git a/doc/src/fix_qeq_reax.txt b/doc/src/fix_qeq_reax.txt
index 76c95e11173f9337e18c000b2591cda86c3e5fd0..aed043f6c0e93382bf377f07df855560e7da82ab 100644
--- a/doc/src/fix_qeq_reax.txt
+++ b/doc/src/fix_qeq_reax.txt
@@ -30,7 +30,7 @@ fix 1 all qeq/reax 1 0.0 10.0 1.0e-6 param.qeq :pre
 Perform the charge equilibration (QEq) method as described in "(Rappe
 and Goddard)"_#Rappe2 and formulated in "(Nakano)"_#Nakano2.  It is
 typically used in conjunction with the ReaxFF force field model as
-implemented in the "pair_style reax/c"_pair_reax_c.html command, but
+implemented in the "pair_style reax/c"_pair_reaxc.html command, but
 it can be used with any potential in LAMMPS, so long as it defines and
 uses charges on each atom.  The "fix qeq/comb"_fix_qeq_comb.html
 command should be used to perform charge equilibration with the "COMB
@@ -42,7 +42,7 @@ The QEq method minimizes the electrostatic energy of the system by
 adjusting the partial charge on individual atoms based on interactions
 with their neighbors.  It requires some parameters for each atom type.
 If the {params} setting above is the word "reax/c", then these are
-extracted from the "pair_style reax/c"_pair_reax_c.html command and
+extracted from the "pair_style reax/c"_pair_reaxc.html command and
 the ReaxFF force field file it reads in.  If a file name is specified
 for {params}, then the parameters are taken from the specified file
 and the file must contain one line for each atom type.  The latter
@@ -106,7 +106,7 @@ be used for periodic cell dimensions less than 10 angstroms.
 
 [Related commands:]
 
-"pair_style reax/c"_pair_reax_c.html
+"pair_style reax/c"_pair_reaxc.html
 
 [Default:] none
 
diff --git a/doc/src/fix_reax_bonds.txt b/doc/src/fix_reax_bonds.txt
index 1fd1b3ca5a18f2db3d1f6054f771ba52f8bdaf3a..d3f108709406fb57eaa263b0290f0cb8dd8a9725 100644
--- a/doc/src/fix_reax_bonds.txt
+++ b/doc/src/fix_reax_bonds.txt
@@ -28,7 +28,7 @@ fix 1 all reax/c/bonds 100 bonds.reaxc :pre
 
 Write out the bond information computed by the ReaxFF potential
 specified by "pair_style reax"_pair_reax.html or "pair_style
-reax/c"_pair_reax_c.html in the exact same format as the original
+reax/c"_pair_reaxc.html in the exact same format as the original
 stand-alone ReaxFF code of Adri van Duin.  The bond information is
 written to {filename} on timesteps that are multiples of {Nevery},
 including timestep 0.  For time-averaged chemical species analysis,
@@ -80,7 +80,7 @@ reax"_pair_reax.html be invoked.  This fix is part of the REAX
 package.  It is only enabled if LAMMPS was built with that package,
 which also requires the REAX library be built and linked with LAMMPS.
 The fix reax/c/bonds command requires that the "pair_style
-reax/c"_pair_reax_c.html be invoked.  This fix is part of the
+reax/c"_pair_reaxc.html be invoked.  This fix is part of the
 USER-REAXC package.  It is only enabled if LAMMPS was built with that
 package.  See the "Making LAMMPS"_Section_start.html#start_3 section
 for more info.
@@ -88,6 +88,6 @@ for more info.
 [Related commands:]
 
 "pair_style reax"_pair_reax.html, "pair_style
-reax/c"_pair_reax_c.html, "fix reax/c/species"_fix_reaxc_species.html
+reax/c"_pair_reaxc.html, "fix reax/c/species"_fix_reaxc_species.html
 
 [Default:] none
diff --git a/doc/src/fix_reaxc_species.txt b/doc/src/fix_reaxc_species.txt
index 00db91900e1dcb01a032413a742184374465f45a..d43a338a66e776f19b6098b439690b87f97946e1 100644
--- a/doc/src/fix_reaxc_species.txt
+++ b/doc/src/fix_reaxc_species.txt
@@ -41,7 +41,7 @@ fix 1 all reax/c/species 1 100 100 species.out element Au O H position 1000 AuOH
 [Description:]
 
 Write out the chemical species information computed by the ReaxFF
-potential specified by "pair_style reax/c"_pair_reax_c.html.
+potential specified by "pair_style reax/c"_pair_reaxc.html.
 Bond-order values (either averaged or instantaneous, depending on
 value of {Nrepeat}) are used to determine chemical bonds.  Every
 {Nfreq} timesteps, chemical species information is written to
@@ -65,7 +65,7 @@ symbol printed for each LAMMPS atom type. The number of symbols must
 match the number of LAMMPS atom types and each symbol must consist of
 1 or 2 alphanumeric characters. Normally, these symbols should be
 chosen to match the chemical identity of each LAMMPS atom type, as
-specified using the "reax/c pair_coeff"_pair_reax_c.html command and
+specified using the "reax/c pair_coeff"_pair_reaxc.html command and
 the ReaxFF force field file.
 
 The optional keyword {position} writes center-of-mass positions of
@@ -158,8 +158,8 @@ more instructions on how to use the accelerated styles effectively.
 [Restrictions:]
 
 The fix species currently only works with
-"pair_style reax/c"_pair_reax_c.html and it requires that the "pair_style
-reax/c"_pair_reax_c.html be invoked.  This fix is part of the
+"pair_style reax/c"_pair_reaxc.html and it requires that the "pair_style
+reax/c"_pair_reaxc.html be invoked.  This fix is part of the
 USER-REAXC package.  It is only enabled if LAMMPS was built with that
 package.  See the "Making LAMMPS"_Section_start.html#start_3 section
 for more info.
@@ -170,7 +170,7 @@ It should be possible to extend it to other reactive pair_styles (such as
 
 [Related commands:]
 
-"pair_style reax/c"_pair_reax_c.html, "fix
+"pair_style reax/c"_pair_reaxc.html, "fix
 reax/bonds"_fix_reax_bonds.html
 
 [Default:]
diff --git a/doc/src/improper_cossq.txt b/doc/src/improper_cossq.txt
index 513f0b3151456e42e1789b501868fcb909c2c1de..e238063a8f07b248a13de71d01a6305b76b93150 100644
--- a/doc/src/improper_cossq.txt
+++ b/doc/src/improper_cossq.txt
@@ -45,12 +45,9 @@ above, or in the data file or restart files read by the
 "read_data"_read_data.html or "read_restart"_read_restart.html
 commands:
 
-K (energy/radian^2)
+K (energy)
 X0 (degrees) :ul
 
-X0 is specified in degrees, but LAMMPS converts it to radians
-internally; hence the units of K are in energy/radian^2.
-
 :line
 
 Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
diff --git a/doc/src/improper_ring.txt b/doc/src/improper_ring.txt
index 705b1cf742a77c749cc97dd8f93c87adc489471f..cba59399e715aac6e1550e5fb2c9ecf86e4344f8 100644
--- a/doc/src/improper_ring.txt
+++ b/doc/src/improper_ring.txt
@@ -49,12 +49,9 @@ above, or in the data file or restart files read by the
 "read_data"_read_data.html or "read_restart"_read_restart.html
 commands:
 
-K (energy/radian^2)
+K (energy)
 theta0 (degrees) :ul
 
-theta0 is specified in degrees, but LAMMPS converts it to radians
-internally; hence the units of K are in energy/radian^2.
-
 :line
 
 Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
diff --git a/doc/src/lammps.book b/doc/src/lammps.book
index 6c68955bc9f8eeb646a88f7dfd617df4119c8146..b2b42aa7e65b53e842189410c95c2fceb1ad7364 100644
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@@ -469,7 +469,7 @@ pair_peri.html
 pair_polymorphic.html
 pair_quip.html
 pair_reax.html
-pair_reax_c.html
+pair_reaxc.html
 pair_resquared.html
 pair_sdk.html
 pair_smd_hertz.html
diff --git a/doc/src/pair_hybrid.txt b/doc/src/pair_hybrid.txt
index 7ef54e7f07fbb40222b450a1275884fffb1bbdd3..5166fe1f849799a242ed251dff9179c99af8dd17 100644
--- a/doc/src/pair_hybrid.txt
+++ b/doc/src/pair_hybrid.txt
@@ -73,7 +73,7 @@ pair_coeff command to assign parameters for the different type pairs.
 NOTE: There are two exceptions to this option to list an individual
 pair style multiple times.  The first is for pair styles implemented
 as Fortran libraries: "pair_style meam"_pair_meam.html and "pair_style
-reax"_pair_reax.html ("pair_style reax/c"_pair_reax_c.html is OK).
+reax"_pair_reax.html ("pair_style reax/c"_pair_reaxc.html is OK).
 This is because unlike a C++ class, they can not be instantiated
 multiple times, due to the manner in which they were coded in Fortran.
 The second is for GPU-enabled pair styles in the GPU package.  This is
@@ -225,6 +225,12 @@ special_bonds lj/coul 1e-20 1e-20 0.5
 pair_hybrid tersoff lj/cut/coul/long 12.0
 pair_modify pair tersoff special lj/coul 1.0 1.0 1.0 :pre
 
+For use with the various "compute */tally"_compute_tally.html
+computes, the "pair_modify compute/tally"_pair_modify.html
+command can be used to selectively turn off processing of
+the compute tally styles for individual sub-styles, for example,
+if those sub-styles (e.g. manybody styles) do not support this feature.
+
 See the "pair_modify"_pair_modify.html doc page for details on
 the specific syntax, requirements and restrictions.
 
diff --git a/doc/src/pair_modify.txt b/doc/src/pair_modify.txt
index 03fb80ae5ee10c16a61841ad57ca4ad99375b337..34dbb5bc3de7acc4f89588d730dc76b407c07d8e 100644
--- a/doc/src/pair_modify.txt
+++ b/doc/src/pair_modify.txt
@@ -15,11 +15,13 @@ pair_modify keyword values ... :pre
 one or more keyword/value pairs may be listed :ulb,l
 keyword = {pair} or {shift} or {mix} or {table} or {table/disp} or {tabinner} or {tabinner/disp} or {tail} or {compute} :l
   {pair} values = sub-style N {special} which wt1 wt2 wt3
+               or sub-style N {compute/tally} flag
     sub-style = sub-style of "pair hybrid"_pair_hybrid.html
     N = which instance of sub-style (only if sub-style is used multiple times)
-    {special} which wt1 wt2 wt3 = override {special_bonds} settings (optional)
-    which = {lj/coul} or {lj} or {coul}
-    w1,w2,w3 = 1-2, 1-3, and 1-4 weights from 0.0 to 1.0 inclusive
+      {special} which wt1 wt2 wt3 = override {special_bonds} settings (optional)
+        which = {lj/coul} or {lj} or {coul}
+        wt1,wt2,wt3 = 1-2, 1-3, and 1-4 weights from 0.0 to 1.0 inclusive
+      {compute/tally} flag = {yes} or {no}
   {mix} value = {geometric} or {arithmetic} or {sixthpower}
   {shift} value = {yes} or {no}
   {table} value = N
@@ -40,6 +42,7 @@ pair_modify shift yes mix geometric
 pair_modify tail yes
 pair_modify table 12
 pair_modify pair lj/cut compute no
+pair_modify pair tersoff compute/tally no
 pair_modify pair lj/cut/coul/long 1 special lj/coul 0.0 0.0 0.0 :pre
 
 [Description:]
@@ -60,9 +63,12 @@ keywords will be applied to.  Note that if the {pair} keyword is not
 used, and the pair style is {hybrid} or {hybrid/overlay}, then all the
 specified keywords will be applied to all sub-styles.
 
-The {special} keyword can only be used in conjunction with the {pair}
-keyword and must directly follow it. It allows to override the
+The {special} and {compute/tally} keywords can [only] be used in
+conjunction with the {pair} keyword and must directly follow it.
+{special} allows overriding the
 "special_bonds"_special_bonds.html settings for the specified sub-style.
+{compute/tally} allows disabling or enabling the registration of
+"compute */tally"_compute_tally.html computes for a given sub-style.
 More details are given below.
 
 The {mix} keyword affects pair coefficients for interactions between
@@ -231,6 +237,14 @@ setting. Substituting 1.0e-10 for 0.0 and 0.9999999999 for 1.0 is
 usually a sufficient workaround in this case without causing a
 significant error.
 
+The {compute/tally} keyword takes exactly 1 argument ({no} or {yes}),
+and allows selectively disabling or enabling processing of the various
+"compute */tally"_compute_tally.html styles for a given
+"pair hybrid or hybrid/overlay"_pair_hybrid.html sub-style.
+
+NOTE: Any "pair_modify pair compute/tally" command must be issued
+[before] the corresponding compute style is defined.
+
 :line
 
 [Restrictions:] none
@@ -240,8 +254,9 @@ conflicting options.  You cannot use {tail} yes with 2d simulations.
 
 [Related commands:]
 
-"pair_style"_pair_style.html, "pair_coeff"_pair_coeff.html,
-"thermo_style"_thermo_style.html
+"pair_style"_pair_style.html, "pair_style hybrid"_pair_hybrid.html,
+"pair_coeff"_pair_coeff.html, "thermo_style"_thermo_style.html,
+"compute */tally"_compute_tally.html
 
 [Default:]
 
diff --git a/doc/src/pair_reax.txt b/doc/src/pair_reax.txt
index 7215c12cee671fa775ed1b8d1ba2d8d66bfd112e..1d13f937061c95963165efc936a680ca14415147 100644
--- a/doc/src/pair_reax.txt
+++ b/doc/src/pair_reax.txt
@@ -36,7 +36,7 @@ supplemental information of the following paper:
 the most up-to-date version of ReaxFF as of summer 2010.
 
 WARNING: pair style reax is now deprecated and will soon be retired. Users
-should switch to "pair_style reax/c"_pair_reax_c.html. The {reax} style
+should switch to "pair_style reax/c"_pair_reaxc.html. The {reax} style
 differs from the {reax/c} style in the lo-level implementation details.
 The {reax} style is a
 Fortran library, linked to LAMMPS.  The {reax/c} style was initially
@@ -82,7 +82,7 @@ be specified.
 
 Two examples using {pair_style reax} are provided in the examples/reax
 sub-directory, along with corresponding examples for
-"pair_style reax/c"_pair_reax_c.html. Note that while the energy and force
+"pair_style reax/c"_pair_reaxc.html. Note that while the energy and force
 calculated by both of these pair styles match very closely, the
 contributions due to the valence angles differ slightly due to
 the fact that with {pair_style reax/c} the default value of {thb_cutoff_sq}
@@ -201,7 +201,7 @@ appropriate units if your simulation doesn't use "real" units.
 
 [Related commands:]
 
-"pair_coeff"_pair_coeff.html, "pair_style reax/c"_pair_reax_c.html,
+"pair_coeff"_pair_coeff.html, "pair_style reax/c"_pair_reaxc.html,
 "fix_reax_bonds"_fix_reax_bonds.html
 
 [Default:]
diff --git a/doc/src/pair_reax_c.txt b/doc/src/pair_reaxc.txt
similarity index 96%
rename from doc/src/pair_reax_c.txt
rename to doc/src/pair_reaxc.txt
index c1d719d22ef0276c119047e861ed08c2627356e0..76a8e6fd5c4a2b27e6352f9c6abb53d838bcfc6b 100644
--- a/doc/src/pair_reax_c.txt
+++ b/doc/src/pair_reaxc.txt
@@ -17,6 +17,7 @@ cfile = NULL or name of a control file :ulb,l
 zero or more keyword/value pairs may be appended :l
 keyword = {checkqeq} or {lgvdw} or {safezone} or {mincap}
   {checkqeq} value = {yes} or {no} = whether or not to require qeq/reax fix
+  {enobonds} value = {yes} or {no} = whether or not to tally energy of atoms with no bonds
   {lgvdw} value = {yes} or {no} = whether or not to use a low gradient vdW correction
   {safezone} = factor used for array allocation
   {mincap} = minimum size for array allocation :pre
@@ -127,6 +128,13 @@ recommended value for parameter {thb} is 0.01, which can be set in the
 control file.  Note: Force field files are different for the original
 or lg corrected pair styles, using wrong ffield file generates an error message.
 
+Using the optional keyword {enobonds} with the value {yes}, the energy
+of atoms with no bonds (i.e. isolated atoms) is included in the total
+potential energy and the per-atom energy of that atom.  If the value
+{no} is specified then the energy of atoms with no bonds is set to zero.
+The latter behavior is usually not desired, as it causes discontinuities
+in the potential energy when the bonding of an atom drops to zero.
+
 Optional keywords {safezone} and {mincap} are used for allocating
 reax/c arrays.  Increasing these values can avoid memory problems, such
 as segmentation faults and bondchk failed errors, that could occur under
@@ -331,7 +339,7 @@ reax"_pair_reax.html
 
 [Default:]
 
-The keyword defaults are checkqeq = yes, lgvdw = no, safezone = 1.2,
+The keyword defaults are checkqeq = yes, enobonds = yes, lgvdw = no, safezone = 1.2,
 mincap = 50.
 
 :line
diff --git a/doc/src/pair_sdk.txt b/doc/src/pair_sdk.txt
index 212760e03d76d15bc5a314df0ca12094463c8780..1c348eaaf701668f030133314d213f8bbde83524 100644
--- a/doc/src/pair_sdk.txt
+++ b/doc/src/pair_sdk.txt
@@ -134,7 +134,7 @@ respa"_run_style.html command.
 
 [Restrictions:]
 
-All of the lj/sdk pair styles are part of the USER-CG-CMM package.
+All of the lj/sdk pair styles are part of the USER-CGSDK package.
 The {lj/sdk/coul/long} style also requires the KSPACE package to be
 built (which is enabled by default).  They are only enabled if LAMMPS
 was built with that package.  See the "Making
diff --git a/doc/src/pair_srp.txt b/doc/src/pair_srp.txt
index 3f54445ba89abcc4a76523976b681b93494b0303..e7f1e00d1028635db3c8cce28097cf62d54fa7d3 100644
--- a/doc/src/pair_srp.txt
+++ b/doc/src/pair_srp.txt
@@ -150,6 +150,8 @@ hybrid"_pair_hybrid.html.
 This pair style requires the "newton"_newton.html command to be {on}
 for non-bonded interactions.
 
+This pair style is not compatible with "rigid body integrators"_fix_rigid.html.
+
 [Related commands:]
 
 "pair_style hybrid"_pair_hybrid.html, "pair_coeff"_pair_coeff.html,
diff --git a/doc/src/pairs.txt b/doc/src/pairs.txt
index 8694747dad5d9a5a40a4ef74f9699d15abd7e4f2..0898906e7ce4c8156c8a0caaae5cb2f225c162b8 100644
--- a/doc/src/pairs.txt
+++ b/doc/src/pairs.txt
@@ -73,7 +73,7 @@ Pair Styles :h1
    pair_polymorphic
    pair_quip
    pair_reax
-   pair_reax_c
+   pair_reaxc
    pair_resquared
    pair_sdk
    pair_smd_hertz
diff --git a/doc/src/python.txt b/doc/src/python.txt
index a5003be54c4f356888196a0e570021b5d49d0a5c..e8a76c0e3e310ede29541e074842e1d28f76c3db 100644
--- a/doc/src/python.txt
+++ b/doc/src/python.txt
@@ -305,7 +305,7 @@ which corresponds to SELF in the python command.  The first line of
 the function imports the Python module lammps.py in the python dir of
 the distribution.  The second line creates a Python object "lmp" which
 wraps the instance of LAMMPS that called the function.  The
-"ptr=lmpptr" argument is what makes that happen.  The thrid line
+"ptr=lmpptr" argument is what makes that happen.  The third line
 invokes the command() function in the LAMMPS library interface.  It
 takes a single string argument which is a LAMMPS input script command
 for LAMMPS to execute, the same as if it appeared in your input
diff --git a/examples/USER/cg-cmm/README b/examples/USER/cgsdk/README
similarity index 95%
rename from examples/USER/cg-cmm/README
rename to examples/USER/cgsdk/README
index 6a283114ba2d40bd345d511a9acca5fc787c55be..5d3a493779b3eccac86347285b92dbf4308d17b9 100644
--- a/examples/USER/cg-cmm/README
+++ b/examples/USER/cgsdk/README
@@ -1,4 +1,4 @@
-LAMMPS USER-CMM-CG example problems
+LAMMPS USER-CGSDK example problems
 
 Each of these sub-directories contains a sample problem for the SDK
 coarse grained MD potentials that you can run with LAMMPS.
diff --git a/examples/USER/cg-cmm/peg-verlet/data.pegc12e8.gz b/examples/USER/cgsdk/peg-verlet/data.pegc12e8.gz
similarity index 100%
rename from examples/USER/cg-cmm/peg-verlet/data.pegc12e8.gz
rename to examples/USER/cgsdk/peg-verlet/data.pegc12e8.gz
diff --git a/examples/USER/cg-cmm/peg-verlet/in.pegc12e8 b/examples/USER/cgsdk/peg-verlet/in.pegc12e8
similarity index 100%
rename from examples/USER/cg-cmm/peg-verlet/in.pegc12e8
rename to examples/USER/cgsdk/peg-verlet/in.pegc12e8
diff --git a/examples/USER/cg-cmm/peg-verlet/in.pegc12e8-angle b/examples/USER/cgsdk/peg-verlet/in.pegc12e8-angle
similarity index 100%
rename from examples/USER/cg-cmm/peg-verlet/in.pegc12e8-angle
rename to examples/USER/cgsdk/peg-verlet/in.pegc12e8-angle
diff --git a/examples/USER/cg-cmm/peg-verlet/log.pegc12e8 b/examples/USER/cgsdk/peg-verlet/log.pegc12e8
similarity index 100%
rename from examples/USER/cg-cmm/peg-verlet/log.pegc12e8
rename to examples/USER/cgsdk/peg-verlet/log.pegc12e8
diff --git a/examples/USER/cg-cmm/peg-verlet/log.pegc12e8-angle b/examples/USER/cgsdk/peg-verlet/log.pegc12e8-angle
similarity index 100%
rename from examples/USER/cg-cmm/peg-verlet/log.pegc12e8-angle
rename to examples/USER/cgsdk/peg-verlet/log.pegc12e8-angle
diff --git a/examples/USER/cg-cmm/sds-monolayer/data.sds.gz b/examples/USER/cgsdk/sds-monolayer/data.sds.gz
similarity index 100%
rename from examples/USER/cg-cmm/sds-monolayer/data.sds.gz
rename to examples/USER/cgsdk/sds-monolayer/data.sds.gz
diff --git a/examples/USER/cg-cmm/sds-monolayer/in.sds-hybrid b/examples/USER/cgsdk/sds-monolayer/in.sds-hybrid
similarity index 100%
rename from examples/USER/cg-cmm/sds-monolayer/in.sds-hybrid
rename to examples/USER/cgsdk/sds-monolayer/in.sds-hybrid
diff --git a/examples/USER/cg-cmm/sds-monolayer/in.sds-regular b/examples/USER/cgsdk/sds-monolayer/in.sds-regular
similarity index 100%
rename from examples/USER/cg-cmm/sds-monolayer/in.sds-regular
rename to examples/USER/cgsdk/sds-monolayer/in.sds-regular
diff --git a/examples/USER/cg-cmm/sds-monolayer/log.sds-hybrid b/examples/USER/cgsdk/sds-monolayer/log.sds-hybrid
similarity index 100%
rename from examples/USER/cg-cmm/sds-monolayer/log.sds-hybrid
rename to examples/USER/cgsdk/sds-monolayer/log.sds-hybrid
diff --git a/examples/USER/cg-cmm/sds-monolayer/log.sds-regular b/examples/USER/cgsdk/sds-monolayer/log.sds-regular
similarity index 100%
rename from examples/USER/cg-cmm/sds-monolayer/log.sds-regular
rename to examples/USER/cgsdk/sds-monolayer/log.sds-regular
diff --git a/examples/USER/flow_gauss/README b/examples/USER/misc/flow_gauss/README
similarity index 100%
rename from examples/USER/flow_gauss/README
rename to examples/USER/misc/flow_gauss/README
diff --git a/examples/USER/flow_gauss/in.GD b/examples/USER/misc/flow_gauss/in.GD
similarity index 100%
rename from examples/USER/flow_gauss/in.GD
rename to examples/USER/misc/flow_gauss/in.GD
diff --git a/examples/cmap/log.11Apr17.cmap.g++.1 b/examples/cmap/log.11Apr17.cmap.g++.1
new file mode 100644
index 0000000000000000000000000000000000000000..9b4fc299915be074b17d491b84f523246ad114e3
--- /dev/null
+++ b/examples/cmap/log.11Apr17.cmap.g++.1
@@ -0,0 +1,205 @@
+LAMMPS (31 Mar 2017)
+# Created by charmm2lammps v1.8.2.6 beta on Thu Mar  3 20:56:57 EST 2016
+
+units           real
+neigh_modify    delay 2 every 1
+#newton          off
+
+boundary        p p p
+
+atom_style      full
+bond_style      harmonic
+angle_style     charmm
+dihedral_style  charmmfsw
+improper_style  harmonic
+
+pair_style      lj/charmmfsw/coul/charmmfsh 8 12
+pair_modify     mix arithmetic
+
+fix             cmap all cmap charmm22.cmap
+Reading potential file charmm22.cmap with DATE: 2016-09-26
+fix_modify      cmap energy yes
+
+read_data       gagg.data fix cmap crossterm CMAP
+  orthogonal box = (-34.4147 -36.1348 -39.3491) to (45.5853 43.8652 40.6509)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  34 atoms
+  scanning bonds ...
+  4 = max bonds/atom
+  scanning angles ...
+  6 = max angles/atom
+  scanning dihedrals ...
+  12 = max dihedrals/atom
+  scanning impropers ...
+  1 = max impropers/atom
+  reading bonds ...
+  33 bonds
+  reading angles ...
+  57 angles
+  reading dihedrals ...
+  75 dihedrals
+  reading impropers ...
+  7 impropers
+  4 = max # of 1-2 neighbors
+  7 = max # of 1-3 neighbors
+  13 = max # of 1-4 neighbors
+  16 = max # of special neighbors
+
+special_bonds   charmm
+fix             1 all nve
+
+#fix             1 all nvt temp 300 300 100.0
+#fix             2 all shake 1e-9 500 0 m 1.0
+
+velocity        all create 0.0 12345678 dist uniform
+
+thermo          1000
+thermo_style    custom step ecoul evdwl ebond eangle edihed f_cmap eimp
+timestep        2.0
+
+run             100000
+Neighbor list info ...
+  update every 1 steps, delay 2 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 12 12 12
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/charmmfsw/coul/charmmfsh, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 14.96 | 14.96 | 14.96 Mbytes
+Step E_coul E_vdwl E_bond E_angle E_dihed f_cmap E_impro 
+       0    16.287573  -0.85933785    1.2470497    4.8441789    4.5432816    -1.473352   0.10453023 
+    1000    18.816462  -0.84379243   0.78931817    2.7554247    4.4371421   -2.7762038   0.12697656 
+    2000    18.091571    -1.045888   0.72306589    3.0951524    4.6725102   -2.3580092   0.22712496 
+    3000    17.835596   -1.2171641   0.72666403    2.6696491    5.4373798   -2.0737041  0.075101693 
+    4000    16.211232  -0.42713611   0.99472642    3.8961462    5.2009895   -2.5626866   0.17356243 
+    5000     17.72183  -0.57081189   0.90733068    3.4376382    4.5457582   -2.3727543   0.12354518 
+    6000    18.753977   -1.5772499   0.81468321    2.9236782    4.6033216   -2.3380859   0.12835782 
+    7000    18.186024  -0.84205608   0.58996182    3.0329585    4.7221473   -2.5733243   0.10047631 
+    8000    18.214306   -1.1360938   0.72597611    3.7493028    4.7319958   -2.8957969    0.2006046 
+    9000    17.248408  -0.48641993   0.90266229    2.9721743    4.7651056   -2.1473354    0.1302043 
+   10000    17.760655   -1.2968444   0.92384663    3.7007455    4.7378947   -2.2147779   0.06940579 
+   11000    17.633929  -0.57368413   0.84872849    3.4277114     4.285393    -2.236944   0.17204973 
+   12000    18.305835   -1.0675148   0.75879532    2.8853173     4.685027    -2.409087  0.087538866 
+   13000    17.391558   -0.9975291   0.66671947    3.8065638    5.2285578   -2.4198822   0.06253594 
+   14000    17.483387  -0.67727643   0.91966477    3.7317031    4.7770445   -2.6080027   0.11487095 
+   15000    18.131749   -1.1918751    1.0025684    3.1238131     4.789742   -2.2546745   0.13782813 
+   16000    16.972343  -0.43926531   0.60644597    3.7551592    4.8658618   -2.2627659   0.12353145 
+   17000    18.080785   -1.2073565    0.7867072    3.5671106      4.43754   -2.5092904   0.17429146 
+   18000    17.474576  -0.97836065    0.8678524    3.7961537    4.3409032   -1.8922572     0.134048 
+   19000    17.000911   -1.2286864   0.83615834    3.9322908    4.9319492   -2.3281576  0.056689619 
+   20000    17.043286   -0.8506561   0.80966589    3.5087339    4.8603878   -2.3365263  0.096794824 
+   21000    17.314495   -1.1430889   0.95363892    4.2446032    4.2756745   -2.1829483   0.17119518 
+   22000    18.954881    -0.998673   0.58688334      2.71536    4.6634319   -2.6862804   0.20328442 
+   23000    17.160427  -0.97803282   0.86894041    4.0897736    4.3146238   -2.1962289  0.075339092 
+   24000    17.602026   -1.0833323   0.94888776    3.7341878    4.3084335   -2.1640414  0.081493681 
+   25000    17.845584   -1.3432612   0.93497086    3.8911043     4.468032   -2.3475883  0.093204333 
+   26000    17.833261   -1.1020534   0.77931087    3.7628141     4.512381   -2.3134761   0.15568465 
+   27000     17.68607   -1.3222026    1.1985872    3.5817624    4.6360755   -2.3492774   0.08427906 
+   28000    18.326649   -1.2669291   0.74809075    3.2624429    4.4698564   -2.3679076   0.14677293 
+   29000    17.720933   -1.0773886   0.83099482    3.7652834    4.6584594   -2.8255303   0.23092596 
+   30000    18.201999   -1.0168706    1.0637455     3.453095    4.3738593   -2.8063214   0.18658217 
+   31000    17.823502   -1.2685768   0.84805585    3.8600661    4.2195821   -2.1169716   0.12517101 
+   32000    16.883133  -0.62062648   0.84434922    3.5042683    5.1264906   -2.2674699  0.030138165 
+   33000    17.805715    -1.679553    1.2430372     4.314677    4.2523894   -2.3008321   0.18591872 
+   34000    16.723767  -0.54189072    1.1282827    3.8542159    4.3026559   -2.2186336   0.05392425 
+   35000    17.976909  -0.72092075    0.5876319    2.9726396    5.0881439    -2.491692   0.17356291 
+   36000    18.782492    -1.514246   0.63237955    3.2777164    4.6077164    -2.502574  0.082537318 
+   37000    17.247716   -0.6344626   0.79885976     3.452491    4.7618281   -2.3902444   0.11450271 
+   38000    17.996494   -1.6712877    1.0111769    4.1689136      4.46963   -2.4076725   0.11875756 
+   39000    17.586857  -0.74508086   0.95970486    3.7395038    4.6011357   -2.9854953   0.30143284 
+   40000    17.494879  -0.30772446   0.72047991    3.2604877    4.7283734   -2.3812495   0.16399034 
+   41000    15.855772  -0.49642605   0.82496448    4.5139653      4.76884    -2.214141   0.10899661 
+   42000    17.898568   -1.3078863    1.1505144    4.0429873    4.3889581   -2.8696559   0.23336417 
+   43000    19.014372   -1.6325979    1.1553166    3.5660772    4.4047997   -2.9302044   0.13672127 
+   44000    18.250782  -0.97211613   0.72714301    3.2258362    4.7257298   -2.5533613   0.11968073 
+   45000    17.335174   0.24746331    1.0415866    3.3220992    4.5251095   -3.0415216   0.24453084 
+   46000     17.72846   -0.9541418   0.88153841    3.7893452    4.5251883   -2.4003613  0.051809816 
+   47000    18.226762  -0.67057787   0.84352989    3.0609522    4.5449078   -2.4694254  0.073703949 
+   48000    17.838074  -0.88768441    1.3812262    3.5890492    4.5827868   -3.0137515   0.21417113 
+   49000    17.973733  -0.75118705   0.69667886    3.3989025    4.7058886   -2.8243945   0.26665792 
+   50000    17.461583  -0.65040016   0.68943524    2.9374743    5.6971777   -2.4438011    0.1697603 
+   51000     16.79766 -0.010684434   0.89795555     3.959039      4.56763   -2.5101098   0.15048853 
+   52000    17.566543   -0.7262764   0.74354418    3.3423185    4.8426523   -2.4187649   0.16908776 
+   53000    17.964274   -0.9270914     1.065952    3.0397181    4.4682262   -2.2179503   0.07873406 
+   54000    17.941256   -0.5807578   0.76516121    3.7262371    4.6975126    -3.179899   0.24433708 
+   55000    17.079478  -0.48559832   0.95364453    3.0414645    5.2811414   -2.7064882   0.30102814 
+   56000    17.632179  -0.75403299   0.97577942    3.3672363    4.4851336   -2.3683659  0.051117638 
+   57000     16.17128  -0.44699325   0.76341543     4.267716    5.0881056   -2.4122329   0.16671692 
+   58000    16.899276  -0.76481024    1.0400825     3.973493    4.8823309   -2.4270284  0.048716383 
+   59000    18.145412  -0.84968335   0.71698306    3.2024358    4.6115739   -2.2520353   0.19466966 
+   60000    17.578258   -1.0067331   0.72822527    3.5375208    4.9110255   -2.2319607   0.11922362 
+   61000    17.434762   -1.0244393   0.90593099    3.8446915    4.8571191   -2.6228357   0.23259208 
+   62000    17.580489   -1.1135917   0.79577432    3.7043524    4.6058114    -2.351492  0.042904152 
+   63000    18.207335   -1.1512268   0.82684507    3.4114738     4.351069   -2.1878441  0.082922105 
+   64000    18.333083   -1.1182287   0.74058959    3.6905164    4.3226172   -2.7110393   0.14721704 
+   65000    16.271579   -0.7122151    1.0200168    4.6983643    4.3681131    -2.194921   0.12831024 
+   66000    17.316444   -0.5729385   0.85254108    3.5769963    4.5526705   -2.3321328  0.040452643 
+   67000     17.19011   -0.8814312    1.1381258    3.8605789    4.4183813    -2.299607  0.091527355 
+   68000    18.223367    -1.362189   0.74472056     3.259165     4.486512   -2.2181134  0.048952796 
+   69000    17.646348  -0.91647162   0.73990335    3.9313692    5.2663097   -3.3816778   0.27769877 
+   70000    18.173493   -1.3107718   0.96484426     3.219728    4.5045124   -2.3349534  0.082327407 
+   71000      17.0627  -0.58509083   0.85964129    3.8490884     4.437895   -2.1673348   0.24151404 
+   72000    17.809764  -0.35128902   0.65479258    3.3945008    4.6160508   -2.5486166   0.10829531 
+   73000     18.27769   -1.0739758   0.80890957    3.6070901    4.6256762   -2.4576547  0.080025736 
+   74000    18.109437   -1.0691837   0.66679323    3.5923203    4.4825716   -2.5048169   0.21372319 
+   75000    17.914569   -1.3500765    1.2993494     3.362421    4.4160377   -2.1278163   0.19397641 
+   76000    16.563928  -0.16539261    1.0067302    3.5742755    4.8581915   -2.1362429  0.059822408 
+   77000    18.130477  -0.38361279   0.43406954    3.4725995    4.7005855   -2.8836242   0.11958174 
+   78000    16.746204   -1.1732959    0.7455507    3.6296638    5.6344113    -2.459208   0.16099803 
+   79000    18.243999   -1.5850155    1.0108545    3.4727867    4.3367411    -2.316686  0.070480814 
+   80000    16.960715  -0.84100929   0.91604996     3.862215     4.780949   -2.3711596  0.073916605 
+   81000    17.697722   -1.1126605     0.952804    3.7114455    4.4216316   -2.2770085  0.091372066 
+   82000    17.835901   -1.3091474   0.71867629    3.8168122    5.0150205   -2.4730634  0.062592852 
+   83000    19.168418    -1.476938   0.75592316    3.2304519    4.3946471   -2.2991395   0.13083324 
+   84000    17.945778   -1.5223622    1.0859941    3.4334011    5.0286682   -2.7550892    0.2476269 
+   85000    17.950251  -0.85843846   0.86888218    3.3101287    4.5511879   -2.3640013   0.12080834 
+   86000    17.480699  -0.97493649   0.85049761    3.4973085    4.6344922    -2.343121    0.2009677 
+   87000    17.980244    -1.114983   0.88796989    3.4113329    4.3535853   -2.2535412   0.14494917 
+   88000    18.023866    -1.226683   0.62339706    3.7649269    4.5923973   -2.3923523   0.10464375 
+   89000    16.362829    -0.311462    1.0265375    4.0101723    4.4184777   -2.0314129  0.056570704 
+   90000    17.533149  -0.41526788    1.0362029    3.4247412    4.2734431   -2.4776658   0.16960663 
+   91000    17.719099   -1.1956801    1.0069945    3.2380672    4.8982805   -2.2154906   0.12950936 
+   92000    17.762654    -1.170027   0.95814525    3.5217717    4.5405343   -2.5983677   0.15037754 
+   93000    17.393958  -0.45641026    0.6579069    3.6002204    4.5942053   -2.5559641   0.12026544 
+   94000      16.8182  -0.92962066   0.86801362    4.2914398     4.659848   -2.5251987   0.18000415 
+   95000    17.642086   -0.7994896    0.7003756    3.8036697    4.5252487   -2.4166307   0.15686517 
+   96000    18.114292   -1.5102104    1.2635908    3.2764427    5.0659496   -2.2777806  0.054309645 
+   97000    18.575765   -1.6015311   0.69500699    3.1649317    4.9945742   -2.4012125  0.067373724 
+   98000    16.578893  -0.78030229   0.91524222    4.4429655    4.4622392   -2.4052655   0.15355705 
+   99000     17.26063  -0.57832833    0.7098846    3.9000046    4.5576484   -2.5333026   0.25517222 
+  100000    18.377235  -0.89109577   0.68988617    2.8751751    4.4115591   -2.3560731   0.12185212 
+Loop time of 2.96043 on 1 procs for 100000 steps with 34 atoms
+
+Performance: 5836.990 ns/day, 0.004 hours/ns, 33778.875 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 1.074      | 1.074      | 1.074      |   0.0 | 36.28
+Bond    | 1.6497     | 1.6497     | 1.6497     |   0.0 | 55.72
+Neigh   | 0.007576   | 0.007576   | 0.007576   |   0.0 |  0.26
+Comm    | 0.012847   | 0.012847   | 0.012847   |   0.0 |  0.43
+Output  | 0.0010746  | 0.0010746  | 0.0010746  |   0.0 |  0.04
+Modify  | 0.16485    | 0.16485    | 0.16485    |   0.0 |  5.57
+Other   |            | 0.05037    |            |       |  1.70
+
+Nlocal:    34 ave 34 max 34 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    0 ave 0 max 0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    395 ave 395 max 395 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 395
+Ave neighs/atom = 11.6176
+Ave special neighs/atom = 9.52941
+Neighbor list builds = 253
+Dangerous builds = 0
+Total wall time: 0:00:02
diff --git a/examples/cmap/log.11Apr17.cmap.g++.4 b/examples/cmap/log.11Apr17.cmap.g++.4
new file mode 100644
index 0000000000000000000000000000000000000000..ec471d5a7e683d379713e0862ddc7074c8d48915
--- /dev/null
+++ b/examples/cmap/log.11Apr17.cmap.g++.4
@@ -0,0 +1,205 @@
+LAMMPS (31 Mar 2017)
+# Created by charmm2lammps v1.8.2.6 beta on Thu Mar  3 20:56:57 EST 2016
+
+units           real
+neigh_modify    delay 2 every 1
+#newton          off
+
+boundary        p p p
+
+atom_style      full
+bond_style      harmonic
+angle_style     charmm
+dihedral_style  charmmfsw
+improper_style  harmonic
+
+pair_style      lj/charmmfsw/coul/charmmfsh 8 12
+pair_modify     mix arithmetic
+
+fix             cmap all cmap charmm22.cmap
+Reading potential file charmm22.cmap with DATE: 2016-09-26
+fix_modify      cmap energy yes
+
+read_data       gagg.data fix cmap crossterm CMAP
+  orthogonal box = (-34.4147 -36.1348 -39.3491) to (45.5853 43.8652 40.6509)
+  1 by 2 by 2 MPI processor grid
+  reading atoms ...
+  34 atoms
+  scanning bonds ...
+  4 = max bonds/atom
+  scanning angles ...
+  6 = max angles/atom
+  scanning dihedrals ...
+  12 = max dihedrals/atom
+  scanning impropers ...
+  1 = max impropers/atom
+  reading bonds ...
+  33 bonds
+  reading angles ...
+  57 angles
+  reading dihedrals ...
+  75 dihedrals
+  reading impropers ...
+  7 impropers
+  4 = max # of 1-2 neighbors
+  7 = max # of 1-3 neighbors
+  13 = max # of 1-4 neighbors
+  16 = max # of special neighbors
+
+special_bonds   charmm
+fix             1 all nve
+
+#fix             1 all nvt temp 300 300 100.0
+#fix             2 all shake 1e-9 500 0 m 1.0
+
+velocity        all create 0.0 12345678 dist uniform
+
+thermo          1000
+thermo_style    custom step ecoul evdwl ebond eangle edihed f_cmap eimp
+timestep        2.0
+
+run             100000
+Neighbor list info ...
+  update every 1 steps, delay 2 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 14
+  ghost atom cutoff = 14
+  binsize = 7, bins = 12 12 12
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/charmmfsw/coul/charmmfsh, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 14.94 | 15.57 | 16.2 Mbytes
+Step E_coul E_vdwl E_bond E_angle E_dihed f_cmap E_impro 
+       0    16.287573  -0.85933785    1.2470497    4.8441789    4.5432816    -1.473352   0.10453023 
+    1000    18.816462  -0.84379243   0.78931817    2.7554247    4.4371421   -2.7762038   0.12697656 
+    2000    18.091571    -1.045888   0.72306589    3.0951524    4.6725102   -2.3580092   0.22712496 
+    3000    17.835596   -1.2171641   0.72666403    2.6696491    5.4373798   -2.0737041  0.075101693 
+    4000    16.211232  -0.42713611   0.99472642    3.8961462    5.2009895   -2.5626866   0.17356243 
+    5000     17.72183  -0.57081189   0.90733068    3.4376382    4.5457582   -2.3727543   0.12354518 
+    6000    18.753977   -1.5772499   0.81468321    2.9236782    4.6033216   -2.3380859   0.12835782 
+    7000    18.186024  -0.84205609   0.58996181    3.0329584    4.7221473   -2.5733244   0.10047631 
+    8000    18.214306   -1.1360934   0.72597583    3.7493032    4.7319959   -2.8957975   0.20060467 
+    9000    17.248415  -0.48642024   0.90266262    2.9721744    4.7651003   -2.1473349   0.13020438 
+   10000    17.760663   -1.2968458   0.92384687    3.7007432    4.7378917   -2.2147799   0.06940514 
+   11000     17.63395  -0.57366075   0.84871737    3.4276851    4.2853865   -2.2369491   0.17205075 
+   12000    18.305713   -1.0672299   0.75876262    2.8852171    4.6850229   -2.4090072  0.087568888 
+   13000    17.383367  -0.99678627   0.66712651    3.8060954     5.233865   -2.4180629  0.062014239 
+   14000    17.510901  -0.68723297   0.92448551    3.7550867    4.7321218   -2.6059088   0.11504409 
+   15000    18.080165     -1.13316   0.99982253      3.09947    4.8171402   -2.2713372   0.14580371 
+   16000    17.383245   -0.4535296   0.57826268    3.6453593    4.6541138   -2.2434512   0.13285609 
+   17000    17.111153   -0.3414839   0.73667584    3.7485311    4.6262965   -2.6166049   0.12635815 
+   18000    16.862046   -1.3592061    1.2371142    4.4878937    4.2937117   -2.2112584  0.066145125 
+   19000    18.313891    -1.654238   0.90644101    3.3934089     4.550735   -2.1862171  0.081267736 
+   20000    19.083561   -1.3081747   0.56257812    2.7633848    4.6211438   -2.5196707   0.13763071 
+   21000     18.23741    -1.051353   0.64408722    3.1735565    4.6912533   -2.2491947  0.099394904 
+   22000    17.914515  -0.89769621   0.61793801    3.1224992    4.8683543    -2.282475   0.14524537 
+   23000    16.756122  -0.98277883    1.2554905    3.7916115    4.7301443   -2.3094994   0.10226772 
+   24000    16.109857  -0.54593177   0.86934462    4.4293574     4.926985   -2.2652264   0.11414331 
+   25000    18.590559    -1.497327    1.1898361    2.9134403    4.7854107   -2.4437918  0.067416154 
+   26000    18.493391   -1.0533797    0.4889578    3.6563013    4.6171721   -2.3240835   0.11607829 
+   27000    18.646522   -1.1229601   0.67956815    2.7937638    4.8991207   -2.4068997   0.10109147 
+   28000    18.545103   -1.7237438   0.72488022    3.8041665    4.6459974   -2.4339333   0.21943258 
+   29000    17.840505   -1.0909667   0.88133248    3.3698456    5.0311644   -2.5116617   0.08102693 
+   30000    17.649527  -0.65409177   0.86781692      3.24112    4.9903073   -2.6234925   0.14799777 
+   31000    18.156812  -0.77476556   0.83192789    2.9620784    4.9160635   -2.8571635   0.22283201 
+   32000    18.251583   -1.3384075    0.8059007    3.2588176    4.4365328   -2.1875071  0.087883637 
+   33000    17.702785  -0.88311587   0.98573641    3.4645713    4.2650091   -2.0909158   0.14233004 
+   34000    17.123413   -1.4873429    1.0419563    4.2628178    4.6318762   -2.2292095     0.105354 
+   35000    18.162061   -1.0136007   0.82436129    3.6365024    4.5801677   -2.6856989   0.28648222 
+   36000     17.65618    -1.094718    0.8872444    3.5075241    4.6382423   -2.3895134   0.18116961 
+   37000    17.336475   -1.0657995   0.98869254    3.9252927    4.4383632   -2.2048244   0.22285949 
+   38000    17.369467  -0.97623132    0.6712095    4.1349304     4.597754   -2.4088341   0.14608514 
+   39000    18.170206   -1.2344285   0.77546195    3.6451049    4.7482287   -2.9895286   0.25768859 
+   40000    16.210866  -0.81407781   0.99246271    4.2676233    5.0253763   -2.2929865   0.13348624 
+   41000    17.641798   -1.0868157   0.80119513    3.4302526     5.280872   -2.4025406   0.22747391 
+   42000    18.349848    -1.613759    1.1497004    3.7800682    4.3237683   -2.8676401    0.2120425 
+   43000    19.130245    -1.196778   0.71845659    2.9325758    4.3684415    -2.433424   0.12240982 
+   44000    18.061321   -1.2410101    1.0329373    3.0751569    4.7138313   -2.2880904  0.075814461 
+   45000    18.162713   -1.4414622     1.009159    4.2298758     4.589593   -2.8502298   0.21606844 
+   46000    18.591574  -0.99730412    1.0955215    3.3965004     4.359466   -3.1049731   0.17322629 
+   47000    18.380259   -1.2717381   0.72291269    3.3958016    4.6099628   -2.4605065   0.19825185 
+   48000    18.130478   -1.5051279    1.2087492    3.2488529    4.6690881   -2.2518174   0.05633061 
+   49000    16.419912  -0.89320635   0.98926144    4.0388252    4.9919488   -2.1699511   0.15646479 
+   50000    16.453196   -1.0433497     0.778346    4.6078069    4.7320614   -2.3760788   0.17161976 
+   51000    18.245221  -0.89550444    0.9310446    3.0758194    4.3944595   -2.3082379   0.19983428 
+   52000    17.839632   -1.0221781   0.76425017    3.3331547    4.5368437   -2.0988773   0.21098435 
+   53000    18.693035   -1.4231915   0.76333082    3.1612761     4.583242   -2.4485762  0.089191206 
+   54000    16.334672  -0.36309884    1.0200365    4.6700448    4.1628702   -2.1713841   0.11431995 
+   55000     17.33842  -0.61522682   0.89847366    3.4970659     4.673495   -2.4743036  0.068004878 
+   56000    17.790294   -1.0150845   0.73697112    3.6000297    4.5988343   -2.4822509   0.11434632 
+   57000    18.913486   -1.0985507    1.0231848    2.7483267    4.4421755    -2.574424    0.1763388 
+   58000    17.586896  -0.98284126   0.96965633    3.3330357    4.5325543   -2.1936869  0.083230915 
+   59000     17.77788   -1.1649953   0.83092298    3.8004148    4.3940176   -2.3136642  0.017207608 
+   60000    17.013042  -0.21728023    1.1688832    3.5374476    4.5462244   -2.4425301   0.15028297 
+   61000    17.236242   -1.1342147    1.0301086     3.685948    4.6842331    -2.328108  0.070210812 
+   62000    17.529852   -1.2961547    1.0323133    3.4474598    5.1435839   -2.4553423  0.060842687 
+   63000    18.754704   -1.1816999   0.51806039     3.140172    4.5832701   -2.2713213   0.06327871 
+   64000     17.54594   -1.3592836    0.9694558    4.1363258    4.3547729   -2.3818433   0.12634448 
+   65000    16.962312  -0.54192775   0.90321315    4.0788618    4.2008255   -2.1376711  0.039504515 
+   66000    18.078619   -1.3552947    1.0716861    3.3285374    4.7229362   -2.3331115   0.21978698 
+   67000    17.132732   -1.4376876   0.91486534    4.4461852    4.6894176   -2.3655045  0.068150385 
+   68000     18.69286   -1.2856207    0.3895394    3.0620063    4.9922992   -2.3459189  0.079879643 
+   69000    18.329552   -1.1545957   0.88632275    3.1741058    4.4562418   -2.7094867   0.25329613 
+   70000    16.681168  -0.94434373    1.2450393    4.5737944    4.4902996   -2.4581775   0.15313095 
+   71000    17.375032   -1.0514442    1.0741595    3.4896146    4.8407713   -2.5302576   0.13640847 
+   72000    17.833013   -0.9047134   0.87067876    3.1658924    4.8825932   -2.4398117    0.2343991 
+   73000    17.421411   -1.2190741   0.73706811       4.2895    4.6464636   -2.3872727   0.19696525 
+   74000    17.383158  -0.34208984   0.71333984    3.2718891    4.2718495   -2.2484281   0.10827022 
+   75000     17.20885   -1.2710479     1.125102    3.8414467    5.3222741    -2.375505   0.12910797 
+   76000    16.811578    -0.545162   0.59076961    3.9118604    4.8031296   -2.2777895  0.063015508 
+   77000    16.679231 -0.080955983    0.7253398    3.4203454    5.0987608    -2.379614   0.12961874 
+   78000    18.164524   -1.3115525   0.92526408    3.5764487    4.3814882   -2.3712488  0.073436724 
+   79000    17.738686   -1.0697859    1.2186866    3.0593848    4.6551053   -2.2505871  0.075340661 
+   80000    16.767483  -0.84777477      1.03128    4.1982958    4.6992227   -2.4146425  0.079774219 
+   81000    16.257265   0.62803774   0.84032194    3.3873471    5.0961071   -2.7219776   0.20467848 
+   82000    18.232082   -1.2129302   0.50746051    3.9207128    4.5073437    -2.599371  0.094522372 
+   83000    16.618985  -0.60917055    0.8825847     3.805497    4.9560959   -2.2194726   0.14852687 
+   84000     17.90762  -0.82336075   0.90504161    3.0324198    4.7444271   -2.5036073   0.15860682 
+   85000    16.699883  -0.50297228   0.83405307    3.8598996    4.7971968   -2.2427788   0.10338668 
+   86000    16.353038 -0.0096880616   0.80705167    4.0865115    4.5364338   -2.4548873  0.098456203 
+   87000    17.887331  -0.75281219    1.0030148    4.0117123    4.3443074   -2.9774392   0.16190152 
+   88000    18.583708   -1.4867053   0.86324814    3.3971237    4.3526221    -2.221239   0.14459352 
+   89000    17.684828    -1.283764    1.0021118    3.5426808    4.9057005   -2.3921967   0.05844702 
+   90000      17.2597  -0.84306489   0.99797936    3.8896866    4.4315457   -2.5662899   0.18270206 
+   91000    16.705581  -0.44704047   0.75239556     3.470805     4.976868   -2.1894571   0.12312848 
+   92000    17.548071   -1.2222664   0.92898812    4.0813773    4.3432647   -2.1631158   0.14071343 
+   93000    17.163675  -0.94994776   0.96876981    3.9137692    4.4388666   -2.1260232   0.13187968 
+   94000    18.842071   -1.2822113   0.58767049    3.1393475    4.5820965   -2.7264682   0.10406266 
+   95000    18.112287   -1.1011381   0.63546648    3.4672667     4.486275   -2.2991936  0.041589685 
+   96000    17.102713   -0.6877313    0.8389032    3.6892719    4.5676004   -2.1905327   0.13507011 
+   97000    16.778253   -1.2902153    1.1588744    4.2820083    4.9537657   -2.4798159   0.35696636 
+   98000     18.34638   -1.2908146     1.185356    3.0739807    4.4575453   -2.3959144   0.22407922 
+   99000    17.995148   -1.3939639    0.7727299    3.8774144    4.4345458   -2.1142776   0.13550099 
+  100000    18.444746   -1.2456693   0.86061526     3.468696    4.5264336   -2.4239851  0.074369539 
+Loop time of 2.52011 on 4 procs for 100000 steps with 34 atoms
+
+Performance: 6856.851 ns/day, 0.004 hours/ns, 39680.850 timesteps/s
+98.8% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.072506   | 0.28131    | 0.69088    |  46.2 | 11.16
+Bond    | 0.050544   | 0.45307    | 0.9416     |  57.6 | 17.98
+Neigh   | 0.0060885  | 0.0061619  | 0.0062056  |   0.1 |  0.24
+Comm    | 0.44686    | 1.3679     | 2.0111     |  53.5 | 54.28
+Output  | 0.0028057  | 0.0029956  | 0.003264   |   0.3 |  0.12
+Modify  | 0.028202   | 0.095174   | 0.15782    |  19.8 |  3.78
+Other   |            | 0.3135     |            |       | 12.44
+
+Nlocal:    8.5 ave 14 max 2 min
+Histogram: 1 0 1 0 0 0 0 0 0 2
+Nghost:    25.5 ave 32 max 20 min
+Histogram: 2 0 0 0 0 0 0 1 0 1
+Neighs:    98.75 ave 242 max 31 min
+Histogram: 2 0 1 0 0 0 0 0 0 1
+
+Total # of neighbors = 395
+Ave neighs/atom = 11.6176
+Ave special neighs/atom = 9.52941
+Neighbor list builds = 246
+Dangerous builds = 0
+Total wall time: 0:00:02
diff --git a/examples/mscg/log.31Mar17.g++.1 b/examples/mscg/log.31Mar17.g++.1
new file mode 100644
index 0000000000000000000000000000000000000000..c67bc483db3976b52b18c5f04fd129387d1cab11
--- /dev/null
+++ b/examples/mscg/log.31Mar17.g++.1
@@ -0,0 +1,145 @@
+LAMMPS (13 Apr 2017)
+units real
+atom_style full
+pair_style zero 10.0
+
+read_data data.meoh
+  orthogonal box = (-20.6917 -20.6917 -20.6917) to (20.6917 20.6917 20.6917)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  1000 atoms
+  0 = max # of 1-2 neighbors
+  0 = max # of 1-3 neighbors
+  0 = max # of 1-4 neighbors
+  1 = max # of special neighbors
+pair_coeff * *
+
+thermo 1
+thermo_style custom step
+
+# Test 1a: range finder functionality
+fix 1 all mscg 1 range on
+rerun dump.meoh first 0 last 4500 every 250 dump x y z fx fy fz
+Neighbor list info ...
+  update every 1 steps, delay 10 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 12
+  ghost atom cutoff = 12
+  binsize = 6, bins = 7 7 7
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.794 | 5.794 | 5.794 Mbytes
+Step 
+       0 
+     250 
+     500 
+     750 
+    1000 
+    1250 
+    1500 
+    1750 
+    2000 
+    2250 
+    2500 
+    2750 
+    3000 
+    3250 
+    3500 
+    3750 
+    4000 
+    4250 
+    4500 
+Loop time of 0.581537 on 1 procs for 19 steps with 1000 atoms
+
+Performance: 2.823 ns/day, 8.502 hours/ns, 32.672 timesteps/s
+99.2% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.5815     |            |       |100.00
+
+Nlocal:    1000 ave 1000 max 1000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    2934 ave 2934 max 2934 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    50654 ave 50654 max 50654 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 50654
+Ave neighs/atom = 50.654
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+print "TEST_1a mscg range finder"
+TEST_1a mscg range finder
+unfix 1
+
+# Test 1b: force matching functionality
+fix 1 all mscg 1
+rerun dump.meoh first 0 last 4500 every 250 dump x y z fx fy fz
+Per MPI rank memory allocation (min/avg/max) = 5.794 | 5.794 | 5.794 Mbytes
+Step 
+       0 
+     250 
+     500 
+     750 
+    1000 
+    1250 
+    1500 
+    1750 
+    2000 
+    2250 
+    2500 
+    2750 
+    3000 
+    3250 
+    3500 
+    3750 
+    4000 
+    4250 
+    4500 
+Loop time of 0.841917 on 1 procs for 19 steps with 1000 atoms
+
+Performance: 1.950 ns/day, 12.309 hours/ns, 22.568 timesteps/s
+99.8% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0          | 0          | 0          |   0.0 |  0.00
+Bond    | 0          | 0          | 0          |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0          | 0          | 0          |   0.0 |  0.00
+Output  | 0          | 0          | 0          |   0.0 |  0.00
+Modify  | 0          | 0          | 0          |   0.0 |  0.00
+Other   |            | 0.8419     |            |       |100.00
+
+Nlocal:    1000 ave 1000 max 1000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    2934 ave 2934 max 2934 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    50654 ave 50654 max 50654 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 50654
+Ave neighs/atom = 50.654
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+print "TEST_1b mscg force matching"
+TEST_1b mscg force matching
+
+print TEST_DONE
+TEST_DONE
+Total wall time: 0:00:01
diff --git a/lib/Install.py b/lib/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Install.py tool to do a generic build of a library
+# soft linked to by many of the lib/*/Install.py files
+# used to automate the steps described in the corresponding lib/*/README
+
+import sys,commands,os
+
+# help message, shown when no arguments or invalid arguments are given
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = perform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print help (exit status 0) or an error message (exit status 1)
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit(1 if str else 0)   # non-zero status lets make/shell callers detect failure
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/README b/lib/README
index 72ebb0a5f7543c1c3ec3ead1c601e6b9aec69262..3c8f46dd0a957162660045fc76caeafceb03adec 100644
--- a/lib/README
+++ b/lib/README
@@ -33,14 +33,16 @@ kokkos        Kokkos package for GPU and many-core acceleration
                 from Kokkos development team (Sandia)
 linalg        set of BLAS and LAPACK routines needed by USER-ATC package
 	        from Axel Kohlmeyer (Temple U)
-poems	      POEMS rigid-body integration package, POEMS package
-                from Rudranarayan Mukherjee (RPI)
 meam	      modified embedded atom method (MEAM) potential, MEAM package
                 from Greg Wagner (Sandia)
 molfile       hooks to VMD molfile plugins, used by the USER-MOLFILE package
                 from Axel Kohlmeyer (Temple U) and the VMD development team
 mscg          hooks to the MSCG library, used by fix_mscg command
                 from Jacob Wagner and Greg Voth group (U Chicago)
+netcdf        hooks to a NetCDF library installed on your system
+                from Lars Pastewka (Karlsruhe Institute of Technology)
+poems	      POEMS rigid-body integration package, POEMS package
+                from Rudranarayan Mukherjee (RPI)
 python        hooks to the system Python library, used by the PYTHON package
                 from the LAMMPS development team
 qmmm	      quantum mechanics/molecular mechanics coupling interface
diff --git a/lib/atc/Install.py b/lib/atc/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/atc/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Install.py tool to do a generic build of a library
+# soft linked to by many of the lib/*/Install.py files
+# used to automate the steps described in the corresponding lib/*/README
+
+import sys,commands,os
+
+# help message, shown when no arguments or invalid arguments are given
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = perform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print help (exit status 0) or an error message (exit status 1)
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit(1 if str else 0)   # non-zero status lets make/shell callers detect failure
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/atc/README b/lib/atc/README
index 106c303dd14fc8a2a3885de4688b3d390a4a1d17..d3adfdafe4493c8c443ad78d953d9fc2a2286299 100644
--- a/lib/atc/README
+++ b/lib/atc/README
@@ -15,6 +15,11 @@ links against when using the USER-ATC package.
 This library must be built with a C++ compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-atc" from the src directory to see help on how
+to build this library via make commands, or you can do the same thing
+by typing "python Install.py" from within this directory, or you can
+do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
@@ -44,16 +49,16 @@ user-atc_SYSINC = leave blank for this package
 user-atc_SYSLIB = BLAS and LAPACK libraries needed by this package
 user-atc_SYSPATH = path(s) to where those libraries are
 
-You have several choices for these settings:
+You have 3 choices for these settings:
 
-If the 2 libraries are already installed on your system, the settings
-in Makefile.lammps.installed should work.
+a) If the 2 libraries are already installed on your system, the
+settings in Makefile.lammps.installed should work.
 
-If they are not, you can install them yourself, and speficy the
-appropriate settings accordingly.
+b) If they are not, you can install them yourself, and specify the
+appropriate settings accordingly in a Makefile.lammps.* file
+and set the EXTRAMAKE setting in Makefile.* to that file.
 
-If you want to use the minimalist version of these libraries provided
-with LAMMPS in lib/linalg, then the settings in Makefile.lammps.linalg
-should work.  Note that in this case you also need to build the
-linear-algebra in lib/linalg; see the lib/linalg/README for more
-details.
+c) Use the minimalist version of these libraries provided with LAMMPS
+in lib/linalg, by using Makefile.lammps.linalg.  In this case you also
+need to build the library in lib/linalg; see the lib/linalg/README
+file for more details.
diff --git a/lib/awpmd/Install.py b/lib/awpmd/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/awpmd/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Install.py tool to do a generic build of a library
+# soft linked to by many of the lib/*/Install.py files
+# used to automate the steps described in the corresponding lib/*/README
+
+import sys,commands,os
+
+# help message, shown when no arguments or invalid arguments are given
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = perform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print help (exit status 0) or an error message (exit status 1)
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit(1 if str else 0)   # non-zero status lets make/shell callers detect failure
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/awpmd/README b/lib/awpmd/README
index 3c02480419643f572700f2b54c66c3eeed33f6d2..20e142f74c0fe3bb7375225baeb91b21b0f3636a 100644
--- a/lib/awpmd/README
+++ b/lib/awpmd/README
@@ -19,6 +19,11 @@ links against when using the USER-AWPMD package.
 This library must be built with a C++ compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-awpmd" from the src directory to see help on
+how to build this library via make commands, or you can do the same
+thing by typing "python Install.py" from within this directory, or you
+can do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
@@ -47,16 +52,16 @@ user-awpmd_SYSINC = leave blank for this package
 user-awpmd_SYSLIB = BLAS and LAPACK libraries needed by this package
 user-awpmd_SYSPATH = path(s) to where those libraries are
 
-You have several choices for these settings:
+You have 3 choices for these settings:
 
-If the 2 libraries are already installed on your system, the settings
-in Makefile.lammps.installed should work.
+a) If the 2 libraries are already installed on your system, the
+settings in Makefile.lammps.installed should work.
 
-If they are not, you can install them yourself, and speficy the
-appropriate settings accordingly.
+b) If they are not, you can install them yourself, and specify the
+appropriate settings accordingly in a Makefile.lammps.* file
+and set the EXTRAMAKE setting in Makefile.* to that file.
 
-If you want to use the minimalist version of these libraries provided
-with LAMMPS in lib/linalg, then the settings in Makefile.lammps.linalg
-should work.  Note that in this case you also need to build the
-linear-algebra in lib/linalg; see the lib/linalg/README for more
-details.
+c) Use the minimalist version of these libraries provided with LAMMPS
+in lib/linalg, by using Makefile.lammps.linalg.  In this case you also
+need to build the library in lib/linalg; see the lib/linalg/README
+file for more details.
diff --git a/lib/colvars/Install.py b/lib/colvars/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/colvars/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# Install.py tool to do a generic build of a library
+# soft linked to by many of the lib/*/Install.py files
+# used to automate the steps described in the corresponding lib/*/README
+
+import sys,commands,os
+
+# help message, shown when no arguments or invalid arguments are given
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = perform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print help (exit status 0) or an error message (exit status 1)
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit(1 if str else 0)   # non-zero status lets make/shell callers detect failure
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/colvars/README b/lib/colvars/README
index d6efc333a59a55c04bbcdc0ee3829ed793cae442..a5e5938b20ea4c11f148a14a09d7557a8e3236f8 100644
--- a/lib/colvars/README
+++ b/lib/colvars/README
@@ -35,6 +35,11 @@ links against when using the USER-COLVARS package.
 This library must be built with a C++ compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-colvars" from the src directory to see help on
+how to build this library via make commands, or you can do the same
+thing by typing "python Install.py" from within this directory, or you
+can do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..d396be5e1a00fdbc296d711faaafbd46a3e3a811
--- /dev/null
+++ b/lib/gpu/Install.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+
+# Install.py tool to build the GPU library
+# used to automate the steps described in the README file in this dir
+
+import sys,os,re,commands
+
+# help message, shown when no arguments or invalid arguments are given
+
+help = """
+Syntax: python Install.py -i isuffix -h hdir -a arch -p precision -e esuffix -m -o osuffix
+  specify one or more options, order does not matter
+  copies an existing Makefile.isuffix in lib/gpu to Makefile.auto 
+  optionally edits these variables in Makefile.auto:
+    CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
+  optionally uses Makefile.auto to build the GPU library -> libgpu.a
+    and to copy a Makefile.lammps.esuffix -> Makefile.lammps
+  optionally copies Makefile.auto to a new Makefile.osuffix
+
+  -i = use Makefile.isuffix as starting point, copy to Makefile.auto
+       default isuffix = linux
+  -h = set CUDA_HOME variable in Makefile.auto to hdir
+       hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
+  -a = set CUDA_ARCH variable in Makefile.auto to arch
+       use arch = 35 for K40 (Tesla)
+       use arch = 37 for dual K80 (Tesla)
+       use arch = 60 for P100 (Pascal)
+  -p = set CUDA_PRECISION variable in Makefile.auto to precision
+       use precision = double or mixed or single
+  -e = set EXTRAMAKE variable in Makefile.auto to Makefile.lammps.esuffix
+  -m = make the GPU library using Makefile.auto
+       first performs a "make clean"
+       produces libgpu.a if successful
+       also copies EXTRAMAKE file -> Makefile.lammps
+         -e can set which Makefile.lammps.esuffix file is copied
+  -o = copy final Makefile.auto to Makefile.osuffix
+"""
+
+# print help (exit status 0) or an error message (exit status 1)
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit(1 if str else 0)   # non-zero status lets make/shell callers detect failure
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+isuffix = "linux"
+hflag = aflag = pflag = eflag = 0
+makeflag = 0
+outflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-i":
+    if iarg+2 > nargs: error()
+    isuffix = args[iarg+1]
+    iarg += 2
+  elif args[iarg] == "-h":
+    if iarg+2 > nargs: error()
+    hflag = 1
+    hdir = args[iarg+1]
+    iarg += 2
+  elif args[iarg] == "-a":
+    if iarg+2 > nargs: error()
+    aflag = 1
+    arch = args[iarg+1]
+    iarg += 2
+  elif args[iarg] == "-p":
+    if iarg+2 > nargs: error()
+    pflag = 1
+    precision = args[iarg+1]
+    iarg += 2
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    eflag = 1
+    lmpsuffix = args[iarg+1]
+    iarg += 2
+  elif args[iarg] == "-m":
+    makeflag = 1
+    iarg += 1
+  elif args[iarg] == "-o":
+    if iarg+2 > nargs: error()
+    outflag = 1
+    osuffix = args[iarg+1]
+    iarg += 2
+  else: error()
+
+if pflag:
+  if precision == "double": precstr = "-D_DOUBLE_DOUBLE"
+  elif precision == "mixed": precstr = "-D_SINGLE_DOUBLE"
+  elif precision == "single": precstr = "-D_SINGLE_SINGLE"
+  else: error("Invalid precision setting")
+  
+# create Makefile.auto
+# reset EXTRAMAKE, CUDA_HOME, CUDA_ARCH, CUDA_PRECISION if requested
+  
+if not os.path.exists("Makefile.%s" % isuffix):
+  error("lib/gpu/Makefile.%s does not exist" % isuffix)
+
+lines = open("Makefile.%s" % isuffix,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) != 3:
+    print >>fp,line,
+    continue
+  
+  if hflag and words[0] == "CUDA_HOME" and words[1] == '=':
+    line = line.replace(words[2],hdir)
+  if aflag and words[0] == "CUDA_ARCH" and words[1] == '=':
+    line = line.replace(words[2],"-arch=sm_%s" % arch)
+  if pflag and words[0] == "CUDA_PRECISION" and words[1] == '=':
+    line = line.replace(words[2],precstr)
+  if eflag and words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % lmpsuffix)
+    
+  print >>fp,line,
+
+fp.close()
+
+# perform make, echoing its output so compile errors are visible to the user
+# the make operation also copies the EXTRAMAKE file to Makefile.lammps
+
+if makeflag:
+  print "Building libgpu.a ..."
+  cmd = "rm -f libgpu.a"
+  commands.getoutput(cmd)
+  cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+  print commands.getoutput(cmd)
+  if not os.path.exists("libgpu.a"):
+    error("Build of lib/gpu/libgpu.a was NOT successful")
+  if not os.path.exists("Makefile.lammps"):
+    error("lib/gpu/Makefile.lammps was NOT created")
+
+# copy new Makefile.auto to Makefile.osuffix
+
+if outflag:
+  print "Creating new Makefile.%s" % osuffix
+  cmd = "cp Makefile.auto Makefile.%s" % osuffix
+  commands.getoutput(cmd)
diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile
index e02849cfed12c6142c397d634cb492a7ef054747..660544cfaaff1dc04f9049b4947dd304e9701067 100644
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@@ -43,8 +43,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
        $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
        $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
        $(OBJ_DIR)/lal_charmm_long.o $(OBJ_DIR)/lal_charmm_long_ext.o \
-       $(OBJ_DIR)/lal_cg_cmm.o $(OBJ_DIR)/lal_cg_cmm_ext.o \
-       $(OBJ_DIR)/lal_cg_cmm_long.o $(OBJ_DIR)/lal_cg_cmm_long_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk.o $(OBJ_DIR)/lal_lj_sdk_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk_long.o $(OBJ_DIR)/lal_lj_sdk_long_ext.o \
        $(OBJ_DIR)/lal_eam.o $(OBJ_DIR)/lal_eam_ext.o \
        $(OBJ_DIR)/lal_eam_fs_ext.o $(OBJ_DIR)/lal_eam_alloy_ext.o \
        $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \
@@ -98,8 +98,8 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
        $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
        $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
        $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
-       $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm_cubin.h \
-       $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long_cubin.h \
+       $(OBJ_DIR)/lj_sdk.cubin $(OBJ_DIR)/lj_sdk_cubin.h \
+       $(OBJ_DIR)/lj_sdk_long.cubin $(OBJ_DIR)/lj_sdk_long_cubin.h \
        $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
        $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
        $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
@@ -391,29 +391,29 @@ $(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR
 $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm.cubin: lal_cg_cmm.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm.cu
+$(OBJ_DIR)/lj_sdk.cubin: lal_lj_sdk.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_sdk.cu
 
-$(OBJ_DIR)/cg_cmm_cubin.h: $(OBJ_DIR)/cg_cmm.cubin $(OBJ_DIR)/cg_cmm.cubin
-	$(BIN2C) -c -n cg_cmm $(OBJ_DIR)/cg_cmm.cubin > $(OBJ_DIR)/cg_cmm_cubin.h
+$(OBJ_DIR)/lj_sdk_cubin.h: $(OBJ_DIR)/lj_sdk.cubin $(OBJ_DIR)/lj_sdk.cubin
+	$(BIN2C) -c -n lj_sdk $(OBJ_DIR)/lj_sdk.cubin > $(OBJ_DIR)/lj_sdk_cubin.h
 
-$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp $(OBJ_DIR)/cg_cmm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
-	$(CUDR) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk.cpp $(OBJ_DIR)/lj_sdk_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_lj_sdk.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
-	$(CUDR) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_ext.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_lj_sdk_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm_long.cubin: lal_cg_cmm_long.cu lal_precision.h lal_preprocessor.h
-	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_cg_cmm_long.cu
+$(OBJ_DIR)/lj_sdk_long.cubin: lal_lj_sdk_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_sdk_long.cu
 
-$(OBJ_DIR)/cg_cmm_long_cubin.h: $(OBJ_DIR)/cg_cmm_long.cubin $(OBJ_DIR)/cg_cmm_long.cubin
-	$(BIN2C) -c -n cg_cmm_long $(OBJ_DIR)/cg_cmm_long.cubin > $(OBJ_DIR)/cg_cmm_long_cubin.h
+$(OBJ_DIR)/lj_sdk_long_cubin.h: $(OBJ_DIR)/lj_sdk_long.cubin $(OBJ_DIR)/lj_sdk_long.cubin
+	$(BIN2C) -c -n lj_sdk_long $(OBJ_DIR)/lj_sdk_long.cubin > $(OBJ_DIR)/lj_sdk_long_cubin.h
 
-$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp $(OBJ_DIR)/cg_cmm_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
-	$(CUDR) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_long.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long.cpp $(OBJ_DIR)/lj_sdk_long_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_lj_sdk_long.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
-	$(CUDR) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_long_ext.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_sdk_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
 	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_eam.cu
diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile
index 7ef1dfba0cf0790f086fa533175bfff257182e68..4a5959531388ebd3e43ba513f20d7448df45131c 100644
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@@ -32,8 +32,8 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
        $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
        $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
        $(OBJ_DIR)/lal_charmm_long.o $(OBJ_DIR)/lal_charmm_long_ext.o \
-       $(OBJ_DIR)/lal_cg_cmm.o $(OBJ_DIR)/lal_cg_cmm_ext.o \
-       $(OBJ_DIR)/lal_cg_cmm_long.o $(OBJ_DIR)/lal_cg_cmm_long_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk.o $(OBJ_DIR)/lal_lj_sdk_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk_long.o $(OBJ_DIR)/lal_lj_sdk_long_ext.o \
        $(OBJ_DIR)/lal_eam.o $(OBJ_DIR)/lal_eam_ext.o \
        $(OBJ_DIR)/lal_eam_fs_ext.o $(OBJ_DIR)/lal_eam_alloy_ext.o \
        $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \
@@ -75,8 +75,8 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
        $(OBJ_DIR)/lj_coul_long_cl.h $(OBJ_DIR)/lj_dsf_cl.h \
        $(OBJ_DIR)/lj_class2_long_cl.h \
        $(OBJ_DIR)/coul_long_cl.h $(OBJ_DIR)/morse_cl.h \
-       $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/cg_cmm_cl.h \
-       $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
+       $(OBJ_DIR)/charmm_long_cl.h $(OBJ_DIR)/lj_sdk_cl.h \
+       $(OBJ_DIR)/lj_sdk_long_cl.h $(OBJ_DIR)/neighbor_gpu_cl.h \
        $(OBJ_DIR)/eam_cl.h $(OBJ_DIR)/buck_cl.h \
        $(OBJ_DIR)/buck_coul_cl.h $(OBJ_DIR)/buck_coul_long_cl.h \
        $(OBJ_DIR)/table_cl.h $(OBJ_DIR)/yukawa_cl.h \
@@ -273,23 +273,23 @@ $(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp  $(OBJ_DI
 $(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_lj_expand_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm_cl.h: lal_cg_cmm.cu $(PRE1_H)
-	$(BSH) ./geryon/file_to_cstr.sh cg_cmm $(PRE1_H) lal_cg_cmm.cu $(OBJ_DIR)/cg_cmm_cl.h;
+$(OBJ_DIR)/lj_sdk_cl.h: lal_lj_sdk.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_sdk $(PRE1_H) lal_lj_sdk.cu $(OBJ_DIR)/lj_sdk_cl.h;
 
-$(OBJ_DIR)/lal_cg_cmm.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm.cpp  $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/cg_cmm_cl.h $(OBJ_DIR)/lal_base_atomic.o
-	$(OCL) -o $@ -c lal_cg_cmm.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk.cpp  $(OBJ_DIR)/lj_sdk_cl.h $(OBJ_DIR)/lj_sdk_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_lj_sdk.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lal_cg_cmm_ext.o: $(ALL_H) lal_cg_cmm.h lal_cg_cmm_ext.cpp lal_base_atomic.h
-	$(OCL) -o $@ -c lal_cg_cmm_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_ext.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk_ext.cpp lal_base_atomic.h
+	$(OCL) -o $@ -c lal_lj_sdk_ext.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/cg_cmm_long_cl.h: lal_cg_cmm_long.cu $(PRE1_H)
-	$(BSH) ./geryon/file_to_cstr.sh cg_cmm_long $(PRE1_H) lal_cg_cmm_long.cu $(OBJ_DIR)/cg_cmm_long_cl.h;
+$(OBJ_DIR)/lj_sdk_long_cl.h: lal_lj_sdk_long.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_sdk_long $(PRE1_H) lal_lj_sdk_long.cu $(OBJ_DIR)/lj_sdk_long_cl.h;
 
-$(OBJ_DIR)/lal_cg_cmm_long.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long.cpp  $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/cg_cmm_long_cl.h $(OBJ_DIR)/lal_base_atomic.o
-	$(OCL) -o $@ -c lal_cg_cmm_long.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_long.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long.cpp  $(OBJ_DIR)/lj_sdk_long_cl.h $(OBJ_DIR)/lj_sdk_long_cl.h $(OBJ_DIR)/lal_base_atomic.o
+	$(OCL) -o $@ -c lal_lj_sdk_long.cpp -I$(OBJ_DIR)
 
-$(OBJ_DIR)/lal_cg_cmm_long_ext.o: $(ALL_H) lal_cg_cmm_long.h lal_cg_cmm_long_ext.cpp lal_base_charge.h
-	$(OCL) -o $@ -c lal_cg_cmm_long_ext.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/lal_lj_sdk_long_ext.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long_ext.cpp lal_base_charge.h
+	$(OCL) -o $@ -c lal_lj_sdk_long_ext.cpp -I$(OBJ_DIR)
 
 $(OBJ_DIR)/eam_cl.h: lal_eam.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh eam $(PRE1_H) lal_eam.cu $(OBJ_DIR)/eam_cl.h;
diff --git a/lib/gpu/README b/lib/gpu/README
index 45c8ce49ba675b92f1be2f804a72e218fc8cce2a..b26897e885379c95eb8cda4b69850dad70879dff 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -17,6 +17,11 @@ links against when using the GPU package.
 This library must be built with a C++ compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-gpu" from the src directory to see help on how
+to build this library via make commands, or you can do the same thing
+by typing "python Install.py" from within this directory, or you can
+do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
@@ -164,9 +169,9 @@ this directory).
 The gpu library supports 3 precision modes as determined by 
 the CUDA_PRECISION variable:
 
-  CUDA_PREC = -D_SINGLE_SINGLE  # Single precision for all calculations
-  CUDA_PREC = -D_DOUBLE_DOUBLE  # Double precision for all calculations
-  CUDA_PREC = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
+  CUDA_PRECISION = -D_SINGLE_SINGLE  # Single precision for all calculations
+  CUDA_PRECISION = -D_DOUBLE_DOUBLE  # Double precision for all calculations
+  CUDA_PRECISION = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
 
 NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
       You will get the error "GPU library not compiled for this accelerator."
diff --git a/lib/gpu/lal_cg_cmm.cpp b/lib/gpu/lal_lj_sdk.cpp
similarity index 85%
rename from lib/gpu/lal_cg_cmm.cpp
rename to lib/gpu/lal_lj_sdk.cpp
index d361e32b09e0e8b223766ea66b9e5b1ef3efb0c6..618555e38a794687cb2d6fde7c0c8ae2a57445db 100644
--- a/lib/gpu/lal_cg_cmm.cpp
+++ b/lib/gpu/lal_lj_sdk.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                                 cg_cmm.cpp
+                                 lj_sdk.cpp
                              -------------------
                             W. Michael Brown (ORNL)
 
@@ -14,14 +14,14 @@
  ***************************************************************************/
 
 #if defined(USE_OPENCL)
-#include "cg_cmm_cl.h"
+#include "lj_sdk_cl.h"
 #elif defined(USE_CUDART)
-const char *cg_cmm=0;
+const char *lj_sdk=0;
 #else
-#include "cg_cmm_cubin.h"
+#include "lj_sdk_cubin.h"
 #endif
 
-#include "lal_cg_cmm.h"
+#include "lal_lj_sdk.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define CGCMMT CGCMM<numtyp, acctyp>
@@ -53,33 +53,33 @@ int CGCMMT::init(const int ntypes, double **host_cutsq,
                           const double gpu_split, FILE *_screen) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,cg_cmm,"k_cg_cmm");
+                            _screen,lj_sdk,"k_lj_sdk");
   if (success!=0)
     return success;
 
   // If atom type constants fit in shared memory use fast kernel
-  int cmm_types=ntypes;
+  int sdk_types=ntypes;
   shared_types=false;
   int max_shared_types=this->device->max_shared_types();
-  if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
-    cmm_types=max_shared_types;
+  if (sdk_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    sdk_types=max_shared_types;
     shared_types=true;
   }
-  _cmm_types=cmm_types;
+  _sdk_types=sdk_types;
 
   // Allocate a host write buffer for data initialization
-  UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
+  UCL_H_Vec<numtyp> host_write(sdk_types*sdk_types*32,*(this->ucl_device),
                                UCL_WRITE_ONLY);
 
-  for (int i=0; i<cmm_types*cmm_types; i++)
+  for (int i=0; i<sdk_types*sdk_types; i++)
     host_write[i]=0.0;
 
-  lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
+  lj1.alloc(sdk_types*sdk_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,sdk_types,lj1,host_write,host_cutsq,
                          host_cg_type,host_lj1,host_lj2);
 
-  lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
-  this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
+  lj3.alloc(sdk_types*sdk_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,sdk_types,lj3,host_write,host_lj3,host_lj4,
                          host_offset);
 
   UCL_H_Vec<double> dview;
@@ -143,7 +143,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) {
   } else {
     this->k_pair.set_size(GX,BX);
     this->k_pair.run(&this->atom->x, &lj1, &lj3,
-                     &_cmm_types, &sp_lj, &this->nbor->dev_nbor,
+                     &_sdk_types, &sp_lj, &this->nbor->dev_nbor,
                      &this->_nbor_data->begin(), &this->ans->force,
                      &this->ans->engv, &eflag, &vflag, &ainum,
                      &nbor_pitch, &this->_threads_per_atom);
diff --git a/lib/gpu/lal_cg_cmm.cu b/lib/gpu/lal_lj_sdk.cu
similarity index 97%
rename from lib/gpu/lal_cg_cmm.cu
rename to lib/gpu/lal_lj_sdk.cu
index 70d2ab6092c6ef4c06ccf4bb1f55240524fba966..01b2cdd18d287b14737984e9167e67df2b5a1487 100644
--- a/lib/gpu/lal_cg_cmm.cu
+++ b/lib/gpu/lal_lj_sdk.cu
@@ -1,5 +1,5 @@
 // **************************************************************************
-//                                  cg_cmm.cu
+//                                  lj_sdk.cu
 //                             -------------------
 //                           W. Michael Brown (ORNL)
 //
@@ -24,7 +24,7 @@ texture<int4,1> pos_tex;
 #define pos_tex x_
 #endif
 
-__kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
+__kernel void k_lj_sdk(const __global numtyp4 *restrict x_,
                        const __global numtyp4 *restrict lj1,
                        const __global numtyp4 *restrict lj3,
                        const int lj_types,
@@ -116,7 +116,7 @@ __kernel void k_cg_cmm(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_cg_cmm_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1_in,
                             const __global numtyp4 *restrict lj3_in,
                             const __global numtyp *restrict sp_lj_in,
diff --git a/lib/gpu/lal_cg_cmm.h b/lib/gpu/lal_lj_sdk.h
similarity index 97%
rename from lib/gpu/lal_cg_cmm.h
rename to lib/gpu/lal_lj_sdk.h
index b7895b5898230d6179e1e680c3b5a116d656c3a4..ac2b9aafe30de19d5a47f04f8a3a63acb726eadc 100644
--- a/lib/gpu/lal_cg_cmm.h
+++ b/lib/gpu/lal_lj_sdk.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                                  cg_cmm.h
+                                  lj_sdk.h
                              -------------------
                             W. Michael Brown (ORNL)
 
@@ -67,7 +67,7 @@ class CGCMM : public BaseAtomic<numtyp, acctyp> {
   bool shared_types;
 
   /// Number of atom types
-  int _cmm_types;
+  int _sdk_types;
 
  private:
   bool _allocated;
diff --git a/lib/gpu/lal_cg_cmm_ext.cpp b/lib/gpu/lal_lj_sdk_ext.cpp
similarity index 93%
rename from lib/gpu/lal_cg_cmm_ext.cpp
rename to lib/gpu/lal_lj_sdk_ext.cpp
index b6fc110b15164e086f1829214f74555de58d8993..386106161e1b33a4ca11391e9470089d482ebeaf 100644
--- a/lib/gpu/lal_cg_cmm_ext.cpp
+++ b/lib/gpu/lal_lj_sdk_ext.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                                  cg_cmm.h
+                               lj_sdk_ext.cpp
                              -------------------
                             W. Michael Brown (ORNL)
 
@@ -17,7 +17,7 @@
 #include <cassert>
 #include <math.h>
 
-#include "lal_cg_cmm.h"
+#include "lal_lj_sdk.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
@@ -27,7 +27,7 @@ static CGCMM<PRECISION,ACC_PRECISION> CMMMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int inum, const int nall, const int max_nbors,
@@ -89,11 +89,11 @@ int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
   return init_ok;
 }
 
-void cmm_gpu_clear() {
+void sdk_gpu_clear() {
   CMMMF.clear();
 }
 
-int** cmm_gpu_compute_n(const int ago, const int inum_full,
+int** sdk_gpu_compute_n(const int ago, const int inum_full,
                         const int nall, double **host_x, int *host_type,
                         double *sublo, double *subhi, tagint *tag, int **nspecial,
                         tagint **special, const bool eflag, const bool vflag,
@@ -105,7 +105,7 @@ int** cmm_gpu_compute_n(const int ago, const int inum_full,
                        vatom, host_start, ilist, jnum, cpu_time, success);
 }
 
-void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
+void sdk_gpu_compute(const int ago, const int inum_full, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
@@ -114,7 +114,7 @@ void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
                 firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
 }
 
-double cmm_gpu_bytes() {
+double sdk_gpu_bytes() {
   return CMMMF.host_memory_usage();
 }
 
diff --git a/lib/gpu/lal_cg_cmm_long.cpp b/lib/gpu/lal_lj_sdk_long.cpp
similarity index 96%
rename from lib/gpu/lal_cg_cmm_long.cpp
rename to lib/gpu/lal_lj_sdk_long.cpp
index 14b5b7622cba9ebb42d947bd1e3d9c98af467ab0..46caf6bd36ddac20244f928901bed9a7cf1d6717 100644
--- a/lib/gpu/lal_cg_cmm_long.cpp
+++ b/lib/gpu/lal_lj_sdk_long.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                               cg_cmm_long.cpp
+                               lj_sdk_long.cpp
                              -------------------
                             W. Michael Brown (ORNL)
 
@@ -14,14 +14,14 @@
  ***************************************************************************/
 
 #if defined(USE_OPENCL)
-#include "cg_cmm_long_cl.h"
+#include "lj_sdk_long_cl.h"
 #elif defined(USE_CUDART)
-const char *cg_cmm_long=0;
+const char *lj_sdk_long=0;
 #else
-#include "cg_cmm_long_cubin.h"
+#include "lj_sdk_long_cubin.h"
 #endif
 
-#include "lal_cg_cmm_long.h"
+#include "lal_lj_sdk_long.h"
 #include <cassert>
 using namespace LAMMPS_AL;
 #define CGCMMLongT CGCMMLong<numtyp, acctyp>
@@ -58,7 +58,7 @@ int CGCMMLongT::init(const int ntypes, double **host_cutsq,
                            const double g_ewald) {
   int success;
   success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,cg_cmm_long,"k_cg_cmm_long");
+                            _screen,lj_sdk_long,"k_lj_sdk_long");
   if (success!=0)
     return success;
 
diff --git a/lib/gpu/lal_cg_cmm_long.cu b/lib/gpu/lal_lj_sdk_long.cu
similarity index 98%
rename from lib/gpu/lal_cg_cmm_long.cu
rename to lib/gpu/lal_lj_sdk_long.cu
index f6942d1809924c2a2c2758f247024e92c01846b0..5ff64b22540cd48b7f85a5a3eaf8c0ebefeb9e66 100644
--- a/lib/gpu/lal_cg_cmm_long.cu
+++ b/lib/gpu/lal_lj_sdk_long.cu
@@ -1,5 +1,5 @@
 // **************************************************************************
-//                                cg_cmm_long.cu
+//                                lj_sdk_long.cu
 //                             -------------------
 //                           W. Michael Brown (ORNL)
 //
@@ -29,7 +29,7 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif
 
-__kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
+__kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_,
                             const __global numtyp4 *restrict lj1,
                             const __global numtyp4 *restrict lj3,
                             const int lj_types,
@@ -154,7 +154,7 @@ __kernel void k_cg_cmm_long(const __global numtyp4 *restrict x_,
   } // if ii
 }
 
-__kernel void k_cg_cmm_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_,
                                  const __global numtyp4 *restrict lj1_in,
                                  const __global numtyp4 *restrict lj3_in,
                                  const __global numtyp *restrict sp_lj_in,
diff --git a/lib/gpu/lal_cg_cmm_long.h b/lib/gpu/lal_lj_sdk_long.h
similarity index 98%
rename from lib/gpu/lal_cg_cmm_long.h
rename to lib/gpu/lal_lj_sdk_long.h
index aa0cbfbaf0dcfa43f8dc9c8c267230341c3d266c..f56687cd7dd10be2febb56683391fd8daa9ff498 100644
--- a/lib/gpu/lal_cg_cmm_long.h
+++ b/lib/gpu/lal_lj_sdk_long.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                                cg_cmm_long.h
+                                lj_sdk_long.h
                              -------------------
                             W. Michael Brown (ORNL)
 
diff --git a/lib/gpu/lal_cg_cmm_long_ext.cpp b/lib/gpu/lal_lj_sdk_long_ext.cpp
similarity index 93%
rename from lib/gpu/lal_cg_cmm_long_ext.cpp
rename to lib/gpu/lal_lj_sdk_long_ext.cpp
index ee0a0269e5dbca51ceff40ef0c89ca8596d60c8b..08390d3eeb8f1e011324e394f6b5b2281409746b 100644
--- a/lib/gpu/lal_cg_cmm_long_ext.cpp
+++ b/lib/gpu/lal_lj_sdk_long_ext.cpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                                cg_cmm_long.h
+                             lj_sdk_long_ext.cpp
                              -------------------
                             W. Michael Brown (ORNL)
 
@@ -17,7 +17,7 @@
 #include <cassert>
 #include <math.h>
 
-#include "lal_cg_cmm_long.h"
+#include "lal_lj_sdk_long.h"
 
 using namespace std;
 using namespace LAMMPS_AL;
@@ -27,7 +27,7 @@ static CGCMMLong<PRECISION,ACC_PRECISION> CMMLMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
+int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int inum, const int nall, const int max_nbors,
@@ -93,11 +93,11 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
   return init_ok;
 }
 
-void cmml_gpu_clear() {
+void sdkl_gpu_clear() {
   CMMLMF.clear();
 }
 
-int** cmml_gpu_compute_n(const int ago, const int inum_full,
+int** sdkl_gpu_compute_n(const int ago, const int inum_full,
                          const int nall, double **host_x, int *host_type,
                          double *sublo, double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
@@ -111,7 +111,7 @@ int** cmml_gpu_compute_n(const int ago, const int inum_full,
                         host_q,boxlo,prd);
 }
 
-void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
+void sdkl_gpu_compute(const int ago, const int inum_full, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
@@ -122,7 +122,7 @@ void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
                 host_q,nlocal,boxlo,prd);
 }
 
-double cmml_gpu_bytes() {
+double sdkl_gpu_bytes() {
   return CMMLMF.host_memory_usage();
 }
 
diff --git a/lib/h5md/Install.py b/lib/h5md/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/h5md/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# install.py tool to do a generic build of a library
+# soft linked to by many of the lib/Install.py files
+# used to automate the steps described in the corresponding lib/README
+
+import sys,commands,os
+
+# help message: printed by error() when it is called without a message
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = perform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print the help text (no message given) or "ERROR" plus the message, then exit
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/h5md/Makefile b/lib/h5md/Makefile.h5cc
similarity index 95%
rename from lib/h5md/Makefile
rename to lib/h5md/Makefile.h5cc
index 085d21ff69c4e4c0502361be2e716c4b5ed6db9d..bd3e8a978432448a446d415ccfc7a062e0390f22 100644
--- a/lib/h5md/Makefile
+++ b/lib/h5md/Makefile.h5cc
@@ -19,7 +19,7 @@ build/ch5md.o: src/ch5md.c | build
 	$(CC) $(INC) $(CFLAGS) -c $< -o $@
 
 Makefile.lammps:
-	cp Makefile.lammps.empty $@
+	cp $(EXTRAMAKE) $@
 
 .PHONY: all lib clean
 
diff --git a/lib/h5md/README b/lib/h5md/README
index 62a4979cba6bc5b5853723481cfdb2ef19937791..fb7d82bfccc3dc993adb2cd39cae00cb52273944 100644
--- a/lib/h5md/README
+++ b/lib/h5md/README
@@ -3,6 +3,11 @@ LAMMPS under its own BSD license; see below.  This library is used
 when the USER-H5MD package is included in a LAMMPS build and the dump
 h5md command is invoked in a LAMMPS input script.
 
+You can type "make lib-h5md" from the src directory to see help on how
+to build this library via make commands, or you can do the same thing
+by typing "python Install.py" from within this directory, or you can
+do it manually by following the instructions below.
+
 ---------------------
 
 ch5md : Read and write H5MD files in C
@@ -17,8 +22,14 @@ molecular data, whose development is found at <http://nongnu.org/h5md/>.
 ch5md is developped by Pierre de Buyl and is released under the 3-clause BSD
 license that can be found in the file LICENSE.
 
-To use the h5md dump style in lammps, execute make in this directory then 'make
-yes-user-h5md' in the src directory of lammps. Rebuild lammps. 
+To use the h5md dump style in lammps, execute
+make -f Makefile.h5cc
+in this directory then
+make yes-user-h5md
+in the src directory of LAMMPS, and then rebuild LAMMPS.
+
+Note that you must have the h5cc compiler wrapper installed to use
+Makefile.h5cc.  It should be part of your HDF5 installation.
 
 If HDF5 is not in a standard system location, edit Makefile.lammps accordingly.
 
diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 4a96e244188bb6c7d68987d34696fff392e2c997..c6fe991b9761d5ef20af649f54224b03f2dd7fe8 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,28 @@
 # Change Log
 
+## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
+
+**Implemented enhancements:**
+
+- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711)
+- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
+- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
+- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
+- Turning off IVDEP for Intel 14.  [\#638](https://github.com/kokkos/kokkos/issues/638)
+- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
+- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
+- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
+- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
+
+**Fixed bugs:**
+
+- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
+- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
+- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
+- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
+- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
+
 ## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)
 
diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
index 16854c839a044e5da9084d2a1a7eeb4360ab0327..1c820660ae375006e83bd50c0d4bbd8472ed0258 100644
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@@ -98,10 +98,10 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   )
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
-  Kokkos_ENABLE_QTHREAD
-  KOKKOS_HAVE_QTHREAD
-  "Enable QTHREAD support in Kokkos."
-  "${TPL_ENABLE_QTHREAD}"
+  Kokkos_ENABLE_Qthreads
+  KOKKOS_HAVE_QTHREADS
+  "Enable Qthreads support in Kokkos."
+  "${TPL_ENABLE_QTHREADS}"
   )
 
 TRIBITS_ADD_OPTION_AND_DEFINE(
@@ -110,7 +110,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   "Enable C++11 support in Kokkos."
   "${${PROJECT_NAME}_ENABLE_CXX11}"
   )
-  
+
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_HWLOC
   KOKKOS_HAVE_HWLOC
@@ -213,4 +213,3 @@ TRIBITS_EXCLUDE_FILES(
   )
 
 TRIBITS_PACKAGE_POSTPROCESS()
-
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index 9d00c19027a37387888d9f0265c7cdfecb45cc56..5b094dba8cb786c94c9119a5865fcc0dadf9a76f 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -1,39 +1,38 @@
-# Default settings common options
+# Default settings common options.
 
 #LAMMPS specific settings:
 KOKKOS_PATH=../../lib/kokkos
 CXXFLAGS=$(CCFLAGS)
 
-#Options: OpenMP,Serial,Pthreads,Cuda
+# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
-#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
+# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
 KOKKOS_ARCH ?= ""
-#Options: yes,no
+# Options: yes,no
 KOKKOS_DEBUG ?= "no"
-#Options: hwloc,librt,experimental_memkind
+# Options: hwloc,librt,experimental_memkind
 KOKKOS_USE_TPLS ?= ""
-#Options: c++11,c++1z
+# Options: c++11,c++1z
 KOKKOS_CXX_STANDARD ?= "c++11"
-#Options: aggressive_vectorization,disable_profiling
+# Options: aggressive_vectorization,disable_profiling
 KOKKOS_OPTIONS ?= ""
 
-#Default settings specific options
-#Options: force_uvm,use_ldg,rdc,enable_lambda
+# Default settings specific options.
+# Options: force_uvm,use_ldg,rdc,enable_lambda
 KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
 
-# Check for general settings
-
+# Check for general settings.
 KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
 KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
 KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
 
-# Check for external libraries
+# Check for external libraries.
 KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
 KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
 KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
 
-# Check for advanced settings
+# Check for advanced settings.
 KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
 KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
@@ -41,21 +40,21 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
 
-# Check for Kokkos Host Execution Spaces one of which must be on
-
+# Check for Kokkos Host Execution Spaces one of which must be on.
 KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
 KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
+KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
 KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
-KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
-	KOKKOS_INTERNAL_USE_SERIAL := 1
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
+  KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
 endif
 endif
 
-# Check for other Execution Spaces
-
+# Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@@ -64,27 +63,25 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
   KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
 endif
 
-# Check OS
-
+# Check OS.
 KOKKOS_OS                      := $(shell uname -s)
 KOKKOS_INTERNAL_OS_CYGWIN      := $(shell uname -s | grep CYGWIN | wc -l)
 KOKKOS_INTERNAL_OS_LINUX       := $(shell uname -s | grep Linux  | wc -l)
 KOKKOS_INTERNAL_OS_DARWIN      := $(shell uname -s | grep Darwin | wc -l)
 
-# Check compiler
-
-KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version        2>&1 | grep "Intel Corporation" | wc -l)
-KOKKOS_INTERNAL_COMPILER_PGI   := $(shell $(CXX) --version        2>&1 | grep PGI   | wc -l)
-KOKKOS_INTERNAL_COMPILER_XL    := $(shell $(CXX) -qversion        2>&1 | grep XL    | wc -l)
-KOKKOS_INTERNAL_COMPILER_CRAY  := $(shell $(CXX) -craype-verbose  2>&1 | grep "CC-" | wc -l)
-KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(CXX) --version        2>&1 | grep "nvcc" | wc -l)
+# Check compiler.
+KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version       2>&1 | grep "Intel Corporation" | wc -l)
+KOKKOS_INTERNAL_COMPILER_PGI   := $(shell $(CXX) --version       2>&1 | grep PGI                 | wc -l)
+KOKKOS_INTERNAL_COMPILER_XL    := $(shell $(CXX) -qversion       2>&1 | grep XL                  | wc -l)
+KOKKOS_INTERNAL_COMPILER_CRAY  := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-"               | wc -l)
+KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(CXX) --version       2>&1 | grep "nvcc"              | wc -l)
 ifneq ($(OMPI_CXX),)
   KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(OMPI_CXX) --version   2>&1 | grep "nvcc" | wc -l)
 endif
 ifneq ($(MPICH_CXX),)
   KOKKOS_INTERNAL_COMPILER_NVCC  := $(shell $(MPICH_CXX) --version  2>&1 | grep "nvcc" | wc -l)
 endif
-KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version        2>&1 | grep "clang" | wc -l)
+KOKKOS_INTERNAL_COMPILER_CLANG := $(shell $(CXX) --version       2>&1 | grep "clang"             | wc -l)
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
   KOKKOS_INTERNAL_COMPILER_CLANG = 1
@@ -95,17 +92,17 @@ endif
 
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
   KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
+
   ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
     ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
-      $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)    
+      $(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
     endif
     KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
   endif
 endif
 
-
 ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-  KOKKOS_INTERNAL_OPENMP_FLAG := -mp 
+  KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
@@ -114,7 +111,7 @@ else
       KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
     else
       ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-        # OpenMP is turned on by default in Cray compiler environment
+        # OpenMP is turned on by default in Cray compiler environment.
         KOKKOS_INTERNAL_OPENMP_FLAG :=
       else
         KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
@@ -138,9 +135,9 @@ else
   endif
 endif
 
-# Check for Kokkos Architecture settings
+# Check for Kokkos Architecture settings.
 
-#Intel based
+# Intel based.
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
@@ -148,8 +145,8 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW |
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 
-#NVIDIA based
-NVCC_WRAPPER :=  $(KOKKOS_PATH)/config/nvcc_wrapper
+# NVIDIA based.
+NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
@@ -170,46 +167,46 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
                                                       + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
-KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
-KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
-KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
-                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
-endif
-
-#ARM based
+  KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
+  KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
+  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60)  \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                        + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+endif
+
+# ARM based.
 KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
 
-#IBM based
+# IBM based.
 KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
 
-#AMD based
+# AMD based.
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
 
-#Any AVX?
+# Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX2       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
 
-# Decide what ISA level we are able to support
-KOKKOS_INTERNAL_USE_ISA_X86_64     := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
-KOKKOS_INTERNAL_USE_ISA_KNC        := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
-KOKKOS_INTERNAL_USE_ISA_POWERPCLE  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
+# Decide what ISA level we are able to support.
+KOKKOS_INTERNAL_USE_ISA_X86_64    := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
+KOKKOS_INTERNAL_USE_ISA_KNC       := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
+KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
 
-#Incompatible flags?
+# Incompatible flags?
 KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
 
@@ -220,7 +217,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
   $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
 endif
 
-#Generating the list of Flags
+# Generating the list of Flags.
 
 KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
 
@@ -233,98 +230,96 @@ KOKKOS_CXXFLAGS =
 
 KOKKOS_LIBS = -lkokkos -ldl
 KOKKOS_LDFLAGS = -L$(shell pwd)
-KOKKOS_SRC = 
+KOKKOS_SRC =
 KOKKOS_HEADERS =
 
-#Generating the KokkosCore_config.h file
+# Generating the KokkosCore_config.h file.
 
 tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
 tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
 tmp := $(shell date >> KokkosCore_config.tmp)
 tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
 
-
 tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) 
+  tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp)
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp )
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
-	tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
-  	tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
-endif
-
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
-	KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib 
-	tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
 endif
 
 tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
-	tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
-        tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
-        tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-	KOKKOS_CXXFLAGS += -lineinfo
+  KOKKOS_CXXFLAGS += -lineinfo
 endif
-	KOKKOS_CXXFLAGS += -g 
-	KOKKOS_LDFLAGS += -g -ldl
-	tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -g
+  KOKKOS_LDFLAGS += -g -ldl
+  tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
-	KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
-	KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib 
-        KOKKOS_LIBS += -lhwloc
-	tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
+  KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+  KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
+  KOKKOS_LIBS += -lhwloc
+  tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
-	tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
   tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> KokkosCore_config.tmp )
-	KOKKOS_LIBS += -lrt
+  KOKKOS_LIBS += -lrt
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
-  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib 
-        KOKKOS_LIBS += -lmemkind
+  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
+  KOKKOS_LIBS += -lmemkind
   tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif
 
@@ -341,262 +336,286 @@ endif
 tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
-	tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += --relocatable-device-code=true
-	KOKKOS_LDFLAGS += --relocatable-device-code=true
+  tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += --relocatable-device-code=true
+  KOKKOS_LDFLAGS += --relocatable-device-code=true
 endif
 
 ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
     ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
-	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += -expt-extended-lambda
+      tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
+      KOKKOS_CXXFLAGS += -expt-extended-lambda
     else
       $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
     endif
   endif
+
   ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
     tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
   endif
 endif
+
 endif
 
-#Add Architecture flags
+# Add Architecture flags.
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8-a
-		KOKKOS_LDFLAGS += -march=armv8-a
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8-a
+      KOKKOS_LDFLAGS += -march=armv8-a
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8.1-a
-		KOKKOS_LDFLAGS += -march=armv8.1-a
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8.1-a
+      KOKKOS_LDFLAGS += -march=armv8.1-a
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
-    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-	KOKKOS_CXXFLAGS +=
-	KOKKOS_LDFLAGS +=
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+    KOKKOS_CXXFLAGS +=
+    KOKKOS_LDFLAGS +=
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+      KOKKOS_CXXFLAGS +=
+      KOKKOS_LDFLAGS +=
     else
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
-		KOKKOS_CXXFLAGS +=
-		KOKKOS_LDFLAGS +=
-	else
-		KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
-		KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
-	endif
+      KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
+      KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
     endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -mavx
-		KOKKOS_LDFLAGS  += -mavx
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
-				KOKKOS_CXXFLAGS += -tp=sandybridge
-				KOKKOS_LDFLAGS  += -tp=sandybridge
-			else
-				# Assume that this is a really a GNU compiler
-				KOKKOS_CXXFLAGS += -mavx
-				KOKKOS_LDFLAGS  += -mavx
-			endif
-		endif
-	endif
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -mavx
+    KOKKOS_LDFLAGS  += -mavx
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=sandybridge
+        KOKKOS_LDFLAGS  += -tp=sandybridge
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -mavx
+        KOKKOS_LDFLAGS  += -mavx
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
+  tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-	else
-		# Assume that this is a really a GNU compiler or it could be XL on P8
-		KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
-		KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
-	endif
+  else
+    # Assume that this is really a GNU compiler, or it could be XL on P8.
+    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+    KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
+  tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-	else
-		# Assume that this is a really a GNU compiler or it could be XL on P9
-		KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
-		KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
-	endif
+  else
+    # Assume that this is really a GNU compiler, or it could be XL on P9.
+    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+    KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xCORE-AVX2
-		KOKKOS_LDFLAGS  += -xCORE-AVX2
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
-
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) 
-				KOKKOS_CXXFLAGS += -tp=haswell
-				KOKKOS_LDFLAGS  += -tp=haswell
-			else
-				# Assume that this is a really a GNU compiler
-				KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
-				KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
-			endif
-		endif
-	endif
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX2
+    KOKKOS_LDFLAGS  += -xCORE-AVX2
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+        KOKKOS_CXXFLAGS += -tp=haswell
+        KOKKOS_LDFLAGS  += -tp=haswell
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+        KOKKOS_LDFLAGS  += -march=core-avx2 -mtune=core-avx2
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xMIC-AVX512
-		KOKKOS_LDFLAGS  += -xMIC-AVX512
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xMIC-AVX512
+    KOKKOS_LDFLAGS  += -xMIC-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+    else
+       ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-			else
-				# Asssume that this is really a GNU compiler
-				KOKKOS_CXXFLAGS += -march=knl
-				KOKKOS_LDFLAGS  += -march=knl
-			endif
-		endif
-	endif
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=knl
+        KOKKOS_LDFLAGS  += -march=knl
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
-		KOKKOS_CXXFLAGS += -xCORE-AVX512
-		KOKKOS_LDFLAGS  += -xCORE-AVX512
-	else
-		ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
+    KOKKOS_CXXFLAGS += -xCORE-AVX512
+    KOKKOS_LDFLAGS  += -xCORE-AVX512
+  else
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
 
-		else
-			ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
-			else
-				# Nothing here yet
-				KOKKOS_CXXFLAGS += -march=skylake-avx512
-				KOKKOS_LDFLAGS  += -march=skylake-avx512
-			endif
-		endif
-	endif
+      else
+        # Assume that this is really a GNU compiler.
+        KOKKOS_CXXFLAGS += -march=skylake-avx512
+        KOKKOS_LDFLAGS  += -march=skylake-avx512
+      endif
+    endif
+  endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += -mmic
-	KOKKOS_LDFLAGS += -mmic
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -mmic
+  KOKKOS_LDFLAGS += -mmic
 endif
 
-#Figure out the architecture flag for Cuda
+# Figure out the architecture flag for Cuda.
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+
 ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
   KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
 endif
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-  KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
+  KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=--cuda-gpu-arch
+  KOKKOS_CXXFLAGS += -x cuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
-	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
-    tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
-        KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
 endif
+
 endif
- 
+
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
 ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
-KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
 else
-KOKKOS_INTERNAL_NEW_CONFIG := 1
+  KOKKOS_INTERNAL_NEW_CONFIG := 1
 endif
 
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
-	tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
+  tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
 endif
 
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
@@ -609,53 +628,57 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
-	KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
-	KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 
-	KOKKOS_LIBS += -lcudart -lcuda
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+  KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
+  KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+  KOKKOS_LIBS += -lcudart -lcuda
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	KOKKOS_LIBS += -lpthread
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+
+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  else
+    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+  endif
+
+  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_LIBS += -lqthread
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+  KOKKOS_LIBS += -lpthread
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
-	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
-	ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
-		KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
-	else
-		KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
-	endif
-	KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
+  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+  KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
+  KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
+  KOKKOS_LIBS += -lqthread
 endif
 
-#Explicitly set the GCC Toolchain for Clang
+# Explicitly set the GCC Toolchain for Clang.
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
-    KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
-    KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
-    KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
+  KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
+  KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
+  KOKKOS_CXXFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN) -DKOKKOS_CUDA_CLANG_WORKAROUND -DKOKKOS_CUDA_USE_LDG_INTRINSIC
+  KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
 endif
 
-#With Cygwin functions such as fdopen and fileno are not defined 
-#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
-#though. So we hard undefine it here. Not sure if that has any bad side effects
-#This is needed for gtest actually, not for Kokkos itself!
+# With Cygwin, functions such as fdopen and fileno are not defined
+# when strict ANSI is enabled, and strict ANSI gets enabled by --std=c++11.
+# So we hard-undefine it here. Not sure whether that has any bad side effects.
+# This is actually needed for gtest, not for Kokkos itself!
 ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
   KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
 endif
 
-# Setting up dependencies
+# Setting up dependencies.
 
 KokkosCore_config.h:
 
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
index a48a5f6eb7ea78712b3f6caf695745b4ef18c043..54cacb741b4f35a0033d8de0e57ded9d4dab0a00 100644
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@@ -18,6 +18,8 @@ Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
 Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
 Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -43,11 +45,11 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
-Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
+Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@@ -59,4 +61,3 @@ endif
 
 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
-
diff --git a/lib/kokkos/README b/lib/kokkos/README
index 7ebde23a1fdbc0bff5f62c025e890b204edec591..257a2e5db475dea8c89f1468c42432614c909762 100644
--- a/lib/kokkos/README
+++ b/lib/kokkos/README
@@ -45,31 +45,39 @@ Primary tested compilers on X86 are:
   GCC 4.8.4
   GCC 4.9.2
   GCC 5.1.0
+  GCC 5.2.0
   Intel 14.0.4
   Intel 15.0.2
   Intel 16.0.1
   Intel 17.0.098
+  Intel 17.1.132
   Clang 3.5.2
   Clang 3.6.1
+  Clang 3.7.1
+  Clang 3.8.1
   Clang 3.9.0
+  PGI 17.1
 
 Primary tested compilers on Power 8 are:
   GCC 5.4.0 (OpenMP,Serial)
   IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
 
 Primary tested compilers on Intel KNL are:
+   GCC 6.2.0
    Intel 16.2.181 (with gcc 4.7.2)
    Intel 17.0.098 (with gcc 4.7.2)
+   Intel 17.1.132 (with gcc 4.9.3)
+   Intel 17.2.174 (with gcc 4.9.3)
+   Intel 18.0.061 (beta) (with gcc 4.9.3)
 
 Secondary tested compilers are:
-  CUDA 7.0 (with gcc 4.7.2)
-  CUDA 7.5 (with gcc 4.7.2)
+  CUDA 7.0 (with gcc 4.8.4)
+  CUDA 7.5 (with gcc 4.8.4)
   CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
   CUDA/Clang 8.0 using Clang/Trunk compiler
 
 Other compilers working:
   X86:
-   PGI 15.4
    Cygwin 2.1.0 64bit with gcc 4.9.3
 
 Known non-working combinations:
diff --git a/lib/kokkos/algorithms/cmake/Dependencies.cmake b/lib/kokkos/algorithms/cmake/Dependencies.cmake
index 1d71d8af341181f689a6a8bf63036b67584cb138..c36b62523fadb628e970b6eccf57a9caaa317f1e 100644
--- a/lib/kokkos/algorithms/cmake/Dependencies.cmake
+++ b/lib/kokkos/algorithms/cmake/Dependencies.cmake
@@ -1,5 +1,5 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_REQUIRED_PACKAGES KokkosCore
+  LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
   LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
   TEST_OPTIONAL_TPLS CUSPARSE
   )
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index d376173bf183615e29f66bbecf6bd42cd1134a9e..bd73582362eed46161ee0ac0cf36fec4d5178129 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -547,7 +547,7 @@ namespace Kokkos {
 
     KOKKOS_INLINE_FUNCTION
     Random_XorShift64 (uint64_t state, int state_idx = 0)
-     : state_(state),state_idx_(state_idx){}
+     : state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
 
     KOKKOS_INLINE_FUNCTION
     uint32_t urand() {
@@ -719,6 +719,9 @@ namespace Kokkos {
     }
 
     void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
+
       num_states_ = num_states;
 
       locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
@@ -968,8 +971,9 @@ namespace Kokkos {
 
     inline
     void init(uint64_t seed, int num_states) {
+      if(seed==0)
+        seed = uint64_t(1318319);
       num_states_ = num_states;
-
       locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
       state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
       p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
index 5b8c65fee1869c25681567036314d25beab9a5f2..237de751fe4b30afa1abcf475ca8af8c52cea7ab 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -53,69 +53,122 @@ namespace Kokkos {
 
   namespace Impl {
 
-  template<class ValuesViewType, int Rank=ValuesViewType::Rank>
+  template< class DstViewType , class SrcViewType
+          , int Rank = DstViewType::Rank >
   struct CopyOp;
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,1> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,1> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
       dst(i_dst) = src(i_src);
     }
   };
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,2> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,2> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
-      for(int j = 0;j< (int) dst.dimension_1(); j++)
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0;j< (int) dst.extent(1); j++)
         dst(i_dst,j) = src(i_src,j);
     }
   };
 
-  template<class ValuesViewType>
-  struct CopyOp<ValuesViewType,3> {
-    template<class DstType, class SrcType>
+  template< class DstViewType , class SrcViewType >
+  struct CopyOp<DstViewType,SrcViewType,3> {
     KOKKOS_INLINE_FUNCTION
-    static void copy(DstType& dst, size_t i_dst,
-                     SrcType& src, size_t i_src ) {
-      for(int j = 0; j<dst.dimension_1(); j++)
-        for(int k = 0; k<dst.dimension_2(); k++)
+    static void copy(DstViewType const& dst, size_t i_dst,
+                     SrcViewType const& src, size_t i_src ) {
+      for(int j = 0; j<dst.extent(1); j++)
+        for(int k = 0; k<dst.extent(2); k++)
           dst(i_dst,j,k) = src(i_src,j,k);
     }
   };
   }
 
-template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
-         class SizeType = typename KeyViewType::memory_space::size_type>
+//----------------------------------------------------------------------------
+
+template< class KeyViewType
+        , class BinSortOp
+        , class Space = typename KeyViewType::device_type
+        , class SizeType = typename KeyViewType::memory_space::size_type
+        >
 class BinSort {
+public:
 
+  template< class DstViewType , class SrcViewType >
+  struct copy_functor {
 
-public:
-  template<class ValuesViewType, class PermuteViewType, class CopyOp>
-  struct bin_sort_sort_functor {
-    typedef ExecutionSpace execution_space;
-    typedef typename ValuesViewType::non_const_type values_view_type;
-    typedef typename ValuesViewType::const_type const_values_view_type;
-    Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
-                 typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
-    values_view_type sorted_values;
-    typename PermuteViewType::const_type sort_order;
-    bin_sort_sort_functor(const_values_view_type values_, values_view_type  sorted_values_, PermuteViewType sort_order_):
-       values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
+    typedef typename SrcViewType::const_type  src_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    src_view_type   src_values ;
+    int             dst_offset ;
+
+    copy_functor( DstViewType  const & dst_values_
+                , int          const & dst_offset_
+                , SrcViewType  const & src_values_
+                )
+      : dst_values( dst_values_ )
+      , src_values( src_values_ )
+      , dst_offset( dst_offset_ )
+      {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i) const {
+      // printf("copy: dst(%i) src(%i)\n",i+dst_offset,i);
+      copy_op::copy(dst_values,i+dst_offset,src_values,i);
+    }
+  };
+
+  template< class DstViewType
+          , class PermuteViewType
+          , class SrcViewType
+          >
+  struct copy_permute_functor {
+
+    // If SrcViewType is a Kokkos::View, read it through a const RandomAccess view;
+    // otherwise fall back to its plain const_type.
+
+    typedef typename std::conditional
+      < Kokkos::is_view< SrcViewType >::value
+      , Kokkos::View< typename SrcViewType::const_data_type
+                    , typename SrcViewType::array_layout
+                    , typename SrcViewType::device_type
+                    , Kokkos::MemoryTraits<Kokkos::RandomAccess>
+                    >
+      , typename SrcViewType::const_type
+      >::type src_view_type ;
+
+    typedef typename PermuteViewType::const_type  perm_view_type ;
+
+    typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
+
+    DstViewType     dst_values ;
+    perm_view_type  sort_order ;
+    src_view_type   src_values ;
+
+    copy_permute_functor( DstViewType     const & dst_values_
+                        , PermuteViewType const & sort_order_
+                        , SrcViewType     const & src_values_
+                        )
+      : dst_values( dst_values_ )
+      , sort_order( sort_order_ )
+      , src_values( src_values_ )
+      {}
 
     KOKKOS_INLINE_FUNCTION
     void operator() (const int& i)  const {
-      //printf("Sort: %i %i\n",i,sort_order(i));
-      CopyOp::copy(sorted_values,i,values,sort_order(i));
+      // printf("copy_permute: dst(%i) src(%i)\n",i,sort_order(i));
+      copy_op::copy(dst_values,i,src_values,sort_order(i));
     }
   };
 
-  typedef ExecutionSpace execution_space;
+  typedef typename Space::execution_space  execution_space;
   typedef BinSortOp bin_op_type;
 
   struct bin_count_tag {};
@@ -124,84 +177,137 @@ public:
   struct bin_sort_bins_tag {};
 
 public:
+
   typedef SizeType size_type;
   typedef size_type value_type;
 
-  typedef Kokkos::View<size_type*, execution_space> offset_type;
-  typedef Kokkos::View<const int*, execution_space> bin_count_type;
+  typedef Kokkos::View<size_type*, Space> offset_type;
+  typedef Kokkos::View<const int*, Space> bin_count_type;
 
+  typedef typename KeyViewType::const_type  const_key_view_type ;
 
-  typedef Kokkos::View<typename KeyViewType::const_data_type,
-                       typename KeyViewType::array_layout,
-                       typename KeyViewType::memory_space> const_key_view_type;
-  typedef Kokkos::View<typename KeyViewType::const_data_type,
-                       typename KeyViewType::array_layout,
-                       typename KeyViewType::memory_space,
-                       Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
+  // If KeyViewType is a Kokkos::View, read keys through a const RandomAccess view;
+  // otherwise fall back to the plain const_key_view_type.
+
+  typedef typename std::conditional
+    < Kokkos::is_view< KeyViewType >::value
+    , Kokkos::View< typename KeyViewType::const_data_type,
+                    typename KeyViewType::array_layout,
+                    typename KeyViewType::device_type,
+                    Kokkos::MemoryTraits<Kokkos::RandomAccess> >
+    , const_key_view_type
+    >::type const_rnd_key_view_type;
 
   typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
   typedef typename KeyViewType::const_value_type     const_key_scalar;
 
+  typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
+
 private:
+
   const_key_view_type keys;
   const_rnd_key_view_type keys_rnd;
 
 public:
-  BinSortOp bin_op;
 
-  offset_type bin_offsets;
+  BinSortOp             bin_op ;
+  offset_type           bin_offsets ;
+  bin_count_atomic_type bin_count_atomic ;
+  bin_count_type        bin_count_const ;
+  offset_type           sort_order ;
 
-  Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
-  bin_count_type bin_count_const;
-
-  offset_type sort_order;
-
-  bool sort_within_bins;
+  int                   range_begin ;
+  int                   range_end ;
+  bool                  sort_within_bins ;
 
 public:
 
-  // Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
-  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
-          bool sort_within_bins_ = false)
-     :keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
+  BinSort() {}
 
-    bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
+  //----------------------------------------
+  // Constructor: takes the keys, the index range [range_begin_,range_end_) to sort, the binning operator, and optionally whether to sort within bins (default false)
+  BinSort( const_key_view_type  keys_
+         , int                  range_begin_
+         , int                  range_end_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : keys(keys_)
+     , keys_rnd(keys_)
+     , bin_op(bin_op_)
+     , bin_offsets()
+     , bin_count_atomic()
+     , bin_count_const()
+     , sort_order()
+     , range_begin( range_begin_ )
+     , range_end( range_end_ )
+     , sort_within_bins( sort_within_bins_ )
+  {
+    bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
     bin_count_const =  bin_count_atomic;
     bin_offsets =      offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
-    sort_order =       offset_type("PermutationVector",keys.dimension_0());
-    sort_within_bins = sort_within_bins_;
+    sort_order =       offset_type("PermutationVector",range_end-range_begin);
   }
 
+  BinSort( const_key_view_type  keys_
+         , BinSortOp            bin_op_
+         , bool                 sort_within_bins_ = false
+         )
+     : BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
+
+  //----------------------------------------
   // Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if keys changed
   void create_permute_vector() {
-    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag>    (0,keys.dimension_0()),*this);
-    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
+    const size_t len = range_end - range_begin ;
+    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_count_tag>    (0,len),*this);
+    Kokkos::parallel_scan(Kokkos::RangePolicy<execution_space,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
 
     Kokkos::deep_copy(bin_count_atomic,0);
-    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag>  (0,keys.dimension_0()),*this);
+    Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_binning_tag>  (0,len),*this);
 
     if(sort_within_bins)
-      Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
+      Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
   }
 
   // Sort a view with respect ot the first dimension using the permutation array
   template<class ValuesViewType>
-  void sort(ValuesViewType values) {
-    ValuesViewType sorted_values = ValuesViewType("Copy",
-           values.dimension_0(),
-           values.dimension_1(),
-           values.dimension_2(),
-           values.dimension_3(),
-           values.dimension_4(),
-           values.dimension_5(),
-           values.dimension_6(),
-           values.dimension_7());
-
-    parallel_for(values.dimension_0(),
-        bin_sort_sort_functor<ValuesViewType, offset_type,
-                              Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
-
-    deep_copy(values,sorted_values);
+  void sort( ValuesViewType const & values)
+  {
+    typedef
+      Kokkos::View< typename ValuesViewType::data_type,
+                    typename ValuesViewType::array_layout,
+                    typename ValuesViewType::device_type >
+        scratch_view_type ;
+
+    const size_t len = range_end - range_begin ;
+
+    scratch_view_type
+      sorted_values("Scratch",
+                    len,
+                    values.extent(1),
+                    values.extent(2),
+                    values.extent(3),
+                    values.extent(4),
+                    values.extent(5),
+                    values.extent(6),
+                    values.extent(7));
+
+    {
+      copy_permute_functor< scratch_view_type /* DstViewType */
+                          , offset_type       /* PermuteViewType */
+                          , ValuesViewType    /* SrcViewType */
+                          >
+        functor( sorted_values , sort_order , values );
+
+      parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
+
+    {
+      copy_functor< ValuesViewType , scratch_view_type >
+        functor( values , range_begin , sorted_values );
+
+      parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
+    }
   }
 
   // Get the permutation vector
@@ -217,9 +323,11 @@ public:
   bin_count_type get_bin_count() const {return bin_count_const;}
 
 public:
+
   KOKKOS_INLINE_FUNCTION
   void operator() (const bin_count_tag& tag, const int& i) const {
-    bin_count_atomic(bin_op.bin(keys,i))++;
+    const int j = range_begin + i ;
+    bin_count_atomic(bin_op.bin(keys,j))++;
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -232,10 +340,11 @@ public:
 
   KOKKOS_INLINE_FUNCTION
   void operator() (const bin_binning_tag& tag, const int& i)  const {
-    const int bin = bin_op.bin(keys,i);
+    const int j     = range_begin + i ;
+    const int bin   = bin_op.bin(keys,j);
     const int count = bin_count_atomic(bin)++;
 
-    sort_order(bin_offsets(bin) + count) = i;
+    sort_order(bin_offsets(bin) + count) = j ;
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -262,13 +371,19 @@ public:
   }
 };
 
+//----------------------------------------------------------------------------
+
 template<class KeyViewType>
 struct BinOp1D {
-  const int max_bins_;
-  const double mul_;
+  int max_bins_;
+  double mul_;
   typename KeyViewType::const_value_type range_;
   typename KeyViewType::const_value_type min_;
 
+  BinOp1D():max_bins_(0),mul_(0.0),
+            range_(typename KeyViewType::const_value_type()),
+            min_(typename KeyViewType::const_value_type()) {}
+
   //Construct BinOp with number of bins, minimum value and maxuimum value
   BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
                                typename KeyViewType::const_value_type max )
@@ -302,12 +417,14 @@ struct BinOp3D {
   typename KeyViewType::non_const_value_type range_[3];
   typename KeyViewType::non_const_value_type min_[3];
 
+  BinOp3D() {}
+
   BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
                                typename KeyViewType::const_value_type max[] )
   {
-    max_bins_[0] = max_bins__[0]+1;
-    max_bins_[1] = max_bins__[1]+1;
-    max_bins_[2] = max_bins__[2]+1;
+    max_bins_[0] = max_bins__[0];
+    max_bins_[1] = max_bins__[1];
+    max_bins_[2] = max_bins__[2];
     mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
     mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
     mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
@@ -364,7 +481,7 @@ bool try_std_sort(ViewType view) {
   possible  = possible && (ViewType::Rank == 1);
   possible  = possible && (stride[0] == 1);
   if(possible)  {
-   std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
+   std::sort(view.data(),view.data()+view.extent(0));
   }
   return possible;
 }
@@ -386,7 +503,8 @@ struct min_max_functor {
 }
 
 template<class ViewType>
-void sort(ViewType view, bool always_use_kokkos_sort = false) {
+void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
+{
   if(!always_use_kokkos_sort) {
     if(Impl::try_std_sort(view)) return;
   }
@@ -394,14 +512,37 @@ void sort(ViewType view, bool always_use_kokkos_sort = false) {
 
   Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
   Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
-  parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
+  parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
                   Impl::min_max_functor<ViewType>(view),reducer);
   if(result.min_val == result.max_val) return;
-  BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
+  BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
   bin_sort.create_permute_vector();
   bin_sort.sort(view);
 }
 
+template<class ViewType>
+void sort( ViewType view
+         , size_t const begin
+         , size_t const end
+         )
+{
+  typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
+  typedef BinOp1D<ViewType> CompType;
+
+  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+
+  parallel_reduce( range_policy( begin , end )
+                 , Impl::min_max_functor<ViewType>(view),reducer );
+
+  if(result.min_val == result.max_val) return;
+
+  BinSort<ViewType, CompType>
+    bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
+
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view);
+}
 }
 
 #endif
diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
index 03e4fb691ef1a4ae6a7bed6471ccba4e3fd53762..61ffa6f43a39ecbb1640a71de5afb9be33cd10dd 100644
--- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -44,6 +44,7 @@
 
 #include <gtest/gtest.h>
 #include<Kokkos_Core.hpp>
+#include<Kokkos_DynamicView.hpp>
 #include<Kokkos_Random.hpp>
 #include<Kokkos_Sort.hpp>
 
@@ -192,17 +193,81 @@ void test_3D_sort(unsigned int n) {
   double epsilon = 1e-10;
   unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
 
-  printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+  if ( sort_fails )
+    printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+
   ASSERT_EQ(sort_fails,0);
   ASSERT_EQ(equal_sum,1);
 }
 
+//----------------------------------------------------------------------------
+
+template<class ExecutionSpace, typename KeyType>
+void test_dynamic_view_sort(unsigned int n )
+{
+  typedef typename ExecutionSpace::memory_space memory_space ;
+  typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+
+  const size_t upper_bound = 2 * n ;
+
+  typename KeyDynamicViewType::memory_pool
+    pool( memory_space() , 2 * n * sizeof(KeyType) );
+
+  KeyDynamicViewType keys("Keys",pool,upper_bound);
+
+  keys.resize_serial(n);
+
+  KeyViewType keys_view("KeysTmp", n );
+
+  // Test sorting array with all numbers equal
+  Kokkos::deep_copy(keys_view,KeyType(1));
+  Kokkos::Experimental::deep_copy(keys,keys_view);
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
+
+  Kokkos::Experimental::deep_copy(keys,keys_view);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
+
+  Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
+
+  Kokkos::Experimental::deep_copy( keys_view , keys );
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  if ( sort_fails != 0 || equal_sum != 1 ) {
+    std::cout << " N = " << n
+              << " ; sum_before = " << sum_before
+              << " ; sum_after = " << sum_after
+              << " ; ratio = " << ratio
+              << std::endl ;
+  }
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+//----------------------------------------------------------------------------
+
 template<class ExecutionSpace, typename KeyType>
 void test_sort(unsigned int N)
 {
   test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
   test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
   test_3D_sort<ExecutionSpace,KeyType>(N);
+  test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
 }
 
 }
diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper
index cb206cf88b2c4e3a4f289bc919cc272e22749f36..09fa5d500abcdfe718a6d3bb12db5c91fc5ec174 100755
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@@ -140,6 +140,9 @@ do
   #strip of pedantic because it produces endless warnings about #LINE added by the preprocessor
   -pedantic|-Wpedantic|-ansi)
     ;;
+  #strip -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C"
+  -Woverloaded-virtual)
+    ;;
   #strip -Xcompiler because we add it
   -Xcompiler)
     if [ $first_xcompiler_arg -eq 1 ]; then
@@ -190,7 +193,7 @@ do
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
   #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
-  *.dylib)
+  @*|*.dylib)
     object_files="$object_files -Xlinker $1"
     object_files_xlinker="$object_files_xlinker -Xlinker $1"
     ;;
diff --git a/lib/kokkos/cmake/deps/QTHREAD.cmake b/lib/kokkos/cmake/deps/QTHREADS.cmake
similarity index 98%
rename from lib/kokkos/cmake/deps/QTHREAD.cmake
rename to lib/kokkos/cmake/deps/QTHREADS.cmake
index 994b72b20096f4462beab51d19e4410cd73bf05b..c312f2590bcd29197a0cf3fbd5e0b484579a09c2 100644
--- a/lib/kokkos/cmake/deps/QTHREAD.cmake
+++ b/lib/kokkos/cmake/deps/QTHREADS.cmake
@@ -63,8 +63,7 @@
 #    Source:        https://code.google.com/p/qthreads
 #
 
-TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
   REQUIRED_HEADERS qthread.h
   REQUIRED_LIBS_NAMES "qthread"
   )
-
diff --git a/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake b/lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
similarity index 98%
rename from lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
rename to lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
index 994b72b20096f4462beab51d19e4410cd73bf05b..c312f2590bcd29197a0cf3fbd5e0b484579a09c2 100644
--- a/lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
+++ b/lib/kokkos/cmake/tpls/FindTPLQTHREADS.cmake
@@ -63,8 +63,7 @@
 #    Source:        https://code.google.com/p/qthreads
 #
 
-TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
+TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
   REQUIRED_HEADERS qthread.h
   REQUIRED_LIBS_NAMES "qthread"
   )
-
diff --git a/lib/kokkos/config/kokkos_dev/config-core-all.sh b/lib/kokkos/config/kokkos_dev/config-core-all.sh
index fa588c778f68330ff130364e9425d5a6aefa357c..d4fb25a8e139c315a862306173a0b1d2a07e7cbd 100755
--- a/lib/kokkos/config/kokkos_dev/config-core-all.sh
+++ b/lib/kokkos/config/kokkos_dev/config-core-all.sh
@@ -6,7 +6,7 @@
 #-----------------------------------------------------------------------------
 # Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
 #
-#   Cuda, OpenMP, Threads, Qthread, hwloc
+#   Cuda, OpenMP, Threads, Qthreads, hwloc
 #
 # module loaded on 'kokkos-dev.sandia.gov' for this build
 #
@@ -82,13 +82,13 @@ CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
 CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
 
 #-----------------------------------------------------------------------------
-# Qthread
+# Qthreads
 
-QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
+QTHREADS_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
 
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
-CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREADS:BOOL=ON"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_INCLUDE_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/include"
+CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_LIBRARY_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/lib"
 
 #-----------------------------------------------------------------------------
 # C++11
@@ -108,6 +108,3 @@ rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
 echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
 
 cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
-
-#-----------------------------------------------------------------------------
-
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index 446cbb021610164980cc6dd0fdced42b162422d7..9eaecb5031b1328989e114b50a86ac07c78b8e29 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -4,4 +4,5 @@ tag:  2.01.10    date: 09:27:2016    master: e4119325    develop: e6cda11e
 tag:  2.02.00    date: 10:30:2016    master: 6c90a581    develop: ca3dd56e
 tag:  2.02.01    date: 11:01:2016    master: 9c698c86    develop: b0072304
 tag:  2.02.07    date: 12:16:2016    master: 4b4cc4ba    develop: 382c0966
-tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6 
+tag:  2.02.15    date: 02:10:2017    master: 8c64cd93    develop: 28dea8b6
+tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641 
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index 2c15e951ba25f4831c888fa731b9e25954ee0ead..6909606643df6b83c2dc77c2469768e02a13844d 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -6,29 +6,29 @@
 
 set -o pipefail
 
-# Determine current machine
+# Determine current machine.
 
 MACHINE=""
 HOSTNAME=$(hostname)
 PROCESSOR=`uname -p`
 
 if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
-    MACHINE=white
+  MACHINE=white
 elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
-    MACHINE=bowman
+  MACHINE=bowman
 elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
-    if [[ "$PROCESSOR" = "aarch64" ]]; then
-        MACHINE=sullivan
-    else
-        MACHINE=shepard
-    fi
+  if [[ "$PROCESSOR" = "aarch64" ]]; then
+    MACHINE=sullivan
+  else
+    MACHINE=shepard
+  fi
 elif [[ "$HOSTNAME" =~ apollo ]]; then
-    MACHINE=apollo
+  MACHINE=apollo
 elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
-    MACHINE=sems
+  MACHINE=sems
 else
-    echo "Unrecognized machine" >&2
-    exit 1
+  echo "Unrecognized machine" >&2
+  exit 1
 fi
 
 GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
@@ -45,10 +45,11 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
 INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
 CUDA_WARNING_FLAGS=""
 
-# Default. Machine specific can override
+# Default. Machine specific can override.
 DEBUG=False
 ARGS=""
 CUSTOM_BUILD_LIST=""
+QTHREADS_PATH=""
 DRYRUN=False
 BUILD_ONLY=False
 declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
@@ -60,86 +61,90 @@ PRINT_HELP=False
 OPT_FLAG=""
 KOKKOS_OPTIONS=""
 
-
 #
-# Handle arguments
+# Handle arguments.
 #
 
 while [[ $# > 0 ]]
 do
-key="$1"
-case $key in
---kokkos-path*)
-KOKKOS_PATH="${key#*=}"
-;;
---build-list*)
-CUSTOM_BUILD_LIST="${key#*=}"
-;;
---debug*)
-DEBUG=True
-;;
---build-only*)
-BUILD_ONLY=True
-;;
---test-script*)
-TEST_SCRIPT=True
-;;
---skip-hwloc*)
-SKIP_HWLOC=True
-;;
---num*)
-NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
-;;
---dry-run*)
-DRYRUN=True
-;;
---spot-check*)
-SPOT_CHECK=True
-;;
---arch*)
-ARCH_FLAG="--arch=${key#*=}"
-;;
---opt-flag*)
-OPT_FLAG="${key#*=}"
-;;
---with-cuda-options*)
-KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
-;;
---help*)
-PRINT_HELP=True
-;;
-*)
-# args, just append
-ARGS="$ARGS $1"
-;;
-esac
-shift
+  key="$1"
+
+  case $key in
+    --kokkos-path*)
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
+    --build-list*)
+      CUSTOM_BUILD_LIST="${key#*=}"
+      ;;
+    --debug*)
+      DEBUG=True
+      ;;
+    --build-only*)
+      BUILD_ONLY=True
+      ;;
+    --test-script*)
+      TEST_SCRIPT=True
+      ;;
+    --skip-hwloc*)
+      SKIP_HWLOC=True
+      ;;
+    --num*)
+      NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
+      ;;
+    --dry-run*)
+      DRYRUN=True
+      ;;
+    --spot-check*)
+      SPOT_CHECK=True
+      ;;
+    --arch*)
+      ARCH_FLAG="--arch=${key#*=}"
+      ;;
+    --opt-flag*)
+      OPT_FLAG="${key#*=}"
+      ;;
+    --with-cuda-options*)
+      KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
+      ;;
+    --help*)
+      PRINT_HELP=True
+      ;;
+    *)
+      # args, just append
+      ARGS="$ARGS $1"
+      ;;
+  esac
+
+  shift
 done
 
 SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
 
-# set kokkos path
+# Set kokkos path: default to the repo containing this script, else make the given path absolute.
 if [ -z "$KOKKOS_PATH" ]; then
-    KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
+  KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
 else
-    # Ensure KOKKOS_PATH is abs path
-    KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+  # Ensure KOKKOS_PATH is abs path; quoted so paths with spaces work, and fail fast if cd fails.
+  KOKKOS_PATH=$( cd "$KOKKOS_PATH" && pwd ) || { echo "Invalid --kokkos-path" >&2; exit 1; }
 fi
 
 #
-# Machine specific config
+# Machine specific config.
 #
 
 if [ "$MACHINE" = "sems" ]; then
-    source /projects/sems/modulefiles/utils/sems-modules-init.sh
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
 
-    BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
-    CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
-    CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+  BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG=""
-    fi 
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG=""
+  fi
 
   if [ "$SPOT_CHECK" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
@@ -153,120 +158,118 @@ if [ "$MACHINE" = "sems" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
     COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
                "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
                "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
-               "clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
                "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
                "cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
     )
   fi
-
 elif [ "$MACHINE" = "white" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
-    IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
-    CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
+  CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
 
-    # Don't do pthread on white
-    GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
+  # Don't do pthread on white.
+  GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
-               "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
-               "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
-    )
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=Power8,Kepler37"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
+             "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
+  )
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=Power8,Kepler37"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "bowman" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
 
-    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+  OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-    )
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=KNL"
-    fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=KNL"
+  fi
 
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "sullivan" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=96
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=96
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS")
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS")
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=ARMv8-ThunderX"
-    fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=ARMv8-ThunderX"
+  fi
 
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "shepard" ]; then
-    source /etc/profile.d/modules.sh
-    SKIP_HWLOC=True
-    export SLURM_TASKS_PER_NODE=32
+  source /etc/profile.d/modules.sh
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=32
 
-    BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
+  BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
 
-    OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
+  OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
 
-    # Format: (compiler module-list build-list exe-name warning-flag)
-    COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-               "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
-    )
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+             "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
+  )
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=HSW"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=HSW"
+  fi
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
 elif [ "$MACHINE" = "apollo" ]; then
-    source /projects/sems/modulefiles/utils/sems-modules-init.sh
-    module use /home/projects/modulefiles/local/x86-64
-    module load kokkos-env
+  source /projects/sems/modulefiles/utils/sems-modules-init.sh
+  module use /home/projects/modulefiles/local/x86-64
+  module load kokkos-env
 
-    module load sems-git
-    module load sems-tex
-    module load sems-cmake/3.5.2
-    module load sems-gdb
+  module load sems-git
+  module load sems-tex
+  module load sems-cmake/3.5.2
+  module load sems-gdb
 
-    SKIP_HWLOC=True
+  SKIP_HWLOC=True
 
-    BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
-    CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
-    CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
+  BASE_MODULE_LIST="sems-env,kokkos-env,sems-<COMPILER_NAME>/<COMPILER_VERSION>,kokkos-hwloc/1.10.1/base"
+  CUDA_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/4.8.4,kokkos-hwloc/1.10.1/base"
+  CUDA8_MODULE_LIST="sems-env,kokkos-env,kokkos-<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0,kokkos-hwloc/1.10.1/base"
 
-    CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
-    NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
+  CLANG_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,cuda/8.0.44"
+  NVCC_MODULE_LIST="sems-env,kokkos-env,sems-git,sems-cmake/3.5.2,<COMPILER_NAME>/<COMPILER_VERSION>,sems-gcc/5.3.0"
 
-    BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
-    BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
-    BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
+  BUILD_LIST_CUDA_NVCC="Cuda_Serial,Cuda_OpenMP"
+  BUILD_LIST_CUDA_CLANG="Cuda_Serial,Cuda_Pthread"
+  BUILD_LIST_CLANG="Serial,Pthread,OpenMP"
 
   if [ "$SPOT_CHECK" = "True" ]; then
     # Format: (compiler module-list build-list exe-name warning-flag)
@@ -297,16 +300,16 @@ elif [ "$MACHINE" = "apollo" ]; then
     )
   fi
 
-    if [ -z "$ARCH_FLAG" ]; then
-      ARCH_FLAG="--arch=SNB,Kepler35"
-    fi
-    NUM_JOBS_TO_RUN_IN_PARALLEL=2
-else
-    echo "Unhandled machine $MACHINE" >&2
-    exit 1
-fi
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=SNB,Kepler35"
+  fi
 
+  NUM_JOBS_TO_RUN_IN_PARALLEL=2
 
+else
+  echo "Unhandled machine $MACHINE" >&2
+  exit 1
+fi
 
 export OMP_NUM_THREADS=4
 
@@ -315,119 +318,149 @@ declare -i NUM_RESULTS_TO_KEEP=7
 RESULT_ROOT_PREFIX=TestAll
 
 if [ "$PRINT_HELP" = "True" ]; then
-echo "test_all_sandia <ARGS> <OPTIONS>:"
-echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
-echo "    Defaults to root repo containing this script"
-echo "--debug: Run tests in debug. Defaults to False"
-echo "--test-script: Test this script, not Kokkos"
-echo "--skip-hwloc: Do not do hwloc tests"
-echo "--num=N: Number of jobs to run in parallel"
-echo "--spot-check: Minimal test set to issue pull request"
-echo "--dry-run: Just print what would be executed"
-echo "--build-only: Just do builds, don't run anything"
-echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
-echo "--arch=ARCHITECTURE: overwrite architecture flags"
-echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
-echo "--build-list=BUILD,BUILD,BUILD..."
-echo "    Provide a comma-separated list of builds instead of running all builds"
-echo "    Valid items:"
-echo "      OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
-echo "      Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
-echo ""
-
-echo "ARGS: list of expressions matching compilers to test"
-echo "  supported compilers sems"
-for COMPILER_DATA in "${COMPILERS[@]}"; do
+  echo "test_all_sandia <ARGS> <OPTIONS>:"
+  echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+  echo "    Defaults to root repo containing this script"
+  echo "--debug: Run tests in debug. Defaults to False"
+  echo "--test-script: Test this script, not Kokkos"
+  echo "--skip-hwloc: Do not do hwloc tests"
+  echo "--num=N: Number of jobs to run in parallel"
+  echo "--spot-check: Minimal test set to issue pull request"
+  echo "--dry-run: Just print what would be executed"
+  echo "--build-only: Just do builds, don't run anything"
+  echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
+  echo "--arch=ARCHITECTURE: overwrite architecture flags"
+  echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
+  echo "--build-list=BUILD,BUILD,BUILD..."
+  echo "    Provide a comma-separated list of builds instead of running all builds"
+  echo "    Valid items:"
+  echo "      OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
+  echo "      Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
+  echo ""
+
+  echo "ARGS: list of expressions matching compilers to test"
+  echo "  supported compilers sems"
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
     ARR=($COMPILER_DATA)
     COMPILER=${ARR[0]}
     echo "    $COMPILER"
-done
-echo ""
-
-echo "Examples:"
-echo "  Run all tests"
-echo "  % test_all_sandia"
-echo ""
-echo "  Run all gcc tests"
-echo "  % test_all_sandia gcc"
-echo ""
-echo "  Run all gcc/4.7.2 and all intel tests"
-echo "  % test_all_sandia gcc/4.7.2 intel"
-echo ""
-echo "  Run all tests in debug"
-echo "  % test_all_sandia --debug"
-echo ""
-echo "  Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
-echo "  % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
-echo ""
-echo "If you want to kill the tests, do:"
-echo "  hit ctrl-z"
-echo "  % kill -9 %1"
-echo
-exit 0
+  done
+  echo ""
+
+  echo "Examples:"
+  echo "  Run all tests"
+  echo "  % test_all_sandia"
+  echo ""
+  echo "  Run all gcc tests"
+  echo "  % test_all_sandia gcc"
+  echo ""
+  echo "  Run all gcc/4.7.2 and all intel tests"
+  echo "  % test_all_sandia gcc/4.7.2 intel"
+  echo ""
+  echo "  Run all tests in debug"
+  echo "  % test_all_sandia --debug"
+  echo ""
+  echo "  Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
+  echo "  % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
+  echo ""
+  echo "If you want to kill the tests, do:"
+  echo "  hit ctrl-z"
+  echo "  % kill -9 %1"
+  echo
+  exit 0
 fi
 
-# set build type
+# Set build type.
 if [ "$DEBUG" = "True" ]; then
-    BUILD_TYPE=debug
+  BUILD_TYPE=debug
 else
-    BUILD_TYPE=release
+  BUILD_TYPE=release
 fi
 
-# If no args provided, do all compilers
+# If no args provided, do all compilers.
 if [ -z "$ARGS" ]; then
-    ARGS='?'
+  ARGS='?'
 fi
 
-# Process args to figure out which compilers to test
+# Process args to figure out which compilers to test.
 COMPILERS_TO_TEST=""
+
 for ARG in $ARGS; do
-    for COMPILER_DATA in "${COMPILERS[@]}"; do
-        ARR=($COMPILER_DATA)
-        COMPILER=${ARR[0]}
-        if [[ "$COMPILER" = $ARG* ]]; then
-            if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
-                COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
-            else
-                echo "Tried to add $COMPILER twice"
-            fi
-        fi
-    done
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    COMPILER=${ARR[0]}
+
+    if [[ "$COMPILER" = $ARG* ]]; then
+      if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
+        COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
+      else
+        echo "Tried to add $COMPILER twice"
+      fi
+    fi
+  done
 done
 
+# Check if Qthreads build requested: in --build-list if given, else in any COMPILERS entry's build list.
+HAVE_QTHREADS_BUILD="False"
+if [ -n "$CUSTOM_BUILD_LIST" ]; then
+  if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
+    HAVE_QTHREADS_BUILD="True"
+  fi
+else
+  for COMPILER_DATA in "${COMPILERS[@]}"; do
+    ARR=($COMPILER_DATA)
+    BUILD_LIST=${ARR[2]}
+    if [[ "$BUILD_LIST" = *Qthreads* ]]; then
+      HAVE_QTHREADS_BUILD="True"
+    fi
+  done
+fi
+
+# Ensure Qthreads path is set if Qthreads build is requested.
+if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
+  if [ -z "$QTHREADS_PATH" ]; then
+    echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
+    exit 1
+  else
+    # Strip trailing slashes from path; quoted so whitespace and glob characters survive intact.
+    QTHREADS_PATH=$(printf '%s\n' "$QTHREADS_PATH" | sed 's/\/*$//')
+  fi
+fi
+
 #
-# Functions
+# Functions.
 #
 
 # get_compiler_name <COMPILER>
 get_compiler_name() {
-    echo $1 | cut -d/ -f1
+  echo $1 | cut -d/ -f1
 }
 
 # get_compiler_version <COMPILER>
 get_compiler_version() {
-    echo $1 | cut -d/ -f2
+  echo $1 | cut -d/ -f2
 }
 
-# Do not call directly
+# get_compiler_data <COMPILER> <ITEM>: print field ITEM of COMPILER's entry in COMPILERS. Do not call directly.
 get_compiler_data() {
-    local compiler=$1
-    local item=$2
-    local compiler_name=$(get_compiler_name $compiler)
-    local compiler_vers=$(get_compiler_version $compiler)
-
-    local compiler_data
-    for compiler_data in "${COMPILERS[@]}" ; do
-        local arr=($compiler_data)
-        if [ "$compiler" = "${arr[0]}" ]; then
-            echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
-            return 0
-        fi
-    done
-
-    # Not found
-    echo "Unreconized compiler $compiler" >&2
-    exit 1
+  local compiler=$1
+  local item=$2
+  local compiler_name=$(get_compiler_name $compiler)
+  local compiler_vers=$(get_compiler_version $compiler)
+
+  local compiler_data
+  for compiler_data in "${COMPILERS[@]}" ; do
+    local arr=($compiler_data)
+
+    if [ "$compiler" = "${arr[0]}" ]; then
+      echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
+      return 0
+    fi
+  done
+
+  # Not found.
+  echo "Unrecognized compiler $compiler" >&2
+  exit 1
 }
 
 #
@@ -435,227 +468,232 @@ get_compiler_data() {
 #
 
 get_compiler_modules() {
-    get_compiler_data $1 1
+  get_compiler_data $1 1
 }
 
 get_compiler_build_list() {
-    get_compiler_data $1 2
+  get_compiler_data $1 2
 }
 
 get_compiler_exe_name() {
-    get_compiler_data $1 3
+  get_compiler_data $1 3
 }
 
 get_compiler_warning_flags() {
-    get_compiler_data $1 4
+  get_compiler_data $1 4
 }
 
 run_cmd() {
-    echo "RUNNING: $*"
-    if [ "$DRYRUN" != "True" ]; then
-	eval "$* 2>&1"
-    fi
+  echo "RUNNING: $*"
+  if [ "$DRYRUN" != "True" ]; then
+    eval "$* 2>&1"
+  fi
 }
 
 # report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
 report_and_log_test_result() {
-    # Use sane var names
-    local success=$1; local desc=$2; local comment=$3;
+  # Use sane var names.
+  local success=$1; local desc=$2; local comment=$3;
 
-    if [ "$success" = "0" ]; then
-	echo "  PASSED $desc"
-        echo $comment > $PASSED_DIR/$desc
-    else
-        # For failures, comment should be the name of the phase that failed
-	echo "  FAILED $desc" >&2
-        echo $comment > $FAILED_DIR/$desc
-        cat ${desc}.${comment}.log
-    fi
+  if [ "$success" = "0" ]; then
+    echo "  PASSED $desc"
+    echo $comment > $PASSED_DIR/$desc
+  else
+    # For failures, comment should be the name of the phase that failed.
+    echo "  FAILED $desc" >&2
+    echo $comment > $FAILED_DIR/$desc
+    cat ${desc}.${comment}.log
+  fi
 }
 
 setup_env() {
-    local compiler=$1
-    local compiler_modules=$(get_compiler_modules $compiler)
-
-    module purge
-
-    local mod
-    for mod in $compiler_modules; do
-        echo "Loading module $mod"
-	module load $mod 2>&1
-        # It is ridiculously hard to check for the success of a loaded
-        # module. Module does not return error codes and piping to grep
-        # causes module to run in a subshell.
-        module list 2>&1 | grep "$mod" >& /dev/null || return 1
-    done
-
-    return 0
+  local compiler=$1
+  local compiler_modules=$(get_compiler_modules $compiler)
+
+  module purge
+
+  local mod
+  for mod in $compiler_modules; do
+    echo "Loading module $mod"
+    module load $mod 2>&1
+    # It is ridiculously hard to check for the success of a loaded
+    # module. Module does not return error codes and piping to grep
+    # causes module to run in a subshell.
+    module list 2>&1 | grep "$mod" >& /dev/null || return 1
+  done
+
+  return 0
 }
 
 # single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
 single_build_and_test() {
-    # Use sane var names
-    local compiler=$1; local build=$2; local build_type=$3;
+  # Use sane var names.
+  local compiler=$1; local build=$2; local build_type=$3;
+
+  # Set up env.
+  mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
+  cd $ROOT_DIR/$compiler/"${build}-$build_type"
+  local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
+  setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
 
-    # set up env
-    mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
-    cd $ROOT_DIR/$compiler/"${build}-$build_type"
-    local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
-    setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+  # Set up flags.
+  local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
+  local compiler_exe=$(get_compiler_exe_name $compiler)
 
-    # Set up flags
-    local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
-    local compiler_exe=$(get_compiler_exe_name $compiler)
+  if [[ "$build_type" = hwloc* ]]; then
+    local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
+  fi
 
+  if [[ "$build" = *Qthreads* ]]; then
     if [[ "$build_type" = hwloc* ]]; then
-        local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
+      local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
+    else
+      local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
     fi
+  fi
 
-    if [[ "$OPT_FLAG" = "" ]]; then
-      OPT_FLAG="-O3"
-    fi
+  if [[ "$OPT_FLAG" = "" ]]; then
+    OPT_FLAG="-O3"
+  fi
 
-    if [[ "$build_type" = *debug* ]]; then
-        local extra_args="$extra_args --debug"
-        local cxxflags="-g $compiler_warning_flags"
-    else
-        local cxxflags="$OPT_FLAG $compiler_warning_flags"
-    fi
+  if [[ "$build_type" = *debug* ]]; then
+    local extra_args="$extra_args --debug"
+    local cxxflags="-g $compiler_warning_flags"
+  else
+    local cxxflags="$OPT_FLAG $compiler_warning_flags"
+  fi
 
-    if [[ "$compiler" == cuda* ]]; then
-        cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
-        export TMPDIR=$(pwd)
-    fi
+  if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
+    local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
+  fi
 
-    if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
-        local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
-    fi
+  echo "  Starting job $desc"
 
-    echo "  Starting job $desc"
+  local comment="no_comment"
 
-    local comment="no_comment"
+  if [ "$TEST_SCRIPT" = "True" ]; then
+    local rand=$[ 1 + $[ RANDOM % 10 ]]
+    sleep $rand
 
-    if [ "$TEST_SCRIPT" = "True" ]; then
-        local rand=$[ 1 + $[ RANDOM % 10 ]]
-        sleep $rand
-        if [ $rand -gt 5 ]; then
-            run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
-        fi
-    else
-        run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
-        local -i build_start_time=$(date +%s)
-        run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
-        local -i build_end_time=$(date +%s)
-        comment="build_time=$(($build_end_time-$build_start_time))"
-        if [[ "$BUILD_ONLY" == False ]]; then
-            run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
-            local -i run_end_time=$(date +%s)
-            comment="$comment run_time=$(($run_end_time-$build_end_time))"
-        fi
+    if [ $rand -gt 5 ]; then
+      run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
     fi
+  else
+    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
+    local -i build_start_time=$(date +%s)
+    run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    local -i build_end_time=$(date +%s)
+    comment="build_time=$(($build_end_time-$build_start_time))"
+
+    if [[ "$BUILD_ONLY" == False ]]; then
+      run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
+      local -i run_end_time=$(date +%s)
+      comment="$comment run_time=$(($run_end_time-$build_end_time))"
+    fi
+  fi
 
-    report_and_log_test_result 0 $desc "$comment"
+  report_and_log_test_result 0 $desc "$comment"
 
-    return 0
+  return 0
 }
 
 # wait_for_jobs <NUM-JOBS>
 wait_for_jobs() {
-    local -i max_jobs=$1
-    local -i num_active_jobs=$(jobs | wc -l)
-    while [ $num_active_jobs -ge $max_jobs ]
-    do
-        sleep 1
-        num_active_jobs=$(jobs | wc -l)
-        jobs >& /dev/null
-    done
+  local -i max_jobs=$1
+  local -i num_active_jobs=$(jobs | wc -l)
+  while [ $num_active_jobs -ge $max_jobs ]
+  do
+    sleep 1
+    num_active_jobs=$(jobs | wc -l)
+    jobs >& /dev/null
+  done
 }
 
 # run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
 run_in_background() {
-    local compiler=$1
-
-    local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
-    # don't override command line input
-    # if [[ "$BUILD_ONLY" == True ]]; then
-        # num_jobs=8
-    # else
-        if [[ "$compiler" == cuda* ]]; then
-            num_jobs=1
-        fi
-    # fi
-    wait_for_jobs $num_jobs
-
-    single_build_and_test $* &
+  local compiler=$1
+
+  local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
+  # Don't override command line input.
+  # if [[ "$BUILD_ONLY" == True ]]; then
+  #   num_jobs=8
+  # else
+    if [[ "$compiler" == cuda* ]]; then
+      num_jobs=1
+    fi
+  # fi
+  wait_for_jobs $num_jobs
+
+  single_build_and_test $* &
 }
 
 # build_and_test_all <COMPILER>
 build_and_test_all() {
-    # Get compiler data
-    local compiler=$1
-    if [ -z "$CUSTOM_BUILD_LIST" ]; then
-	local compiler_build_list=$(get_compiler_build_list $compiler)
-    else
-	local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
-    fi
+  # Get compiler data.
+  local compiler=$1
+  if [ -z "$CUSTOM_BUILD_LIST" ]; then
+    local compiler_build_list=$(get_compiler_build_list $compiler)
+  else
+    local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
+  fi
 
-    # do builds
-    local build
-    for build in $compiler_build_list
-    do
-	run_in_background $compiler $build $BUILD_TYPE
+  # Do builds.
+  local build
+  for build in $compiler_build_list
+  do
+    run_in_background $compiler $build $BUILD_TYPE
 
-        # If not cuda, do a hwloc test too
-        if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
-            run_in_background $compiler $build "hwloc-$BUILD_TYPE"
-        fi
-    done
+    # If not cuda, do a hwloc test too.
+    if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
+      run_in_background $compiler $build "hwloc-$BUILD_TYPE"
+    fi
+  done
 
-    return 0
+  return 0
 }
 
 get_test_root_dir() {
-    local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
-    local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
-    local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
+  local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
+  local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
+  local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
 
-    if [ $num_to_delete -gt 0 ]; then
-        /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
-    fi
+  if [ $num_to_delete -gt 0 ]; then
+    /bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
+  fi
 
-    echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
+  echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
 }
 
 wait_summarize_and_exit() {
-    wait_for_jobs 1
-
-    echo "#######################################################"
-    echo "PASSED TESTS"
-    echo "#######################################################"
-
-    local passed_test
-    for passed_test in $(\ls -1 $PASSED_DIR | sort)
-    do
-        echo $passed_test $(cat $PASSED_DIR/$passed_test)
-    done
-
-    echo "#######################################################"
-    echo "FAILED TESTS"
-    echo "#######################################################"
-
-    local failed_test
-    local -i rv=0
-    for failed_test in $(\ls -1 $FAILED_DIR | sort)
-    do
-        echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
-        rv=$rv+1
-    done
-
-    exit $rv
+  wait_for_jobs 1
+
+  echo "#######################################################"
+  echo "PASSED TESTS"
+  echo "#######################################################"
+
+  local passed_test
+  for passed_test in $(\ls -1 $PASSED_DIR | sort)
+  do
+    echo $passed_test $(cat $PASSED_DIR/$passed_test)
+  done
+
+  echo "#######################################################"
+  echo "FAILED TESTS"
+  echo "#######################################################"
+
+  local failed_test
+  local -i rv=0
+  for failed_test in $(\ls -1 $FAILED_DIR | sort)
+  do
+    echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
+    rv=$rv+1
+  done
+
+  exit $rv
 }
 
 #
-# Main
+# Main.
 #
 
 ROOT_DIR=$(get_test_root_dir)
@@ -669,8 +707,8 @@ mkdir -p $FAILED_DIR
 
 echo "Going to test compilers: " $COMPILERS_TO_TEST
 for COMPILER in $COMPILERS_TO_TEST; do
-    echo "Testing compiler $COMPILER"
-    build_and_test_all $COMPILER
+  echo "Testing compiler $COMPILER"
+  build_and_test_all $COMPILER
 done
 
 wait_summarize_and_exit
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index 3277c007d0845485a57ed7aabfa35202f1b22d1b..53e0eab693afeca7bbe0c164666612dc5ccc36d9 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -60,7 +60,7 @@ class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
 { 
 public:
 
-  typedef ViewTraits< DataType , P ... >  traits ;
+  typedef Kokkos::ViewTraits< DataType , P ... >  traits ;
 
 private:
 
@@ -123,30 +123,41 @@ public:
 
   enum { Rank = 1 };
 
-  KOKKOS_INLINE_FUNCTION constexpr size_t size() const
+  KOKKOS_INLINE_FUNCTION
+  size_t size() const noexcept
     {
-      return
-        Kokkos::Impl::MemorySpaceAccess
-          < Kokkos::Impl::ActiveExecutionMemorySpace
-          , typename traits::memory_space
-          >::accessible 
-        ? // Runtime size is at the end of the chunk pointer array
-          (*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
-          << m_chunk_shift
-        : 0 ;
+      uintptr_t n = 0 ;
+
+      if ( Kokkos::Impl::MemorySpaceAccess
+            < Kokkos::Impl::ActiveExecutionMemorySpace
+            , typename traits::memory_space
+            >::accessible ) {
+        n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
+      }
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      else {
+        Kokkos::Impl::DeepCopy< Kokkos::HostSpace
+                              , typename traits::memory_space
+                              , Kokkos::HostSpace::execution_space >
+          ( & n
+          , reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max )
+          , sizeof(uintptr_t) );
+      }
+#endif
+      return n << m_chunk_shift ;
     }
 
   template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
+  KOKKOS_INLINE_FUNCTION
   size_t extent( const iType & r ) const
     { return r == 0 ? size() : 1 ; }
 
   template< typename iType >
-  KOKKOS_INLINE_FUNCTION constexpr
+  KOKKOS_INLINE_FUNCTION
   size_t extent_int( const iType & r ) const
     { return r == 0 ? size() : 1 ; }
 
-  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
+  KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
   KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
@@ -270,10 +281,18 @@ public:
     }
 
   /** \brief  Resizing in serial can grow or shrink the array size, */
+  template< typename IntType >
   inline
-  void resize_serial( size_t n )
+  typename std::enable_if
+    < std::is_integral<IntType>::value &&
+      Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
+                                     , typename traits::memory_space
+                                     >::accessible
+    >::type
+  resize_serial( IntType const & n )
     {
-      DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
+      typedef typename traits::value_type value_type ;
+      typedef value_type * pointer_type ;
 
       const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
 
@@ -286,8 +305,8 @@ public:
 
       if ( *pc < NC ) {
         while ( *pc < NC ) {
-          m_chunks[*pc] =
-            m_pool.allocate( sizeof(traits::value_type) << m_chunk_shift );
+          m_chunks[*pc] = reinterpret_cast<pointer_type>
+            ( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
           ++*pc ;
         }
       }
@@ -295,12 +314,90 @@ public:
         while ( NC + 1 <= *pc ) {
           --*pc ;        
           m_pool.deallocate( m_chunks[*pc]
-                           , sizeof(traits::value_type) << m_chunk_shift );
+                           , sizeof(value_type) << m_chunk_shift );
           m_chunks[*pc] = 0 ;
         }
       }
     }
 
+  //----------------------------------------
+
+  struct ResizeSerial {
+    memory_pool                    m_pool ;
+    typename traits::value_type ** m_chunks ;
+    uintptr_t                    * m_pc ;
+    uintptr_t                      m_nc ;
+    unsigned                       m_chunk_shift ;  
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( int ) const
+      {
+        typedef typename traits::value_type value_type ;
+        typedef value_type * pointer_type ;
+
+        if ( *m_pc < m_nc ) {
+          while ( *m_pc < m_nc ) {
+            m_chunks[*m_pc] = reinterpret_cast<pointer_type>
+              ( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
+            ++*m_pc ;
+          }
+        }
+        else {
+          while ( m_nc + 1 <= *m_pc ) {
+            --*m_pc ;        
+            m_pool.deallocate( m_chunks[*m_pc]
+                             , sizeof(value_type) << m_chunk_shift );
+            m_chunks[*m_pc] = 0 ;
+          }
+        }
+      }
+
+    ResizeSerial( memory_pool            const & arg_pool
+                , typename traits::value_type ** arg_chunks
+                , uintptr_t                    * arg_pc
+                , uintptr_t                      arg_nc
+                , unsigned                       arg_chunk_shift
+                )
+      : m_pool( arg_pool )
+      , m_chunks( arg_chunks )
+      , m_pc( arg_pc )
+      , m_nc( arg_nc )
+      , m_chunk_shift( arg_chunk_shift )
+      {}
+  };
+
+  template< typename IntType >
+  inline
+  typename std::enable_if
+    < std::is_integral<IntType>::value &&
+      ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
+                                       , typename traits::memory_space
+                                       >::accessible
+    >::type
+  resize_serial( IntType const & n )
+    {
+      const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
+
+      if ( m_chunk_max < NC ) {
+        Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
+      }
+
+      // Must dispatch kernel
+
+      typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
+
+      uintptr_t * const pc =
+        reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
+
+      Kokkos::Impl::ParallelFor<ResizeSerial,Range>
+        closure( ResizeSerial( m_pool, m_chunks, pc, NC, m_chunk_shift )
+               , Range(0,1) );
+
+      closure.execute();
+
+      traits::execution_space::fence();
+    }
+
   //----------------------------------------------------------------------
 
   ~DynamicView() = default ;
@@ -311,15 +408,17 @@ public:
   DynamicView & operator = ( const DynamicView & ) = default ;
 
   template< class RT , class ... RP >
-  KOKKOS_INLINE_FUNCTION
   DynamicView( const DynamicView<RT,RP...> & rhs )
     : m_pool( rhs.m_pool )
     , m_track( rhs.m_track )
-    , m_chunks( rhs.m_chunks )
+    , m_chunks( (typename traits::value_type **) rhs.m_chunks )
     , m_chunk_shift( rhs.m_chunk_shift )
     , m_chunk_mask( rhs.m_chunk_mask )
     , m_chunk_max( rhs.m_chunk_max )
     {
+      typedef typename DynamicView<RT,RP...>::traits  SrcTraits ;
+      typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
     }
 
   //----------------------------------------------------------------------
@@ -400,8 +499,6 @@ public:
     , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
     , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
     {
-      DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
-
       // A functor to deallocate all of the chunks upon final destruction
 
       typedef typename traits::memory_space  memory_space ;
diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
index 8646d277921aff5c71b70c48d768ee39944b3455..193f1bc334dd76177e3823f6decee9dbd71b137e 100644
--- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -230,16 +230,17 @@ public:
   typedef typename Impl::remove_const<declared_value_type>::type value_type;
   typedef typename Impl::add_const<value_type>::type const_value_type;
 
-  typedef Device execution_space;
+  typedef Device device_type;
+  typedef typename Device::execution_space execution_space;
   typedef Hasher hasher_type;
   typedef EqualTo  equal_to_type;
   typedef uint32_t size_type;
 
   //map_types
-  typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
-  typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type>                   insertable_map_type;
-  typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type>             modifiable_map_type;
-  typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type>       const_map_type;
+  typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
+  typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type>                   insertable_map_type;
+  typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type>             modifiable_map_type;
+  typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type>       const_map_type;
 
   static const bool is_set = std::is_same<void,value_type>::value;
   static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
@@ -264,18 +265,18 @@ private:
   typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
 
   typedef typename Impl::if_c<   is_insertable_map
-                               , View< key_type *, execution_space>
-                               , View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< key_type *, device_type>
+                               , View< const key_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type key_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map || is_modifiable_map
-                               , View< impl_value_type *, execution_space>
-                               , View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< impl_value_type *, device_type>
+                               , View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type value_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map
-                               , View< size_type *, execution_space>
-                               , View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
+                               , View< size_type *, device_type>
+                               , View< const size_type *, device_type, MemoryTraits<RandomAccess> >
                              >::type size_type_view;
 
   typedef typename Impl::if_c<   is_insertable_map
@@ -285,7 +286,7 @@ private:
 
   enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
   enum { num_scalars = 3 };
-  typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
+  typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
 
 public:
   //! \name Public member functions
@@ -757,7 +758,7 @@ public:
 
       Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
 
-      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
+      typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
 
       raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
       raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
@@ -781,21 +782,21 @@ private: // private member functions
 
   void set_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
     const int true_ = true;
     raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
   }
 
   void reset_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
     const int false_ = false;
     raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
   }
 
   bool get_flag(int flag) const
   {
-    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
+    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
     int result = false;
     raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
     return result;
diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt
index b9d860f32fd854a59e0258adabdc540a1ef0c512..0c59c616d620598b835525eb70410d0a26f6af6b 100644
--- a/lib/kokkos/containers/unit_tests/CMakeLists.txt
+++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt
@@ -3,38 +3,49 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
 INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
 
-SET(SOURCES
-  UnitTestMain.cpp 
-  TestCuda.cpp
-  )
-
 SET(LIBRARIES kokkoscore)
 
 IF(Kokkos_ENABLE_Pthread)
-  LIST( APPEND SOURCES
-    TestThreads.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Threads
+  SOURCES TestThreads.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
 IF(Kokkos_ENABLE_Serial)
-  LIST( APPEND SOURCES
-    TestSerial.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_Serial
+  SOURCES TestSerial.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
 IF(Kokkos_ENABLE_OpenMP)
-  LIST( APPEND SOURCES
-    TestOpenMP.cpp
+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  UnitTest_OpenMP
+  SOURCES TestOpenMP.cpp UnitTestMain.cpp
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  FAIL_REGULAR_EXPRESSION "  FAILED  "
+  TESTONLYLIBS kokkos_gtest
   )
 ENDIF()
 
-
+IF(Kokkos_ENABLE_Cuda)
 TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  UnitTest
-  SOURCES ${SOURCES}
+  UnitTest_Cuda
+  SOURCES TestCuda.cpp UnitTestMain.cpp
   COMM serial mpi
   NUM_MPI_PROCS 1
   FAIL_REGULAR_EXPRESSION "  FAILED  "
   TESTONLYLIBS kokkos_gtest
   )
-  
+ENDIF()
+
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
index 7e3ca005f4b6401a088208fca120c097143afc49..beb07bd791cf162c31706b1eeaf31a4c25c91ba5 100644
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -64,6 +64,7 @@ struct TestDynamicView
   typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
 
   typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
+  typedef typename view_type::const_type const_view_type ;
 
   typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
   typedef double value_type;
@@ -136,6 +137,8 @@ struct TestDynamicView
 
     view_type da("A",pool,arg_total_size);
 
+    const_view_type ca(da);
+
 // printf("TestDynamicView::run(%d) construct test functor\n",arg_total_size);
 
     TestDynamicView functor(da,arg_total_size);
diff --git a/lib/kokkos/core/cmake/Dependencies.cmake b/lib/kokkos/core/cmake/Dependencies.cmake
index ae9a20c50efeadec69ab22e3365cd3ec26a5e451..8d9872725e59655f256a9e62bf3f706a79e80e59 100644
--- a/lib/kokkos/core/cmake/Dependencies.cmake
+++ b/lib/kokkos/core/cmake/Dependencies.cmake
@@ -1,6 +1,6 @@
 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
-  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
+  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
   TEST_OPTIONAL_TPLS CUSPARSE
   )
 
-TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
\ No newline at end of file
+TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib)
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
index 9359b5a32b71f06230ea8a2e878e0f457f8eee85..a71e60f20742edd8417365bb99c45f172dc5b218 100644
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@@ -30,7 +30,7 @@
 
 #cmakedefine KOKKOS_HAVE_PTHREAD
 #cmakedefine KOKKOS_HAVE_SERIAL
-#cmakedefine KOKKOS_HAVE_QTHREAD
+#cmakedefine KOKKOS_HAVE_QTHREADS
 #cmakedefine KOKKOS_HAVE_Winthread
 #cmakedefine KOKKOS_HAVE_OPENMP
 #cmakedefine KOKKOS_HAVE_HWLOC
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
index 85f869971a33c349769bd318af28759f3e3eca12..3a0ad2d4c16a4e16d73e91eec131ee092bf9f47e 100644
--- a/lib/kokkos/core/perf_test/Makefile
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -60,4 +60,3 @@ clean: kokkos-clean
 
 gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
-
diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
index 7386ecef2032f32da8d4e672999e09021b5a673c..65ce61fb53b9e5d8025f1f6f59e8ecf194ec45f0 100644
--- a/lib/kokkos/core/perf_test/PerfTestCuda.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
@@ -52,6 +52,8 @@
 
 #include <impl/Kokkos_Timer.hpp>
 
+#include <PerfTestMDRange.hpp>
+
 #include <PerfTestHexGrad.hpp>
 #include <PerfTestBlasKernels.hpp>
 #include <PerfTestGramSchmidt.hpp>
@@ -72,6 +74,14 @@ class cuda : public ::testing::Test {
     }
 };
 
+//TEST_F( cuda, mdrange_lr ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
+//}
+
+//TEST_F( cuda, mdrange_ll ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
+//}
+
 TEST_F( cuda, hexgrad )
 {
   EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
index 7b6cfc5b5ce96399dcff47e1976b630088650af2..4732c3275a7f92cf1b1fc8f4d457c059ceb0679e 100644
--- a/lib/kokkos/core/perf_test/PerfTestDriver.hpp
+++ b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
@@ -60,6 +60,342 @@ namespace Test {
 
 enum { NUMBER_OF_TRIALS = 5 };
 
+template< class DeviceType , class LayoutType >
+void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0,  int tile_offset = 0 )
+// exp_beg = 6 => 2^6 = 64 is starting range length
+{
+#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
+
+  std::string label_mdrange ;
+  label_mdrange.append( "\"MDRange< double , " );
+  label_mdrange.append( deviceTypeName );
+  label_mdrange.append( " >\"" );
+
+  std::string label_range_col2 ;
+  label_range_col2.append( "\"RangeColTwo< double , " );
+  label_range_col2.append( deviceTypeName );
+  label_range_col2.append( " >\"" );
+
+  std::string label_range_col_all ;
+  label_range_col_all.append( "\"RangeColAll< double , " );
+  label_range_col_all.append( deviceTypeName );
+  label_range_col_all.append( " >\"" );
+
+  if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Right"
+      << "\n--------------------------------------------------------------" << std::endl;
+  } else {
+    std::cout << "--------------------------------------------------------------\n"
+      << "Performance tests for MDRange Layout Left"
+      << "\n--------------------------------------------------------------" << std::endl;
+  }
+
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    const int range_length = (1<<i) + range_offset;
+
+    std::cout << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n"
+      << "MDRange Test:  range bounds: " << range_length << " , " << range_length << " , " << range_length 
+      << "\n--------------------------------------------------------------\n"
+      << "--------------------------------------------------------------\n";
+//      << std::endl;
+
+    int t0_min = 0, t1_min = 0, t2_min = 0;
+    double seconds_min = 0.0;
+
+    // Test 1: The MDRange in full
+    {
+    int t0 = 1, t1 = 1, t2 = 1;
+    int counter = 1;
+#if !defined(KOKKOS_HAVE_CUDA)
+    int min_bnd = 8;
+    int tfast = range_length;
+#else
+    int min_bnd = 2;
+    int tfast = 32;
+#endif
+    while ( tfast >= min_bnd ) {
+      int tmid = min_bnd;
+      while ( tmid < tfast ) { 
+        t0 = min_bnd;
+        t1 = tmid;
+        t2 = tfast;
+        int t2_rev = min_bnd;
+        int t1_rev = tmid;
+        int t0_rev = tfast;
+
+#if defined(KOKKOS_HAVE_CUDA)
+        //Note: Product of tile sizes must be < 1024 for Cuda
+        if ( t0*t1*t2 >= 1024 ) {
+          printf("  Exceeded Cuda tile limits; onto next range set\n\n");
+          break;
+        }
+#endif
+
+        // Run 1 with tiles LayoutRight style
+        double seconds_1 = 0;
+        { seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0 << " , " << t1 << " , " << t2
+          << " , " << seconds_1
+          << std::endl ;
+#endif
+
+        if ( counter == 1 ) {
+          seconds_min = seconds_1;
+          t0_min = t0;
+          t1_min = t1;
+          t2_min = t2;
+        } 
+        else {
+          if ( seconds_1 < seconds_min ) 
+          { 
+            seconds_min = seconds_1; 
+            t0_min = t0;
+            t1_min = t1;
+            t2_min = t2;
+          }
+        }
+
+        // Run 2 with tiles LayoutLeft style - reverse order of tile dims
+        double seconds_1rev = 0;
+        { seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+        std::cout << label_mdrange
+          << " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
+          << " , " << seconds_1rev
+          << std::endl ;
+#endif
+
+        if ( seconds_1rev < seconds_min ) 
+        { 
+          seconds_min = seconds_1rev; 
+          t0_min = t0_rev;
+          t1_min = t1_rev;
+          t2_min = t2_rev;
+        }
+
+        ++counter;
+        tmid <<= 1;
+      } //end inner while
+      tfast >>=1;
+    } //end outer while
+
+    std::cout << "\n"
+      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0_min << " , " << t1_min << " , " << t2_min
+      << "\n Min time: " << seconds_min
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    } //end scope
+
+#if !defined(KOKKOS_HAVE_CUDA)
+  double seconds_min_c = 0.0;
+  int t0c_min = 0, t1c_min = 0, t2c_min = 0;
+  int counter = 1;
+  {
+    int min_bnd = 8;
+    // Test 1_c: MDRange with 0 for 'inner' tile dim; this case will utilize the full span in that direction, should be similar to Collapse<2>
+    if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
+      for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
+        for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LR with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " << T0 << " , " << T1 << " , " << range_length
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+          t2c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t0c_min = T0;
+            t1c_min = T1;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t0c_min = T0;
+              t1c_min = T1;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+    else {
+      for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
+        for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
+          double seconds_c = 0;
+          { seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
+
+#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+          std::cout << " MDRange LL with '0' tile - collapse-like \n"
+          << label_mdrange
+          << " , " <<range_length << " < " << T1 << " , " << T2
+          << " , " << seconds_c
+          << std::endl ;
+#endif
+
+
+          t0c_min = range_length;
+          if ( counter == 1 ) {
+            seconds_min_c = seconds_c;
+            t1c_min = T1;
+            t2c_min = T2;
+          } 
+          else {
+            if ( seconds_c < seconds_min_c ) 
+            { 
+              seconds_min_c = seconds_c; 
+              t1c_min = T1;
+              t2c_min = T2;
+            }
+          }
+          ++counter;
+        }
+      }
+    }
+
+    std::cout 
+//      << "--------------------------------------------------------------\n"
+      << label_mdrange
+      << "  Collapse<2> style: "
+      << "\n Min values "
+      << "\n Range length per dim (3D): " << range_length
+      << "\n TileDims:  " << t0c_min << " , " << t1c_min << " , " << t2c_min
+      << "\n Min time: " << seconds_min_c
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+  } //end scope test 2
+#endif
+
+
+    // Test 2: RangePolicy Collapse2 style
+    double seconds_2 = 0;
+    { seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
+    std::cout << label_range_col2
+      << " , " << range_length
+      << " , " << seconds_2
+      << std::endl ;
+
+
+    // Test 3: RangePolicy Collapse all style - not necessary, always slow
+    /*
+    double seconds_3 = 0;
+    { seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
+    std::cout << label_range_col_all
+      << " , " << range_length
+      << " , " << seconds_3
+      << "\n---------------------------------------------------------------"
+      << std::endl ;
+    */
+
+    // Compare fastest times... will never be collapse all so ignore it
+    // seconds_min = tiled MDRange
+    // seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
+    // seconds_2 = collapse<2>-style RangePolicy
+    // seconds_3 = collapse<3>-style RangePolicy
+
+#if !defined(KOKKOS_HAVE_CUDA)
+    if ( seconds_min < seconds_min_c ) {
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    }
+    else if ( seconds_min > seconds_min_c ) {
+      if ( seconds_min_c < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
+          << " Time: " << seconds_min_c
+          << " Difference: " << seconds_2 - seconds_min_c
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min_c > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min_c - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "   MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+    } // end else if
+#else
+      if ( seconds_min < seconds_2 ) {
+        std::cout << "--------------------------------------------------------------\n"
+          << " Fastest run: MDRange tiled\n"
+          << " Time: " << seconds_min
+          << " Difference: " << seconds_2 - seconds_min
+          << " Other times: \n"
+          << "   Collapse2 Range Policy: " << seconds_2 << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+      else if ( seconds_min > seconds_2 ) {
+        std::cout << " Fastest run: Collapse2 RangePolicy\n"
+          << " Time: " << seconds_2
+          << " Difference: " << seconds_min - seconds_2
+          << " Other times: \n"
+          << "   MDrange Tiled: " << seconds_min << "\n"
+          << "\n--------------------------------------------------------------"
+          << "\n--------------------------------------------------------------"
+          //<< "\n\n"
+          << std::endl;
+      }
+#endif
+
+  } //end for
+
+#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
+
+}
 
 
 template< class DeviceType >
diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp
index 606177ca50effc8a6cf88ced253ce2e1ea9930a2..831d581109984319a4c8a61674a42a297ace443a 100644
--- a/lib/kokkos/core/perf_test/PerfTestHost.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp
@@ -66,6 +66,8 @@ const char TestHostDeviceName[] = "Kokkos::Serial" ;
 
 #include <impl/Kokkos_Timer.hpp>
 
+#include <PerfTestMDRange.hpp>
+
 #include <PerfTestHexGrad.hpp>
 #include <PerfTestBlasKernels.hpp>
 #include <PerfTestGramSchmidt.hpp>
@@ -102,6 +104,14 @@ protected:
   }
 };
 
+//TEST_F( host, mdrange_lr ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
+//}
+
+//TEST_F( host, mdrange_ll ) {
+//  EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
+//}
+
 TEST_F( host, hexgrad ) {
   EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
 }
diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange.hpp b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d910b513c67f94eec4c1254fd4528ec4d74c62a5
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp
@@ -0,0 +1,564 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Test {
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct MultiDimRangePerf3D
+{ // Perf-test functor plus driver: a 3D stencil kernel launched through MDRangePolicy<Rank<3,...>>
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+  // A is the output view; B is the input view, read at offsets 0, +1 and +2 along each axis
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+  // NOTE: irange/jrange/krange are stored by the constructor but not read by operator() below
+  MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+  // Stencil kernel: A(i,j,k) = 0.25 * ( B(i,j,k) + B at the +1 and +2 offsets along i, j and k )
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long i, const long j, const long k) const
+  {
+    A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                             + B(i,j+2,k) + B(i,j+1,k)
+                             + B(i,j,k+2) + B(i,j,k+1)
+                             + B(i,j,k) );
+  }
+
+  // Tag type selecting the zero-fill overload of Init::operator()
+  struct InitZeroTag {};
+//  struct InitViewTag {};
+  // Fill functor: the untagged call stores 1.0, the InitZeroTag call stores 0
+  struct Init
+  {
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 1.0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const InitZeroTag&, const long i, const long j, const long k) const
+    {
+      input(i,j,k) = 0;
+    }
+
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+  };
+  // Times the stencil under a 3D MDRangePolicy with tile dims (Ti,Tj,Tk) over an icount x jcount x kcount range.
+  // Returns the smallest wall time in seconds over iter launches; the first launch is also checked against a host recomputation.
+  static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
+  {
+    //This test performs multidim range over all dims; Btest is 2 larger per dim so the +1/+2 stencil reads stay in bounds
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
+
+    double dt_min = 0;
+
+    // LayoutRight: both Rank<3,...> iteration parameters are Iterate::Right
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
+      using tile_type = typename MDRangeType::tile_type;
+      using point_type = typename MDRangeType::point_type;
+
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+      // Fill A and B with 1.0 (untagged Init call), fencing after each launch
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+    // Timed loop: each launch is fenced before the timer is read; dt_min keeps the smallest time
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // NOTE(review): an epsilon comparison was suggested here, but the check below is an exact (difference != 0) test
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+      }
+    } //end for
+
+    } 
+    // LayoutLeft (any layout other than LayoutRight takes this branch): both Rank<3,...> iteration parameters are Iterate::Left
+    else {
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}}); 
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}}); 
+
+      //typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
+      //using tile_type = typename MDRangeType::tile_type;
+      //using point_type = typename MDRangeType::point_type;
+      //Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
+      Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} ); 
+      // Fill A and B with 1.0 (untagged Init call), fencing after each launch
+      Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
+      execution_space::fence();
+      Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
+      execution_space::fence();
+    // Timed loop: each launch is fenced before the timer is read; dt_min keeps the smallest time
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - only the first run
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // NOTE(review): an epsilon comparison was suggested here, but the check below is an exact (difference != 0) test
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  multi Ahost = " << Ahost(l,j,k) << "  expected = " << check  
+                      << "  multi Bhost(ijk) = " << Bhost(l,j,k) 
+                      << "  multi Bhost(l+1jk) = " << Bhost(l+1,j,k) 
+                      << "  multi Bhost(l+2jk) = " << Bhost(l+2,j,k) 
+                      << "  multi Bhost(ij+1k) = " << Bhost(l,j+1,k) 
+                      << "  multi Bhost(ij+2k) = " << Bhost(l,j+2,k) 
+                      << "  multi Bhost(ijk+1) = " << Bhost(l,j,k+1) 
+                      << "  multi Bhost(ijk+2) = " << Bhost(l,j,k+2) 
+                      << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << "  range product " << icount*jcount*kcount << "  LL " << jcount*kcount << "  LR " << icount*jcount << std::endl; }
+        //else { std::cout << " multi: No errors!" <<  std::endl; }
+
+      }
+    } //end for
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseTwo
+{
+  // RangePolicy over a 3D range with two dims collapsed into one flat index r (like Rank<2> for multi-dim); the third dim is looped inside the functor
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  using iterate_type = Kokkos::Experimental::Iterate;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+  // A is written by the kernel; B is read at offsets 0, +1 and +2 along each axis
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+  // Kernel for one collapsed index r: decode the two collapsed indices, then loop the remaining dim (k for LayoutRight, i for LayoutLeft)
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
+//r = j + i*Nj
+      long i = int(r / jrange); 
+      long j = int( r - i*jrange);
+      for (int k = 0; k < krange; ++k) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
+//r = j + k*Nj
+      long k = int(r / jrange); 
+      long j = int( r - k*jrange);
+      for (int i = 0; i < irange; ++i) {
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+                                 + B(i,j+2,k) + B(i,j+1,k)
+                                 + B(i,j,k+2) + B(i,j,k+1)
+                                 + B(i,j,k) );
+      }
+    }
+  }
+
+  // Fill functor: stores 1 into every element reached from collapsed index r, using the same index decode as the kernel
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / jrange); 
+        long j = int( r - i*jrange);
+        for (int k = 0; k < krange; ++k) {
+          input(i,j,k) = 1;
+        }
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / jrange); 
+        long j = int( r - k*jrange);
+        for (int i = 0; i < irange; ++i) {
+          input(i,j,k) = 1;
+        }
+      }
+    }
+  };
+  // Times the collapse-two kernel over an icount x jcount x kcount range using a 1D RangePolicy.
+  // Returns the smallest wall time in seconds over iter launches; the first launch is also checked against a host recomputation.
+  static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    // This test refers to collapsing two dims while using the RangePolicy; Btest is 2 larger per dim so the +1/+2 reads stay in bounds
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
+    // Widen each count to long BEFORE multiplying so the collapsed extent cannot wrap in unsigned int arithmetic
+    long collapse_index_rangeA = 0;
+    long collapse_index_rangeB = 0;
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
+      collapse_index_rangeA = static_cast<long>(icount)*static_cast<long>(jcount);
+      collapse_index_rangeB = (static_cast<long>(icount)+2)*(static_cast<long>(jcount)+2);
+//      std::cout << "   LayoutRight " << std::endl;
+    } else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
+      collapse_index_rangeA = static_cast<long>(kcount)*static_cast<long>(jcount);
+      collapse_index_rangeB = (static_cast<long>(kcount)+2)*(static_cast<long>(jcount)+2);
+//      std::cout << "   LayoutLeft " << std::endl;
+    } else {
+      std::cout << "  LayoutRight or LayoutLeft required - aborting " << std::endl; // message now matches the exit below
+      exit(-1);
+    }
+
+    Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
+
+    double dt_min = 0;
+    // Fill A and B with 1 before timing, fencing after each launch
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // NOTE(review): an epsilon comparison was suggested here, but the check below is an exact (difference != 0) test
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << "  range product " << static_cast<long>(icount)*jcount*kcount << "  LL " << static_cast<long>(jcount)*kcount << "  LR " << static_cast<long>(icount)*jcount << std::endl; }
+        //else { std::cout << " RP collapse2: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+
+template< class DeviceType 
+        , typename ScalarType = double  
+        , typename TestLayout = Kokkos::LayoutRight  
+        >
+struct RangePolicyCollapseAll
+{
+  // RangePolicy over a 3D range with all three dims collapsed into one flat index r; (i,j,k) is decoded from r on every call
+
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type  size_type;
+  typedef TestLayout layout;
+
+  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
+  typedef typename view_type::HostMirror host_view_type;
+  // A is written by the kernel; B is read at offsets 0, +1 and +2 along each axis
+  view_type A;
+  view_type B;
+  const long irange;
+  const long jrange;
+  const long krange;
+
+  RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_,  const long &jrange_, const long &krange_)
+  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
+  {}
+  // Kernel for one flat index r: LayoutRight decodes with k varying fastest, LayoutLeft with i varying fastest
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const long r) const
+  {
+    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+    {
+      long i = int(r / (jrange*krange)); 
+      long j = int(( r - i*jrange*krange)/krange);
+      long k = int(r - i*jrange*krange - j*krange);
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+    {
+      long k = int(r / (irange*jrange)); 
+      long j = int(( r - k*irange*jrange)/irange);
+      long i = int(r - k*irange*jrange - j*irange);
+        A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+            + B(i,j+2,k) + B(i,j+1,k)
+            + B(i,j,k+2) + B(i,j,k+1)
+            + B(i,j,k) );
+    }
+  }
+
+  // Fill functor: stores 1 into the single element addressed by flat index r, using the same index decode as the kernel
+  struct Init
+  {
+    view_type input;
+    const long irange;
+    const long jrange;
+    const long krange;
+
+    Init(const view_type & input_, const long &irange_,  const long &jrange_, const long &krange_)
+    : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(const long r) const
+    {
+      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
+      {
+        long i = int(r / (jrange*krange)); 
+        long j = int(( r - i*jrange*krange)/krange);
+        long k = int(r - i*jrange*krange - j*krange);
+        input(i,j,k) = 1;
+      }
+      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
+      {
+        long k = int(r / (irange*jrange));
+        long j = int(( r - k*irange*jrange)/irange);
+        long i = int(r - k*irange*jrange - j*irange);
+        input(i,j,k) = 1;
+      }
+    }
+  };
+  // Times the collapse-all kernel over an icount x jcount x kcount range using a 1D RangePolicy.
+  // Returns the smallest wall time in seconds over iter launches; the first launch is also checked against a host recomputation.
+  static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
+  {
+    //This test refers to collapsing all dims using the RangePolicy; Btest is 2 larger per dim so the +1/+2 reads stay in bounds
+    view_type Atest("Atest", icount, jcount, kcount);
+    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
+    typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
+    // Widen each count to long BEFORE multiplying: in unsigned int the triple product wraps once it reaches 2^32 (e.g. 2048^3)
+    const long flat_index_range = static_cast<long>(icount)*static_cast<long>(jcount)*static_cast<long>(kcount);
+    Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
+    Kokkos::RangePolicy<execution_space> policy_initB(0, (static_cast<long>(icount)+2)*(static_cast<long>(jcount)+2)*(static_cast<long>(kcount)+2) );
+
+    double dt_min = 0;
+    // Fill A and B with 1 before timing, fencing after each launch
+    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
+    execution_space::fence();
+    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
+    execution_space::fence();
+
+    for (int i = 0; i < iter; ++i)
+    {
+      Kokkos::Timer timer;
+      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+
+      //Correctness check - first iteration only
+      if ( 0 == i )
+      {
+        long numErrors = 0;
+        host_view_type Ahost("Ahost", icount, jcount, kcount);
+        Kokkos::deep_copy(Ahost, Atest);
+        host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
+        Kokkos::deep_copy(Bhost, Btest);
+
+        // On KNL, this may vectorize - add print statement to prevent
+        // NOTE(review): an epsilon comparison was suggested here, but the check below is an exact (difference != 0) test
+        for ( long l = 0; l < static_cast<long>(icount); ++l ) {
+        for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
+        for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
+          ScalarType check  = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+                                        + Bhost(l,j+2,k) + Bhost(l,j+1,k)
+                                        + Bhost(l,j,k+2) + Bhost(l,j,k+1)
+                                        + Bhost(l,j,k) );
+          if ( Ahost(l,j,k) - check != 0 ) {
+            ++numErrors;
+            std::cout << "  Collapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
+                      << "  flat Ahost = " << Ahost(l,j,k) << "  expected = " << check  << std::endl;
+            //exit(-1);
+          }
+        } } }
+        if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << "  range product " << flat_index_range << "  LL " << static_cast<long>(jcount)*kcount << "  LR " << static_cast<long>(icount)*jcount << std::endl; }
+        //else { std::cout << " RP collapse all: Pass! " << std::endl; }
+      }
+    }
+
+    return dt_min;
+  } 
+
+};
+
+} //end namespace Test
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
index 807a01ed01b128c531b87df0c27e1d406525b603..492470d05d07ee5684a04bff54fc103e82708ba9 100644
--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -92,13 +92,13 @@ LIST(APPEND SOURCES         ${SOURCES_CUDA} )
 INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
 
 #-----------------------------------------------------------------------------
-FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
-FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
+FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
+FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
 
-LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
-LIST(APPEND SOURCES         ${SOURCES_QTHREAD} )
+LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
+LIST(APPEND SOURCES         ${SOURCES_QTHREADS} )
 
-INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
+INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
 
 #-----------------------------------------------------------------------------
 
@@ -109,5 +109,3 @@ TRIBITS_ADD_LIBRARY(
     SOURCES ${SOURCES}
     DEPLIBS
     )
-
-
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e0eadb25a005f09e1c9d37400bd76a611cc4eb3b
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
@@ -0,0 +1,1300 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+
+//#include<Cuda/Kokkos_CudaExec.hpp>
+// Including the file above, leads to following type of errors:
+// /home/ndellin/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp(84): error: incomplete type is not allowed
+// As a result, recreate cuda_parallel_launch and associated code
+
+#if defined(KOKKOS_ENABLE_PROFILING)
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+// ------------------------------------------------------------------ //
+
+// Kernel entry point: each launched thread calls its by-value copy of driver.
+template< class DriverType >
+__global__ static void cuda_parallel_launch( const DriverType driver )
+{
+  driver();
+}
+
+template< class DriverType >
+struct CudaLaunch
+{
+  // Launches cuda_parallel_launch<DriverType> with the given grid and block
+  // dimensions; the kernel receives driver by value. The launch is issued
+  // from this constructor and no launch error is checked here.
+  inline CudaLaunch( const DriverType & driver ,
+                     const dim3       & grid ,
+                     const dim3       & block )
+  {
+    cuda_parallel_launch< DriverType ><<< grid , block >>>( driver );
+  }
+};
+
+// ------------------------------------------------------------------ //
+template< int N , typename RP , typename Functor , typename Tag >  // N: rank of the range; Tag: work tag type, or void for untagged functors
+struct apply_impl;  // declared only: behavior comes from the per-rank partial specializations that follow
+
+// Rank 2, untagged (Tag = void): calls m_func(i0, i1) for every index pair
+// handled by the calling CUDA thread.
+//
+//   RP      - range policy type; this struct uses RP::index_type,
+//             RP::inner_direction, RP::Left and the policy members m_tile,
+//             m_tile_end and m_upper.
+//   Functor - callable invoked as m_func(i0, i1).
+template< typename RP , typename Functor >
+struct apply_impl<2,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  // Binds the policy and the functor. Only references are kept, so rp_ and
+  // f_ must outlive this object.
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Walks the tiles of both dimensions. Dimension 0 uses the x components of
+  // blockIdx/threadIdx/gridDim and dimension 1 the y components.
+  // In dimension d the first tile is the block index and the step is the
+  // grid extent, so a grid with fewer blocks than m_tile_end[d] still
+  // reaches every tile. Within a tile the thread index selects the element;
+  // a thread index at or beyond m_tile[d], or an element index at or beyond
+  // m_upper[d], is skipped.
+  // Element indices are computed from 0: no lower bound is applied here.
+  // NOTE(review): confirm callers never rely on a nonzero lower bound.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: dimension 1 is the outer loop and dimension 0 the inner loop,
+      // so for a fixed i1 all of this thread's i0 values are visited first.
+      for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+        const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+        if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+            const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+            if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(idx0 , idx1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: dimension 0 is the outer loop and dimension 1 the inner
+      // loop.
+      for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+        const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+        if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(idx0 , idx1);
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index pair
+
+};
+
+// Rank 2, tagged (Tag is not void): calls m_func(Tag(), i0, i1) for every
+// index pair handled by the calling CUDA thread.
+//   RP      - range policy type; this struct uses RP::index_type,
+//             RP::inner_direction, RP::Left and the policy members m_tile,
+//             m_tile_end and m_upper.
+//   Functor - callable invoked as m_func(Tag(), i0, i1).
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<2,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  // Binds the policy and the functor. Only references are kept, so rp_ and
+  // f_ must outlive this object.
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Walks the tiles of both dimensions. Dimension 0 uses the x components of
+  // blockIdx/threadIdx/gridDim and dimension 1 the y components.
+  // In dimension d the first tile is the block index and the step is the
+  // grid extent, so a grid with fewer blocks than m_tile_end[d] still
+  // reaches every tile. Within a tile the thread index selects the element;
+  // a thread index at or beyond m_tile[d], or an element index at or beyond
+  // m_upper[d], is skipped.
+  // Element indices are computed from 0: no lower bound is applied here.
+  // NOTE(review): confirm callers never rely on a nonzero lower bound.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: dimension 1 is the outer loop and dimension 0 the inner loop,
+      // so for a fixed i1 all of this thread's i0 values are visited first.
+      for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+        const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+        if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+
+          for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+            const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+            if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+              m_func(Tag(), idx0 , idx1);
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: dimension 0 is the outer loop, dimension 1 the inner loop.
+      for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+        const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+        if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              m_func(Tag(), idx0 , idx1);
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index pair
+};
+
+
+// Rank 3, untagged (Tag = void): calls m_func(i0, i1, i2) for every index
+// triple handled by the calling CUDA thread. Dimensions 0, 1 and 2 use the
+// x, y and z components of blockIdx/threadIdx/gridDim respectively.
+template< typename RP , typename Functor >
+struct apply_impl<3,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  // Only references are kept, so rp_ and f_ must outlive this object.
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // In dimension d, tiles are visited from the block index in steps of the
+  // grid extent while below m_tile_end[d]. A thread index at or beyond
+  // m_tile[d], or an element index at or beyond m_upper[d], is skipped.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: loops nest 2 -> 1 -> 0, dimension 0 innermost.
+      for ( index_type tile2 = blockIdx.z; tile2 < m_rp.m_tile_end[2]; tile2 += gridDim.z ) {
+        const index_type idx2 = tile2*m_rp.m_tile[2] + threadIdx.z;
+        if ( idx2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+                const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+                if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(idx0 , idx1 , idx2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: loops nest 0 -> 1 -> 2, dimension 2 innermost.
+      for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+        const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+        if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              for ( index_type tile2 = blockIdx.z; tile2 < m_rp.m_tile_end[2]; tile2 += gridDim.z ) {
+                const index_type idx2 = tile2*m_rp.m_tile[2] + threadIdx.z;
+                if ( idx2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(idx0 , idx1 , idx2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index triple
+};
+
+// Rank 3, tagged (Tag is not void): calls m_func(Tag(), i0, i1, i2) for every
+// index triple handled by the calling CUDA thread. Dimensions 0, 1 and 2 use
+// the x, y and z components of blockIdx/threadIdx/gridDim respectively.
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<3,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+  // Only references are kept, so rp_ and f_ must outlive this object.
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Visits tiles from the block index in steps of the grid extent, per dimension.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: loops nest 2 -> 1 -> 0, dimension 0 innermost.
+      for ( index_type tile2 = blockIdx.z; tile2 < m_rp.m_tile_end[2]; tile2 += gridDim.z ) {
+        const index_type idx2 = tile2*m_rp.m_tile[2] + threadIdx.z;
+        if ( idx2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+                const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+                if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+                  m_func(Tag(), idx0 , idx1 , idx2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: loops nest 0 -> 1 -> 2, dimension 2 innermost.
+      for ( index_type tile0 = blockIdx.x; tile0 < m_rp.m_tile_end[0]; tile0 += gridDim.x ) {
+        const index_type idx0 = tile0*m_rp.m_tile[0] + threadIdx.x;
+        if ( idx0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
+          for ( index_type tile1 = blockIdx.y; tile1 < m_rp.m_tile_end[1]; tile1 += gridDim.y ) {
+            const index_type idx1 = tile1*m_rp.m_tile[1] + threadIdx.y;
+            if ( idx1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
+              for ( index_type tile2 = blockIdx.z; tile2 < m_rp.m_tile_end[2]; tile2 += gridDim.z ) {
+                const index_type idx2 = tile2*m_rp.m_tile[2] + threadIdx.z;
+                if ( idx2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
+                  m_func(Tag(), idx0 , idx1 , idx2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index triple
+};
+
+
+// Rank 4, untagged (Tag = void): calls m_func(i0, i1, i2, i3) for every
+// index tuple handled by the calling CUDA thread.
+// Dimensions 0 and 1 share the x components of blockIdx/threadIdx, dimension
+// 2 uses the y components and dimension 3 the z components.
+template< typename RP , typename Functor >
+struct apply_impl<4,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  // Only references are kept, so rp_ and f_ must outlive this object.
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Cap on the block counts used when unpacking blockIdx.x into two tile
+  // indices. NOTE(review): presumably the CUDA grid-dimension limit; confirm
+  // it matches the launch configuration.
+  static constexpr index_type max_blocks = 65535;
+
+  // Unpacks the shared x indices, then loops over the tiles of all four dimensions.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: blockIdx.x = first0 + nblk0*first1; dimension 0 is innermost.
+      const index_type nt0   = m_rp.m_tile_end[0];
+      const index_type nt1   = m_rp.m_tile_end[1];
+      const index_type nblk0 = ( nt0 <= max_blocks ? nt0 : max_blocks ) ;
+      const index_type nblk1 = ( nt0*nt1 > max_blocks ? index_type( max_blocks / nblk0 ) :
+                               ( nt1 <= max_blocks ? nt1 : max_blocks ) );
+
+      const index_type first0 = blockIdx.x % nblk0;
+      const index_type first1 = blockIdx.x / nblk0;
+      const index_type lid0   = threadIdx.x % m_rp.m_tile[0];
+      const index_type lid1   = threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type t3 = blockIdx.z; t3 < m_rp.m_tile_end[3]; t3 += gridDim.z ) {
+        const index_type idx3 = t3*m_rp.m_tile[3] + threadIdx.z;
+        if ( idx3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+          for ( index_type t2 = blockIdx.y; t2 < m_rp.m_tile_end[2]; t2 += gridDim.y ) {
+            const index_type idx2 = t2*m_rp.m_tile[2] + threadIdx.y;
+            if ( idx2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+              for ( index_type t1 = first1 ; t1 < m_rp.m_tile_end[1]; t1 += nblk1 ) {
+                const index_type idx1 = t1*m_rp.m_tile[1] + lid1;
+                if ( idx1 < m_rp.m_upper[1] && lid1 < m_rp.m_tile[1] ) {
+                  for ( index_type t0 = first0 ; t0 < m_rp.m_tile_end[0]; t0 += nblk0 ) {
+                    const index_type idx0 = t0*m_rp.m_tile[0] + lid0;
+                    if ( idx0 < m_rp.m_upper[0] && lid0 < m_rp.m_tile[0] ) {
+                      m_func(idx0 , idx1 , idx2 , idx3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: blockIdx.x = first0*nblk1 + first1; dimension 3 is innermost.
+      const index_type nt0   = m_rp.m_tile_end[0];
+      const index_type nt1   = m_rp.m_tile_end[1];
+      const index_type nblk1 = ( nt1 <= max_blocks ? nt1 : max_blocks ) ;
+      const index_type nblk0 = ( nt0*nt1 > max_blocks ? index_type( max_blocks / nblk1 ) :
+                               ( nt0 <= max_blocks ? nt0 : max_blocks ) );
+
+      const index_type first0 = blockIdx.x / nblk1;
+      const index_type first1 = blockIdx.x % nblk1;
+      const index_type lid0   = threadIdx.x / m_rp.m_tile[1];
+      const index_type lid1   = threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type t0 = first0; t0 < m_rp.m_tile_end[0]; t0 += nblk0 ) {
+        const index_type idx0 = t0*m_rp.m_tile[0] + lid0;
+        if ( idx0 < m_rp.m_upper[0] && lid0 < m_rp.m_tile[0] ) {
+          for ( index_type t1 = first1; t1 < m_rp.m_tile_end[1]; t1 += nblk1 ) {
+            const index_type idx1 = t1*m_rp.m_tile[1] + lid1;
+            if ( idx1 < m_rp.m_upper[1] && lid1 < m_rp.m_tile[1] ) {
+              for ( index_type t2 = blockIdx.y; t2 < m_rp.m_tile_end[2]; t2 += gridDim.y ) {
+                const index_type idx2 = t2*m_rp.m_tile[2] + threadIdx.y;
+                if ( idx2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+                  for ( index_type t3 = blockIdx.z; t3 < m_rp.m_tile_end[3]; t3 += gridDim.z ) {
+                    const index_type idx3 = t3*m_rp.m_tile[3] + threadIdx.z;
+                    if ( idx3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(idx0 , idx1 , idx2 , idx3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index tuple
+};
+
+// Rank 4, tagged (Tag is not void): calls m_func(Tag(), i0, i1, i2, i3) for
+// every index tuple handled by the calling CUDA thread. Dimensions 0 and 1
+// share the x components of blockIdx/threadIdx; dimension 2 uses the y
+// components and dimension 3 the z components.
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<4,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+  // Only references are kept, so rp_ and f_ must outlive this object.
+  inline __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Cap on the block counts used to unpack blockIdx.x into two tile indices.
+  // NOTE(review): presumably the CUDA grid-dimension limit; confirm.
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+    if (RP::inner_direction == RP::Left) {
+      // Left: blockIdx.x = tile_id0 + numbl0*tile_id1; dimension 0 innermost.
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+        if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+          for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+            if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: blockIdx.x = tile_id0*numbl1 + tile_id1; dimension 3 innermost.
+      const index_type temp0  =  m_rp.m_tile_end[0];
+      const index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;  // j, not tile_id1: each stride step must address a new tile
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+              for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
+                if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
+                  for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
+                    if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
+                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+// Rank 5, untagged (Tag = void): calls m_func(i0, i1, i2, i3, i4) for every
+// index tuple handled by the calling CUDA thread.
+// Dimensions 0 and 1 share the x components of blockIdx/threadIdx,
+// dimensions 2 and 3 share the y components, and dimension 4 uses the z
+// components.
+template< typename RP , typename Functor >
+struct apply_impl<5,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  // Only references are kept, so rp_ and f_ must outlive this object.
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  // Cap on the block counts used when unpacking blockIdx.x and blockIdx.y.
+  // NOTE(review): presumably the CUDA grid-dimension limit; confirm.
+  static constexpr index_type max_blocks = 65535;
+
+  // Unpacks the shared x and y indices into per-dimension tile and thread
+  // ids, then loops over the tiles of all five dimensions.
+  inline __device__
+  void exec_range() const
+  {
+    if ( RP::inner_direction == RP::Left ) {
+      // Left: blockIdx.x = first0 + nblk0*first1 and
+      // blockIdx.y = first2 + nblk2*first3; dimension 0 is innermost.
+      index_type nt_a = m_rp.m_tile_end[0];
+      index_type nt_b = m_rp.m_tile_end[1];
+      const index_type nblk0 = ( nt_a <= max_blocks ? nt_a : max_blocks ) ;
+      const index_type nblk1 = ( nt_a*nt_b > max_blocks ? index_type( max_blocks / nblk0 ) :
+                               ( nt_b <= max_blocks ? nt_b : max_blocks ) );
+
+      const index_type first0 = blockIdx.x % nblk0;
+      const index_type first1 = blockIdx.x / nblk0;
+      const index_type lid0   = threadIdx.x % m_rp.m_tile[0];
+      const index_type lid1   = threadIdx.x / m_rp.m_tile[0];
+
+      nt_a = m_rp.m_tile_end[2];
+      nt_b = m_rp.m_tile_end[3];
+      const index_type nblk2 = ( nt_a <= max_blocks ? nt_a : max_blocks ) ;
+      const index_type nblk3 = ( nt_a*nt_b > max_blocks ? index_type( max_blocks / nblk2 ) :
+                               ( nt_b <= max_blocks ? nt_b : max_blocks ) );
+
+      const index_type first2 = blockIdx.y % nblk2;
+      const index_type first3 = blockIdx.y / nblk2;
+      const index_type lid2   = threadIdx.y % m_rp.m_tile[2];
+      const index_type lid3   = threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type t4 = blockIdx.z; t4 < m_rp.m_tile_end[4]; t4 += gridDim.z ) {
+        const index_type idx4 = t4*m_rp.m_tile[4] + threadIdx.z;
+        if ( idx4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+          for ( index_type t3 = first3; t3 < m_rp.m_tile_end[3]; t3 += nblk3 ) {
+            const index_type idx3 = t3*m_rp.m_tile[3] + lid3;
+            if ( idx3 < m_rp.m_upper[3] && lid3 < m_rp.m_tile[3] ) {
+              for ( index_type t2 = first2; t2 < m_rp.m_tile_end[2]; t2 += nblk2 ) {
+                const index_type idx2 = t2*m_rp.m_tile[2] + lid2;
+                if ( idx2 < m_rp.m_upper[2] && lid2 < m_rp.m_tile[2] ) {
+                  for ( index_type t1 = first1 ; t1 < m_rp.m_tile_end[1]; t1 += nblk1 ) {
+                    const index_type idx1 = t1*m_rp.m_tile[1] + lid1;
+                    if ( idx1 < m_rp.m_upper[1] && lid1 < m_rp.m_tile[1] ) {
+                      for ( index_type t0 = first0 ; t0 < m_rp.m_tile_end[0]; t0 += nblk0 ) {
+                        const index_type idx0 = t0*m_rp.m_tile[0] + lid0;
+                        if ( idx0 < m_rp.m_upper[0] && lid0 < m_rp.m_tile[0] ) {
+                          m_func(idx0 , idx1 , idx2 , idx3, idx4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+    else {
+      // Otherwise: x = first0*nblk1 + first1, y = first2*nblk3 + first3; dimension 4 innermost.
+      index_type nt_a = m_rp.m_tile_end[0];
+      index_type nt_b = m_rp.m_tile_end[1];
+      const index_type nblk1 = ( nt_b <= max_blocks ? nt_b : max_blocks ) ;
+      const index_type nblk0 = ( nt_a*nt_b > max_blocks ? index_type( max_blocks / nblk1 ) :
+                               ( nt_a <= max_blocks ? nt_a : max_blocks ) );
+
+      const index_type first0 = blockIdx.x / nblk1;
+      const index_type first1 = blockIdx.x % nblk1;
+      const index_type lid0   = threadIdx.x / m_rp.m_tile[1];
+      const index_type lid1   = threadIdx.x % m_rp.m_tile[1];
+
+      nt_a = m_rp.m_tile_end[2];
+      nt_b = m_rp.m_tile_end[3];
+      const index_type nblk3 = ( nt_b <= max_blocks ? nt_b : max_blocks ) ;
+      const index_type nblk2 = ( nt_a*nt_b > max_blocks ? index_type( max_blocks / nblk3 ) :
+                               ( nt_a <= max_blocks ? nt_a : max_blocks ) );
+
+      const index_type first2 = blockIdx.y / nblk3;
+      const index_type first3 = blockIdx.y % nblk3;
+      const index_type lid2   = threadIdx.y / m_rp.m_tile[3];
+      const index_type lid3   = threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type t0 = first0; t0 < m_rp.m_tile_end[0]; t0 += nblk0 ) {
+        const index_type idx0 = t0*m_rp.m_tile[0] + lid0;
+        if ( idx0 < m_rp.m_upper[0] && lid0 < m_rp.m_tile[0] ) {
+          for ( index_type t1 = first1; t1 < m_rp.m_tile_end[1]; t1 += nblk1 ) {
+            const index_type idx1 = t1*m_rp.m_tile[1] + lid1;
+            if ( idx1 < m_rp.m_upper[1] && lid1 < m_rp.m_tile[1] ) {
+              for ( index_type t2 = first2; t2 < m_rp.m_tile_end[2]; t2 += nblk2 ) {
+                const index_type idx2 = t2*m_rp.m_tile[2] + lid2;
+                if ( idx2 < m_rp.m_upper[2] && lid2 < m_rp.m_tile[2] ) {
+                  for ( index_type t3 = first3; t3 < m_rp.m_tile_end[3]; t3 += nblk3 ) {
+                    const index_type idx3 = t3*m_rp.m_tile[3] + lid3;
+                    if ( idx3 < m_rp.m_upper[3] && lid3 < m_rp.m_tile[3] ) {
+                      for ( index_type t4 = blockIdx.z; t4 < m_rp.m_tile_end[4]; t4 += gridDim.z ) {
+                        const index_type idx4 = t4*m_rp.m_tile[4] + threadIdx.z;
+                        if ( idx4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(idx0 , idx1 , idx2 , idx3 , idx4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;         // policy: tile extents, tile-index limits, upper bounds
+  const Functor & m_func;  // functor called once per visited index tuple
+};
+
+// Specializations for tag type
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<5,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+
+  static constexpr index_type max_blocks = 65535;
+
+  inline __device__
+  void exec_range() const
+  {
+// LL
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+
+          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
+                        if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
+                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+
+// Rank 6: dimension pairs (0,1), (2,3), (4,5) are packed into the x, y, z block and thread indices
+// Specialization for void tag type: the functor is called with the six indices only
+template< typename RP , typename Functor >
+struct apply_impl<6,RP,Functor,void >
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+  // Cap on blocks per packed pair; tile counts above the cap are covered by the strided loops below
+  static constexpr index_type max_blocks = 65535;
+  // Calls m_func for every index tuple owned by this thread; offsets are shifted by m_lower
+  inline __device__
+  void exec_range() const
+  {
+// LL: inner_direction == Left, the lower dimension of each packed pair varies fastest
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 0,1 from the x indices
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 2,3 from the y indices
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 4,5 from the z indices
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
+      // Each offset adds m_lower so that ranges with a non-zero lower bound start at the right index
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR: otherwise, the higher dimension of each packed pair varies fastest
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 0,1 from the x indices
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 2,3 from the y indices
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 4,5 from the z indices
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+      // Each offset adds m_lower so that ranges with a non-zero lower bound start at the right index
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// Rank 6 specialization for a non-void tag type: the functor is called with Tag() followed by the six indices
+template< typename RP , typename Functor , typename Tag >
+struct apply_impl<6,RP,Functor,Tag>
+{
+  using index_type = typename RP::index_type;
+
+  __device__
+  apply_impl( const RP & rp_ , const Functor & f_ )
+  : m_rp(rp_)
+  , m_func(f_)
+  {}
+  // Cap on blocks per packed pair; tile counts above the cap are covered by the strided loops below
+  static constexpr index_type max_blocks = 65535;
+  // Calls m_func for every index tuple owned by this thread; offsets are shifted by m_lower
+  inline __device__
+  void exec_range() const
+  {
+// LL: inner_direction == Left, the lower dimension of each packed pair varies fastest
+    if (RP::inner_direction == RP::Left) {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl0 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl1 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl0 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 0,1 from the x indices
+      const index_type tile_id0 = blockIdx.x % numbl0;
+      const index_type tile_id1 = blockIdx.x / numbl0;
+      const index_type thr_id0 = threadIdx.x % m_rp.m_tile[0];
+      const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl2 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl3 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl2 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 2,3 from the y indices
+      const index_type tile_id2 = blockIdx.y % numbl2;
+      const index_type tile_id3 = blockIdx.y / numbl2;
+      const index_type thr_id2 = threadIdx.y % m_rp.m_tile[2];
+      const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl4 = ( temp0 <= max_blocks ? temp0 : max_blocks ) ;
+      const index_type numbl5 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl4 ) :
+      (  temp1 <= max_blocks ? temp1 : max_blocks ) );
+      // Unpack tile and thread ids of dims 4,5 from the z indices
+      const index_type tile_id4 = blockIdx.z % numbl4;
+      const index_type tile_id5 = blockIdx.z / numbl4;
+      const index_type thr_id4 = threadIdx.z % m_rp.m_tile[4];
+      const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
+      // Each offset adds m_lower so that ranges with a non-zero lower bound start at the right index
+      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+
+          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+// LR: otherwise, the higher dimension of each packed pair varies fastest
+    else {
+      index_type temp0  =  m_rp.m_tile_end[0];
+      index_type temp1  =  m_rp.m_tile_end[1];
+      const index_type numbl1 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl0 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl1 ) :
+      ( temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 0,1 from the x indices
+      const index_type tile_id0 = blockIdx.x / numbl1;
+      const index_type tile_id1 = blockIdx.x % numbl1;
+      const index_type thr_id0 = threadIdx.x / m_rp.m_tile[1];
+      const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
+
+      temp0  =  m_rp.m_tile_end[2];
+      temp1  =  m_rp.m_tile_end[3];
+      const index_type numbl3 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl2 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl3 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 2,3 from the y indices
+      const index_type tile_id2 = blockIdx.y / numbl3;
+      const index_type tile_id3 = blockIdx.y % numbl3;
+      const index_type thr_id2 = threadIdx.y / m_rp.m_tile[3];
+      const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
+
+      temp0  =  m_rp.m_tile_end[4];
+      temp1  =  m_rp.m_tile_end[5];
+      const index_type numbl5 = ( temp1 <= max_blocks ? temp1 : max_blocks ) ;
+      const index_type numbl4 = ( temp0*temp1 > max_blocks ? index_type( max_blocks / numbl5 ) :
+      (  temp0 <= max_blocks ? temp0 : max_blocks ) );
+      // Unpack tile and thread ids of dims 4,5 from the z indices
+      const index_type tile_id4 = blockIdx.z / numbl5;
+      const index_type tile_id5 = blockIdx.z % numbl5;
+      const index_type thr_id4 = threadIdx.z / m_rp.m_tile[5];
+      const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
+      // Each offset adds m_lower so that ranges with a non-zero lower bound start at the right index
+      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
+        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
+
+          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
+            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
+
+              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
+                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
+
+                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
+                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
+
+                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
+                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
+
+                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
+                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
+                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+  } //end exec_range
+
+private:
+  const RP & m_rp;
+  const Functor & m_func;
+};
+
+// ----------------------------------------------------------------------------------
+// DeviceIterateTile: execute() picks block/grid sizes by rank and launches; operator() runs apply_impl on the device
+template < typename RP
+         , typename Functor
+         , typename Tag
+         >
+struct DeviceIterateTile
+{
+  using index_type = typename RP::index_type;
+  using array_index_type = typename RP::array_index_type;
+  using point_type = typename RP::point_type;
+  // usable_tag is Tag itself, or the empty VoidDummy placeholder when Tag is void
+  struct VoidDummy {};
+  typedef typename std::conditional< std::is_same<Tag, void>::value, VoidDummy, Tag>::type usable_tag;
+  // Copies the range policy and functor into this object (both members are held by value)
+  DeviceIterateTile( const RP & rp, const Functor & func )
+    : m_rp{rp}
+    , m_func{func}
+  {}
+  // Device side: dispatch to the apply_impl specialization selected by RP::rank and Tag
+private:
+  inline __device__
+  void apply() const
+  {
+    apply_impl<RP::rank,RP,Functor,Tag>(m_rp,m_func).exec_range();
+  } //end apply
+
+public:
+  // Device-side call operator; simply forwards to apply()
+  inline
+  __device__
+  void operator()(void) const
+  {
+    this-> apply();
+  }
+  // Host side: build the block and grid dimensions for ranks 2 through 6 and launch; any other rank aborts
+  inline
+  void execute() const
+  {
+    const array_index_type maxblocks = 65535; //not true for blockIdx.x for newer archs
+    if ( RP::rank == 2 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
+      const dim3 grid(
+            std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+          , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+          , 1
+          );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 3 )
+    {
+      const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
+      const dim3 grid(
+          std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
+        , std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 4 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
+        , std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 5 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else if ( RP::rank == 6 )
+    {
+      // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
+      const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
+      const dim3 grid(
+          std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
+                  , static_cast<index_type>(maxblocks) )
+        ,  std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
+                  , static_cast<index_type>(maxblocks) )
+        , std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
+                  , static_cast<index_type>(maxblocks) )
+        );
+      CudaLaunch< DeviceIterateTile >( *this , grid , block );
+    }
+    else
+    {
+      printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
+      Kokkos::abort("Aborting");
+    }
+
+  } //end execute
+
+protected:
+  const RP         m_rp;
+  const Functor    m_func;
+};
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index 0a0f41686bab1232f0bebe9e66dc4f6b08c76d6b..a273db998ba808726f4d9b5bc17bfc10347952ed 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -131,6 +131,7 @@ namespace Impl {
     int* atomic;
     int* scratch;
     int* threadid;
+    int n;
   };
 }
 }
@@ -250,6 +251,7 @@ struct CudaParallelLaunch< DriverType , true > {
       locks.atomic = atomic_lock_array_cuda_space_ptr(false);
       locks.scratch = scratch_lock_array_cuda_space_ptr(false);
       locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      locks.n = Kokkos::Cuda::concurrency();
       cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
@@ -292,6 +294,7 @@ struct CudaParallelLaunch< DriverType , false > {
       locks.atomic = atomic_lock_array_cuda_space_ptr(false);
       locks.scratch = scratch_lock_array_cuda_space_ptr(false);
       locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      locks.n = Kokkos::Cuda::concurrency();
       cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
       #endif
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
index 91a3c921381709fc0ade5776b03ef48a2abcfe67..303b3fa4f699f0e56c7d44682197bd050b2ac7ca 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -59,7 +59,7 @@
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <impl/Kokkos_Error.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 
@@ -184,7 +184,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
 
   enum { max_uvm_allocations = 65536 };
 
-  if ( arg_alloc_size > 0 ) 
+  if ( arg_alloc_size > 0 )
   {
     Kokkos::Impl::num_uvm_allocations++;
 
@@ -193,7 +193,7 @@ void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
     }
 
     CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
-  } 
+  }
 
   return ptr ;
 }
@@ -375,7 +375,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::CudaSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
 
     SharedAllocationHeader header ;
@@ -395,7 +395,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
 SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::fence(); //Make sure I can access the label ...
     Kokkos::Profiling::deallocateData(
@@ -412,7 +412,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
 SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -442,7 +442,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
   , m_tex_obj( 0 )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -479,7 +479,7 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
   , m_tex_obj( 0 )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -510,7 +510,7 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
       )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -745,14 +745,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
       //Formatting dependent on sizeof(uintptr_t)
       const char * format_string;
 
-      if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+      if (sizeof(uintptr_t) == sizeof(unsigned long)) {
         format_string = "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
       }
-      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+      else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
         format_string = "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
       }
 
-      snprintf( buffer , 256 
+      snprintf( buffer , 256
               , format_string
               , reinterpret_cast<uintptr_t>( r )
               , reinterpret_cast<uintptr_t>( r->m_prev )
@@ -776,14 +776,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
         //Formatting dependent on sizeof(uintptr_t)
         const char * format_string;
 
-        if (sizeof(uintptr_t) == sizeof(unsigned long)) { 
+        if (sizeof(uintptr_t) == sizeof(unsigned long)) {
           format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
         }
-        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { 
+        else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
           format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
         }
 
-        snprintf( buffer , 256 
+        snprintf( buffer , 256
                 , format_string
                 , reinterpret_cast< uintptr_t >( r->data() )
                 , r->size()
@@ -883,6 +883,7 @@ void init_lock_arrays_cuda_space() {
     locks.atomic = atomic_lock_array_cuda_space_ptr(false);
     locks.scratch = scratch_lock_array_cuda_space_ptr(false);
     locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+    locks.n = Kokkos::Cuda::concurrency();
     cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
     init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
     init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
index eeea97049fa3e8ba949fb9aed7841b4639bea928..44d908d1023197c5a8d0232a3d13ff49d06ef8d9 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -505,18 +505,18 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
       std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
       std::cout << "                                  without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
       std::cout << "                                  The code must call Cuda::fence() after each kernel" << std::endl;
-      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl; 
+      std::cout << "                                  or will likely crash when accessing data on the host." << std::endl;
     }
 
     const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
     bool force_device_alloc;
     if (env_force_device_alloc == 0) force_device_alloc=false;
     else force_device_alloc=atoi(env_force_device_alloc)!=0;
-  
+
     const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
     bool visible_devices_one=true;
     if (env_visible_devices == 0) visible_devices_one=false;
-    
+
     if(!visible_devices_one && !force_device_alloc) {
       std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
       std::cout << "                                  without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
@@ -536,6 +536,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
   locks.atomic = atomic_lock_array_cuda_space_ptr(false);
   locks.scratch = scratch_lock_array_cuda_space_ptr(false);
   locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+  locks.n = Kokkos::Cuda::concurrency();
   cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
   #endif
 }
@@ -620,9 +621,9 @@ void CudaInternal::finalize()
   was_finalized = 1;
   if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
 
-    atomic_lock_array_cuda_space_ptr(false);
-    scratch_lock_array_cuda_space_ptr(false);
-    threadid_lock_array_cuda_space_ptr(false);
+    atomic_lock_array_cuda_space_ptr(true);
+    scratch_lock_array_cuda_space_ptr(true);
+    threadid_lock_array_cuda_space_ptr(true);
 
     if ( m_stream ) {
       for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@@ -700,7 +701,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
 {
   Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -739,7 +740,7 @@ void Cuda::finalize()
 {
   Impl::CudaInternal::singleton().finalize();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
index fa29d732f473d727b5ac8beb81c8602d0e715914..56e6a3c1e34123d8fc58dbfffea0574acea31047 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -61,7 +61,7 @@
 #include <Cuda/Kokkos_Cuda_Internal.hpp>
 #include <Kokkos_Vectorization.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <typeinfo>
 #endif
@@ -586,13 +586,35 @@ public:
   void operator()(void) const
   {
     // Iterate this block through the league
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
+        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+
     for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
 
       this-> template exec_team< WorkTag >(
         typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
                                     , m_shmem_begin
                                     , m_shmem_size
-                                    , m_scratch_ptr[1]
+                                    , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                     , m_scratch_size[1]
                                     , league_rank
                                     , m_league_size ) );
@@ -946,11 +968,32 @@ public:
 
   __device__ inline
   void operator() () const {
-    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
+    int threadid = 0;
+    if ( m_scratch_size[1]>0 ) {
+      __shared__ int base_thread_id;
+      if (threadIdx.x==0 && threadIdx.y==0 ) {
+        threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
+        threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
+        if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
+        int done = 0;
+        while (!done) {
+          done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
+          if(!done) {
+            threadid += blockDim.x * blockDim.y;
+            if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
+          }
+        }
+        base_thread_id = threadid;
+      }
+      __syncthreads();
+      threadid = base_thread_id;
+    }
+
+    run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
   }
 
   __device__ inline
-  void run(const DummySHMEMReductionType&) const
+  void run(const DummySHMEMReductionType&, const int& threadid) const
   {
     const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
       word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@@ -964,7 +1007,7 @@ public:
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
                                         , m_shmem_size
-                                        , m_scratch_ptr[1]
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                         , m_scratch_size[1]
                                         , league_rank
                                         , m_league_size )
@@ -992,7 +1035,7 @@ public:
   }
 
   __device__ inline
-  void run(const DummyShflReductionType&) const
+  void run(const DummyShflReductionType&, const int& threadid) const
   {
     value_type value;
     ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
@@ -1003,7 +1046,7 @@ public:
         ( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
                                         , m_shmem_begin
                                         , m_shmem_size
-                                        , m_scratch_ptr[1]
+                                        , (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
                                         , m_scratch_size[1]
                                         , league_rank
                                         , m_league_size )
@@ -1128,9 +1171,9 @@ public:
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
     }
 
-    if ( m_team_size >
-         Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
-               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
+    if ( unsigned(m_team_size) >
+         unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
+               ( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
       Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
     }
 
@@ -1621,14 +1664,25 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Cuda
 #endif
 }
 
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+/** \brief  Intra-thread vector parallel_reduce.
  *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
+ *  Calls lambda(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed using +=
+ *  and output into result.
+ *
+ *  The identity value for the += operator is assumed to be the default
+ *  constructed value.
+ */
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
+void parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+      const & loop_boundaries
+  , Lambda const & lambda
+  , ValueType & result )
+{
 #ifdef __CUDA_ARCH__
   result = ValueType();
 
@@ -1636,52 +1690,42 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::C
     lambda(i,result);
   }
 
-  if (loop_boundaries.increment > 1)
-    result += shfl_down(result, 1,loop_boundaries.increment);
-  if (loop_boundaries.increment > 2)
-    result += shfl_down(result, 2,loop_boundaries.increment);
-  if (loop_boundaries.increment > 4)
-    result += shfl_down(result, 4,loop_boundaries.increment);
-  if (loop_boundaries.increment > 8)
-    result += shfl_down(result, 8,loop_boundaries.increment);
-  if (loop_boundaries.increment > 16)
-    result += shfl_down(result, 16,loop_boundaries.increment);
-
-  result = shfl(result,0,loop_boundaries.increment);
+  Impl::cuda_intra_warp_vector_reduce(
+    Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & result ) );
+
 #endif
 }
 
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+/** \brief  Intra-thread vector parallel_reduce.
  *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
+ *  Calls lambda(iType i, ValueType & val) for each i=[0..N).
+ *
+ *  The range [0..N) is mapped to all vector lanes of
+ *  the calling thread and a reduction of val is performed
+ *  using JoinType::operator()(ValueType& val, const ValueType& update)
+ *  and output into result.
+ *
+ *  The input value of result must be the identity value for the
+ *  reduction operation; e.g., ( 0 , += ) or ( 1 , *= ).
+ */
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
+void parallel_reduce
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+      const & loop_boundaries
+  , Lambda const & lambda
+  , JoinType const & join
+  , ValueType & result )
+{
 #ifdef __CUDA_ARCH__
-  ValueType result = init_result;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     lambda(i,result);
   }
 
-  if (loop_boundaries.increment > 1)
-    join( result, shfl_down(result, 1,loop_boundaries.increment));
-  if (loop_boundaries.increment > 2)
-    join( result, shfl_down(result, 2,loop_boundaries.increment));
-  if (loop_boundaries.increment > 4)
-    join( result, shfl_down(result, 4,loop_boundaries.increment));
-  if (loop_boundaries.increment > 8)
-    join( result, shfl_down(result, 8,loop_boundaries.increment));
-  if (loop_boundaries.increment > 16)
-    join( result, shfl_down(result, 16,loop_boundaries.increment));
-
-  init_result = shfl(result,0,loop_boundaries.increment);
+  Impl::cuda_intra_warp_vector_reduce(
+    Impl::Reducer< ValueType , JoinType >( join , & result ) );
+
 #endif
 }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
index ad9cca26ce2463df58820da78a3fb2e16c2a351c..79b3867ba24a87e787faac051c21abf6a99795de 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -55,15 +55,163 @@
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Error.hpp>
 #include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
 namespace Impl {
 
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl( *reinterpret_cast<int const *>(&in) , lane , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl( T & out , T const & in , int lane ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_down( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
 
+//----------------------------------------------------------------------------
 
-//Shfl based reductions
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
+{
+  *reinterpret_cast<int*>(&out) =
+    __shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
+}
+
+template< typename T >
+__device__ inline
+void cuda_shfl_up( T & out , T const & in , int delta ,
+  typename std::enable_if
+    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+    , int >::type width )
+{
+  enum : int { N = sizeof(T) / sizeof(int) };
+
+  for ( int i = 0 ; i < N ; ++i ) {
+    reinterpret_cast<int*>(&out)[i] =
+      __shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
+  }
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Reduce within a warp over blockDim.x, the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Requires:
+ *    blockDim.x is power of 2
+ *    blockDim.x <= 32 (one warp)
+ *
+ *  Cannot use "butterfly" pattern because floating point
+ *  addition is non-associative, so lanes could obtain different
+ *  results.  Therefore, must broadcast the final result.
+ */
+template< class Reducer >
+__device__ inline
+void cuda_intra_warp_vector_reduce( Reducer const & reducer )
+{
+  static_assert(
+    std::is_reference< typename Reducer::reference_type >::value , "" );
+
+  if ( 1 < blockDim.x ) {
+
+    typename Reducer::value_type tmp ;
+
+    for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
+
+      cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
+
+      if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
+    }
+
+    // Broadcast from root "lane" to all other "lanes"
+
+    cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
+  }
+}
+
+/** \brief  Inclusive scan over blockDim.x, the "vector" dimension.
+ *
+ *  This will be called within a nested, intra-team parallel operation.
+ *  Use shuffle operations to avoid conflicts with shared memory usage.
+ *
+ *  Algorithm is concurrent bottom-up reductions in triangular pattern
+ *  where each CUDA thread is the root of a reduction tree from the
+ *  zeroth CUDA thread to itself.
+ *
+ *  Requires:
+ *    blockDim.x is power of 2
+ *    blockDim.x <= 32 (one warp)
+ */
+template< typename ValueType >
+__device__ inline
+void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
+{
+  ValueType tmp ;
+
+  // Bottom up:
+  //   [t] += [t-1] if t >= 1
+  //   [t] += [t-2] if t >= 2
+  //   [t] += [t-4] if t >= 4
+  // ...
+
+  for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
+
+    cuda_shfl_up( tmp , local , i , blockDim.x );
+
+    if ( i <= threadIdx.x ) { local += tmp ; }
+  }
+}
+
+//----------------------------------------------------------------------------
 /*
  *  Algorithmic constraints:
  *   (a) threads with same threadIdx.y have same value
@@ -98,7 +246,10 @@ inline void cuda_inter_warp_reduction( ValueType& value,
                                        const int max_active_thread = blockDim.y) {
 
   #define STEP_WIDTH 4
-  __shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
+  // Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
+  // The reason not to use ValueType directly is that for types with constructors it
+  // could lead to race conditions.
+  __shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
   ValueType* result = (ValueType*) & sh_result;
   const unsigned step = 32 / blockDim.x;
   unsigned shift = STEP_WIDTH;
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
index c96b8b7d40666830032ee560840cddcc9e52fe04..cf3e55d50cf416cbb6a268c85602e7c7dd8fa4e2 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.cpp
@@ -91,7 +91,7 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
       // Loop by priority and then type
       for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
         for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
-          task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
+          task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
index 479294f3078a4e0d055610cb38b599415bbac921..a13e37837d8005867f1087b827a4d7e59ebd3209 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp
@@ -61,6 +61,8 @@ void set_cuda_task_base_apply_function_pointer
 
 }
 
+template< class > class TaskExec ;
+
 template<>
 class TaskQueueSpecialization< Kokkos::Cuda >
 {
@@ -69,6 +71,7 @@ public:
   using execution_space = Kokkos::Cuda ;
   using memory_space    = Kokkos::CudaUVMSpace ;
   using queue_type      = TaskQueue< execution_space > ;
+  using member_type     = TaskExec< Kokkos::Cuda > ;
 
   static
   void iff_single_thread_recursive_execute( queue_type * const ) {}
@@ -79,13 +82,15 @@ public:
   static
   void execute( queue_type * const );
 
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
+  typename TaskType::function_type
+  get_function_pointer()
     {
-      using TaskType = TaskBase< execution_space
-                               , typename FunctorType::value_type
-                               , FunctorType > ;
+      using function_type = typename TaskType::function_type ;
+
+      function_type * const ptr =
+        (function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
 
       CUDA_SAFE_CALL( cudaDeviceSynchronize() );
 
@@ -93,6 +98,8 @@ public:
 
       CUDA_SAFE_CALL( cudaGetLastError() );
       CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+
+      return *ptr ;
     }
 };
 
@@ -435,18 +442,26 @@ void parallel_reduce
 // blockDim.y == team_size
 // threadIdx.x == position in vec
 // threadIdx.y == member number
-template< typename ValueType, typename iType, class Lambda >
+template< typename iType, class Closure >
 KOKKOS_INLINE_FUNCTION
 void parallel_scan
   (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
-   const Lambda & lambda) {
+   const Closure & closure )
+{
+  // Extract value_type from closure
 
-  ValueType accum = 0 ;
-  ValueType val, y, local_total;
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+  value_type val, y, local_total;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     val = 0;
-    lambda(i,val,false);
+    closure(i,val,false);
 
     // intra-blockDim.y exclusive scan on 'val'
     // accum = accumulated, sum in total for this iteration
@@ -458,7 +473,7 @@ void parallel_scan
     }
 
     // pass accum to all threads
-    local_total = shfl_warp_broadcast<ValueType>(val,
+    local_total = shfl_warp_broadcast<value_type>(val,
                                             threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
                                             Impl::CudaTraits::WarpSize);
 
@@ -467,7 +482,7 @@ void parallel_scan
     if ( threadIdx.y == 0 ) { val = 0 ; }
 
     val += accum;
-    lambda(i,val,true);
+    closure(i,val,true);
     accum += local_total;
   }
 }
@@ -478,18 +493,26 @@ void parallel_scan
 // blockDim.y == team_size
 // threadIdx.x == position in vec
 // threadIdx.y == member number
-template< typename iType, class Lambda, typename ValueType >
+template< typename iType, class Closure >
 KOKKOS_INLINE_FUNCTION
 void parallel_scan
   (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
-   const Lambda & lambda)
+   const Closure & closure )
 {
-  ValueType accum = 0 ;
-  ValueType val, y, local_total;
+  // Extract value_type from closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+  value_type val, y, local_total;
 
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
     val = 0;
-    lambda(i,val,false);
+    closure(i,val,false);
 
     // intra-blockDim.x exclusive scan on 'val'
     // accum = accumulated, sum in total for this iteration
@@ -501,14 +524,14 @@ void parallel_scan
     }
 
     // pass accum to all threads
-    local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
+    local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
 
     // make EXCLUSIVE scan by shifting values over one
     val = Kokkos::shfl_up(val, 1, blockDim.x);
     if ( threadIdx.x == 0 ) { val = 0 ; }
 
     val += accum;
-    lambda(i,val,true);
+    closure(i,val,true);
     accum += local_total;
   }
 }
diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
index 4e1ce855c5efc9f8ecb414096b87ea14728967f9..a450ca36ae1bb0049c2abd142e20733edcaf2f7c 100644
--- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
+++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp
@@ -44,36 +44,47 @@
 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
 
+#include <initializer_list>
+
+#include<impl/KokkosExp_Host_IterateTile.hpp>
 #include <Kokkos_ExecPolicy.hpp>
 #include <Kokkos_Parallel.hpp>
-#include <initializer_list>
 
-#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
-#define KOKKOS_IMPL_MDRANGE_IVDEP
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
 #endif
 
 namespace Kokkos { namespace Experimental {
 
+// ------------------------------------------------------------------ //
+
 enum class Iterate
 {
   Default, // Default for the device
   Left,    // Left indices stride fastest
   Right,   // Right indices stride fastest
-  Flat,    // Do not tile, only valid for inner direction
 };
 
 template <typename ExecSpace>
 struct default_outer_direction
 {
   using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
   static constexpr Iterate value = Iterate::Right;
+  #endif
 };
 
 template <typename ExecSpace>
 struct default_inner_direction
 {
   using type = Iterate;
+  #if defined( KOKKOS_ENABLE_CUDA)
+  static constexpr Iterate value = Iterate::Left;
+  #else
   static constexpr Iterate value = Iterate::Right;
+  #endif
 };
 
 
@@ -86,7 +97,7 @@ struct Rank
 {
   static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
   static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
-  static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
+  static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
 
   using iteration_pattern = Rank<N, OuterDir, InnerDir>;
 
@@ -96,515 +107,370 @@ struct Rank
 };
 
 
-
 // multi-dimensional iteration pattern
 template <typename... Properties>
 struct MDRangePolicy
+  : public Kokkos::Impl::PolicyTraits<Properties ...>
 {
+  using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
   using range_policy = RangePolicy<Properties...>;
 
-  static_assert( !std::is_same<range_policy,void>::value
+  using impl_range_policy = RangePolicy< typename traits::execution_space
+                                       , typename traits::schedule_type
+                                       , typename traits::index_type
+                                       > ;
+
+  static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
                , "Kokkos Error: MD iteration pattern not defined" );
 
-  using iteration_pattern   = typename range_policy::iteration_pattern;
-  using work_tag            = typename range_policy::work_tag;
+  using iteration_pattern   = typename traits::iteration_pattern;
+  using work_tag            = typename traits::work_tag;
 
   static constexpr int rank = iteration_pattern::rank;
 
   static constexpr int outer_direction = static_cast<int> (
-      (iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
+      (iteration_pattern::outer_direction != Iterate::Default)
     ? iteration_pattern::outer_direction
-    : default_outer_direction< typename range_policy::execution_space>::value );
+    : default_outer_direction< typename traits::execution_space>::value );
 
   static constexpr int inner_direction = static_cast<int> (
       iteration_pattern::inner_direction != Iterate::Default
     ? iteration_pattern::inner_direction
-    : default_inner_direction< typename range_policy::execution_space>::value ) ;
+    : default_inner_direction< typename traits::execution_space>::value ) ;
 
 
   // Ugly ugly workaround intel 14 not handling scoped enum correctly
-  static constexpr int Flat = static_cast<int>( Iterate::Flat );
   static constexpr int Right = static_cast<int>( Iterate::Right );
-
-
-  using size_type   = typename range_policy::index_type;
-  using index_type  = typename std::make_signed<size_type>::type;
-
-
-  template <typename I>
-  MDRangePolicy( std::initializer_list<I> upper_corner )
+  static constexpr int Left  = static_cast<int>( Iterate::Left );
+
+  using index_type  = typename traits::index_type;
+  using array_index_type = long;
+  using point_type  = Kokkos::Array<array_index_type,rank>; //was index_type
+  using tile_type   = Kokkos::Array<array_index_type,rank>;
+  // If point_type or tile_type were templated on an unsigned integral type,
+  // a user passing an initializer_list of runtime-determined (non-const)
+  // signed integral values would receive a compiler error, because that is
+  // an invalid (narrowing) implicit conversion:
+  // "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
+  // The user would then have to either pass a matching index_type as a
+  // template parameter to the MDRangePolicy or static_cast the individual values.
+
+  MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
+    : m_lower(lower)
+    , m_upper(upper)
+    , m_tile(tile)
+    , m_num_tiles(1)
   {
-    static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-
-    //static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
-
-    const auto u = upper_corner.begin();
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(0);
-      m_dim[i]    = static_cast<index_type>(u[i]);
-      if (inner_direction != Flat) {
-        // default tile size to 4
-        m_tile[i] = 4;
-      } else {
-        m_tile[i] = 1;
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = upper[i] - lower[i];
+        if ( m_tile[i] <= 0 ) { // no usable tile size supplied for dimension i: pick a default
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = (span == 0 ? 1 : span); // never 0: m_tile[i] is the divisor just below, and an empty range has span 0
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
       }
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
     }
-  }
-
-  template <typename IA, typename IB>
-  MDRangePolicy( std::initializer_list<IA> corner_a
-               , std::initializer_list<IB> corner_b
-               )
-  {
-    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
-    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
-    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
-
-
-    using A = typename std::make_signed<IA>::type;
-    using B = typename std::make_signed<IB>::type;
-
-    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
-    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
-      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
-      if (inner_direction != Flat) {
-        // default tile size to 4
-        m_tile[i] = 4;
-      } else {
-        m_tile[i] = 1;
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = upper[i] - lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
+      index_type total_tile_size_check = 1;
+      for (int i=0; i<rank; ++i) {
+        total_tile_size_check *= m_tile[i];
+      }
+      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
       }
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
-    }
-  }
-
-  template <typename IA, typename IB, typename T>
-  MDRangePolicy( std::initializer_list<IA> corner_a
-               , std::initializer_list<IB> corner_b
-               , std::initializer_list<T> tile
-               )
-  {
-    static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
-    static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
-    static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
-    static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
-
-    // TODO check size of lists equal to rank
-    // static_asserts on initializer_list.size() require c++14
-    //static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
-    //static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
-    //static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
-
-    using A = typename std::make_signed<IA>::type;
-    using B = typename std::make_signed<IB>::type;
-
-    const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
-    const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
-    const auto t = tile.begin();
-
-    m_num_tiles = 1;
-    for (int i=0; i<rank; ++i) {
-      m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
-      m_dim[i]    = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
-      m_tile[i]   = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
-      m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
-      m_num_tiles *= m_tile_dim[i];
     }
+    #endif
   }
 
-  index_type   m_offset[rank];
-  index_type   m_dim[rank];
-  int          m_tile[rank];
-  index_type   m_tile_dim[rank];
-  size_type    m_num_tiles;       // product of tile dims
-};
-
-namespace Impl {
 
-// Serial, Threads, OpenMP
-// use enable_if to overload for Cuda
-template < typename MDRange, typename Functor, typename Enable = void >
-struct MDForFunctor
-{
-  using work_tag   = typename MDRange::work_tag;
-  using index_type = typename MDRange::index_type;
-  using size_type  = typename MDRange::size_type;
-
-  MDRange m_range;
-  Functor m_func;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange const& range, Functor const& f )
-    : m_range(range)
-    , m_func( f )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange const& range, Functor && f )
-    : m_range(range)
-    , m_func( std::forward<Functor>(f) )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange && range, Functor const& f )
-    : m_range( std::forward<MDRange>(range) )
-    , m_func( f )
-  {}
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDRange && range, Functor && f )
-    : m_range( std::forward<MDRange>(range) )
-    , m_func( std::forward<Functor>(f) )
-  {}
-
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDForFunctor const& ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor& operator=( MDForFunctor const& ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor( MDForFunctor && ) = default;
-
-  KOKKOS_INLINE_FUNCTION
-  MDForFunctor& operator=( MDForFunctor && ) = default;
-
-  // Rank-2, Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
+  template < typename LT , typename UT , typename TT = array_index_type >
+  MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
   {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
-            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
-    } else {
-      m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
+#if 0
+    // This should work, less duplicated code but not yet extensively tested
+    point_type lower_tmp, upper_tmp;
+    tile_type tile_tmp;
+    for ( auto i = 0; i < rank; ++i ) {
+      lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
+      upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
+      tile_tmp[i]  = static_cast<array_index_type>(tile.begin()[i]);
     }
-  }
 
-  // Rank-2, Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
-            , m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
-    } else {
-      m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
-    }
-  }
+    MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
 
-  // Rank-2, Not Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    index_type t0, t1;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      t0 = t / m_range.m_tile_dim[1];
-      t1 = t % m_range.m_tile_dim[1];
-    } else {
-      t0 = t % m_range.m_tile_dim[0];
-      t1 = t / m_range.m_tile_dim[0];
-    }
+#else
+    if(static_cast<int>(lower.size()) != rank || static_cast<int>(upper.size()) != rank) // validate the argument lists: they are indexed up to rank below
+      Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i1=b1; i1<e1; ++i1) {
-        m_func( i0, i1 );
-      }}
-    } else {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( i0, i1 );
-      }}
+    for ( auto i = 0; i < rank; ++i ) {
+      m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
+      m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
+      if(tile.size()==rank)
+        m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
+      else
+        m_tile[i] = 0;
     }
-  }
 
-  // Rank-2, Not Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 2
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    work_tag tag;
-
-    index_type t0, t1;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      t0 = t / m_range.m_tile_dim[1];
-      t1 = t % m_range.m_tile_dim[1];
-    } else {
-      t0 = t % m_range.m_tile_dim[0];
-      t1 = t / m_range.m_tile_dim[0];
-    }
+    m_num_tiles = 1;
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i1=b1; i1<e1; ++i1) {
-        m_func( tag, i0, i1 );
-      }}
-    } else {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( tag, i0, i1 );
-      }}
-    }
-  }
 
-  //---------------------------------------------------------------------------
-
-  // Rank-3, Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-    const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
-    m_func( m_range.m_offset[0] + (  t / tmp_prod )
-          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
-          , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
-          );
-    } else {
-    const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
-    m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
-          , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
-          , m_range.m_offset[2] + (  t / tmp_prod )
-          );
+    // Host
+    if ( true
+       #if defined(KOKKOS_ENABLE_CUDA)
+         && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
+       #endif
+       )
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) { // no usable tile size supplied for dimension i: pick a default
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = (span == 0 ? 1 : span); // never 0: m_tile[i] is the divisor just below, and an empty range has span 0
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
     }
-  }
-
-  // Rank-3, Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction == MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
-      m_func( work_tag{}
-            , m_range.m_offset[0] + (  t / tmp_prod )
-            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
-            , m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
-            );
-    } else {
-      const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
-      m_func( work_tag{}
-            , m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
-            , m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
-            , m_range.m_offset[2] + (  t / tmp_prod )
-            );
+    #if defined(KOKKOS_ENABLE_CUDA)
+    else // Cuda
+    {
+      index_type span;
+      for (int i=0; i<rank; ++i) {
+        span = m_upper[i] - m_lower[i];
+        if ( m_tile[i] <= 0 ) {
+          // TODO: determine what is a good default tile size for cuda
+          // may be rank dependent
+          if (  (inner_direction == Right && (i < rank-1))
+              || (inner_direction == Left && (i > 0)) )
+          {
+            m_tile[i] = 2;
+          }
+          else {
+            m_tile[i] = 16;
+          }
+        }
+        m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
+        m_num_tiles *= m_tile_end[i];
+      }
+      index_type total_tile_size_check = 1;
+      for (int i=0; i<rank; ++i) {
+        total_tile_size_check *= m_tile[i];
+      }
+      if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
+        printf(" Tile dimensions exceed Cuda limits\n");
+        Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+        //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
+      }
     }
+    #endif
+#endif
   }
 
-  // Rank-3, Not Flat, No Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    index_type t0, t1, t2;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
-      t0 = t / tmp_prod;
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
-      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
-    } else {
-      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
-      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
-      t2 = t / tmp_prod;
-    }
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i2=b2; i2<e2; ++i2) {
-        m_func( i0, i1, i2 );
-      }}}
-    } else {
-      for (int i2=b2; i2<e2; ++i2) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( i0, i1, i2 );
-      }}}
-    }
-  }
+  point_type m_lower;
+  point_type m_upper;
+  tile_type  m_tile;
+  point_type m_tile_end;
+  index_type m_num_tiles;
+};
+// ------------------------------------------------------------------ //
 
-  // Rank-3, Not Flat, Tag
-  template <typename Idx>
-  KOKKOS_FORCEINLINE_FUNCTION
-  typename std::enable_if<(  std::is_integral<Idx>::value
-                          && !std::is_same<void, work_tag>::value
-                          && MDRange::rank == 3
-                          && MDRange::inner_direction != MDRange::Flat
-                          )>::type
-  operator()(Idx t) const
-  {
-    work_tag tag;
-
-    index_type t0, t1, t2;
-    if (  MDRange::outer_direction == MDRange::Right ) {
-      const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
-      t0 = t / tmp_prod;
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
-      t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
-    } else {
-      const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
-      t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
-      t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
-      t2 = t / tmp_prod;
-    }
+// ------------------------------------------------------------------ //
+//md_parallel_for
+// ------------------------------------------------------------------ //
+template <typename MDRange, typename Functor, typename Enable = void>
+void md_parallel_for( MDRange const& range
+                    , Functor const& f
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
-    const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
-    const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
-    const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
-
-    const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
-    const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
-    const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
-
-    if (  MDRange::inner_direction == MDRange::Right ) {
-      for (int i0=b0; i0<e0; ++i0) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i2=b2; i2<e2; ++i2) {
-        m_func( tag, i0, i1, i2 );
-      }}}
-    } else {
-      for (int i2=b2; i2<e2; ++i2) {
-      for (int i1=b1; i1<e1; ++i1) {
-      #if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
-      #pragma ivdep
-      #endif
-      for (int i0=b0; i0<e0; ++i0) {
-        m_func( tag, i0, i1, i2 );
-      }}}
-    }
-  }
-};
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
+
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
 
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, void> g(range, f);
 
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
 
-} // namespace Impl
+  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+}
 
+// Cuda specialization
+#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
+template <typename MDRange, typename Functor>
+void md_parallel_for( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
 
 template <typename MDRange, typename Functor>
 void md_parallel_for( MDRange const& range
                     , Functor const& f
                     , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
                     )
 {
-  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
+  closure.execute();
+}
+#endif
+// ------------------------------------------------------------------ //
 
-  using range_policy = typename MDRange::range_policy;
+// ------------------------------------------------------------------ //
+//md_parallel_reduce
+// ------------------------------------------------------------------ //
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
 
-  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
-template <typename MDRange, typename Functor>
-void md_parallel_for( const std::string& str
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( const std::string& str
                     , MDRange const& range
                     , Functor const& f
+                    , ValueType & v
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
                     )
 {
-  Impl::MDForFunctor<MDRange, Functor> g(range, f);
+  Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
 
-  using range_policy = typename MDRange::range_policy;
+  //using range_policy = typename MDRange::range_policy;
+  using range_policy = typename MDRange::impl_range_policy;
 
-  Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
+  Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
 }
 
+// Cuda - parallel_reduce not implemented yet
+/*
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , const std::string& str = ""
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
+  closure.execute();
+}
+
+template <typename MDRange, typename Functor, typename ValueType>
+void md_parallel_reduce( const std::string& str
+                    , MDRange const& range
+                    , Functor const& f
+                    , ValueType & v
+                    , typename std::enable_if<( true
+                      #if defined( KOKKOS_ENABLE_CUDA)
+                      && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
+                      #endif
+                      ) >::type* = 0
+                    )
+{
+  Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
+  closure.execute();
+}
+*/
+
 }} // namespace Kokkos::Experimental
 
 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp
index 8deb5142c4352021c4305b422508b21f8524e108..abb263b7ccd7d6f82f469d06fadbc2326fe21438 100644
--- a/lib/kokkos/core/src/Kokkos_Array.hpp
+++ b/lib/kokkos/core/src/Kokkos_Array.hpp
@@ -59,8 +59,14 @@ template< class T      = void
         , class Proxy  = void
         >
 struct Array {
-private:
-  T m_elem[N];
+public:
+  /**
+   * The elements of this C array shall not be accessed directly. The data
+   * member has to be declared public to enable aggregate initialization as for
+   * std::array. We mark it as private in the documentation.
+   * @private
+   */
+  T m_internal_implementation_private_member_data[N];
 public:
 
   typedef T &                                 reference ;
@@ -78,25 +84,32 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
-      return m_elem[i];
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
     }
 
   template< typename iType >
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
-      return m_elem[i];
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
+      return m_internal_implementation_private_member_data[i];
     }
 
-  KOKKOS_INLINE_FUNCTION pointer       data()       { return & m_elem[0] ; }
-  KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
+  KOKKOS_INLINE_FUNCTION pointer       data()
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
+  KOKKOS_INLINE_FUNCTION const_pointer data() const
+    {
+      return & m_internal_implementation_private_member_data[0];
+    }
 
-  ~Array() = default ;
-  Array() = default ;
-  Array( const Array & ) = default ;
-  Array & operator = ( const Array & ) = default ;
+  // Do not default unless move and move-assignment are also defined
+  // ~Array() = default ;
+  // Array() = default ;
+  // Array( const Array & ) = default ;
+  // Array & operator = ( const Array & ) = default ;
 
   // Some supported compilers are not sufficiently C++11 compliant
   // for default move constructor and move assignment operator.
@@ -124,7 +137,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   value_type operator[]( const iType & )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
       return value_type();
     }
 
@@ -132,7 +145,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   value_type operator[]( const iType & ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integer argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
       return value_type();
     }
 
@@ -181,7 +194,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i];
     }
 
@@ -189,7 +202,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i];
     }
 
@@ -250,7 +263,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   reference operator[]( const iType & i )
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i*m_stride];
     }
 
@@ -258,7 +271,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   const_reference operator[]( const iType & i ) const
     {
-      static_assert( std::is_integral<iType>::value , "Must be integral argument" );
+      static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
       return m_elem[i*m_stride];
     }
 
diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp
index 3f9bdea40da551332852448b3b7fb68952bd1875..cfcdabf95e3e085cf388f14e99fb6b4db3d8c654 100644
--- a/lib/kokkos/core/src/Kokkos_Concepts.hpp
+++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp
@@ -102,6 +102,7 @@ KOKKOS_IMPL_IS_CONCEPT( memory_traits )
 KOKKOS_IMPL_IS_CONCEPT( execution_space )
 KOKKOS_IMPL_IS_CONCEPT( execution_policy )
 KOKKOS_IMPL_IS_CONCEPT( array_layout )
+KOKKOS_IMPL_IS_CONCEPT( reducer )
 
 namespace Impl {
 
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index 6d92f4bf616a057bb83cc34d38ab872e77281608..16c1bce902d47f38a1cd455df8f8900d3e73c0a5 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -57,6 +57,10 @@
 #include <Kokkos_OpenMP.hpp>
 #endif
 
+#if defined( KOKKOS_ENABLE_QTHREADS )
+#include <Kokkos_Qthreads.hpp>
+#endif
+
 #if defined( KOKKOS_ENABLE_PTHREAD )
 #include <Kokkos_Threads.hpp>
 #endif
@@ -76,6 +80,7 @@
 
 #include <Kokkos_Complex.hpp>
 
+#include <iosfwd>
 
 //----------------------------------------------------------------------------
 
@@ -105,6 +110,9 @@ void finalize_all();
 
 void fence();
 
+/** \brief Print "Bill of Materials" */
+void print_configuration( std::ostream & , const bool detail = false );
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -159,4 +167,3 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 //----------------------------------------------------------------------------
 
 #endif
-
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index e7e6a49d379045b2da38c7b53fdde589a989adec..4029bf599c6b564a8bc6bb2b6d20f9472fe19be5 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -63,7 +63,7 @@ namespace Kokkos {
 
 struct AUTO_t {
   KOKKOS_INLINE_FUNCTION
-  constexpr const AUTO_t & operator()() const { return *this ; }
+  constexpr const AUTO_t & operator()() const { return *this; }
 };
 
 namespace {
@@ -73,46 +73,49 @@ constexpr AUTO_t AUTO = Kokkos::AUTO_t();
 
 struct InvalidType {};
 
-}
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Forward declarations for class inter-relationships
 
 namespace Kokkos {
 
-class HostSpace ; ///< Memory space for main process and CPU execution spaces
+class HostSpace; ///< Memory space for main process and CPU execution spaces
 
 #ifdef KOKKOS_ENABLE_HBWSPACE
 namespace Experimental {
-class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
+class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
 }
 #endif
 
 #if defined( KOKKOS_ENABLE_SERIAL )
-class Serial ;    ///< Execution space main process on CPU
-#endif // defined( KOKKOS_ENABLE_SERIAL )
+class Serial;    ///< Execution space main process on CPU.
+#endif
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+class Qthreads;  ///< Execution space with Qthreads back-end.
+#endif
 
 #if defined( KOKKOS_ENABLE_PTHREAD )
-class Threads ;  ///< Execution space with pthreads back-end
+class Threads;   ///< Execution space with pthreads back-end.
 #endif
 
 #if defined( KOKKOS_ENABLE_OPENMP )
-class OpenMP ; ///< OpenMP execution space
+class OpenMP;    ///< OpenMP execution space.
 #endif
 
 #if defined( KOKKOS_ENABLE_CUDA )
-class CudaSpace ;            ///< Memory space on Cuda GPU
-class CudaUVMSpace ;         ///< Memory space on Cuda GPU with UVM
-class CudaHostPinnedSpace ;  ///< Memory space on Host accessible to Cuda GPU
-class Cuda ;                 ///< Execution space for Cuda GPU
+class CudaSpace;            ///< Memory space on Cuda GPU
+class CudaUVMSpace;         ///< Memory space on Cuda GPU with UVM
+class CudaHostPinnedSpace;  ///< Memory space on Host accessible to Cuda GPU
+class Cuda;                 ///< Execution space for Cuda GPU
 #endif
 
 template<class ExecutionSpace, class MemorySpace>
 struct Device;
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Set the default execution space.
 
@@ -122,60 +125,66 @@ struct Device;
 
 namespace Kokkos {
 
-#if   defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-  typedef Cuda DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef OpenMP DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Threads DefaultExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-  typedef Serial DefaultExecutionSpace ;
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Cuda DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultExecutionSpace;
 #else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
 #endif
 
-#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef OpenMP DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Threads DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-  typedef Serial DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_OPENMP )
-  typedef OpenMP DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_PTHREAD )
-  typedef Threads DefaultHostExecutionSpace ;
-#elif defined ( KOKKOS_ENABLE_SERIAL )
-  typedef Serial DefaultHostExecutionSpace ;
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_PTHREAD )
+  typedef Threads DefaultHostExecutionSpace;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Qthreads DefaultHostExecutionSpace;
+#elif defined( KOKKOS_ENABLE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace;
 #else
-#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
 #endif
 
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 // Detect the active execution space and define its memory space.
 // This is used to verify whether a running kernel can access
 // a given memory space.
 
 namespace Kokkos {
+
 namespace Impl {
 
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_ENABLE_CUDA)
-typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace;
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace;
 #else
-typedef void ActiveExecutionMemorySpace ;
+typedef void ActiveExecutionMemorySpace;
 #endif
 
-template< class ActiveSpace , class MemorySpace >
+template< class ActiveSpace, class MemorySpace >
 struct VerifyExecutionCanAccessMemorySpace {
   enum {value = 0};
 };
 
 template< class Space >
-struct VerifyExecutionCanAccessMemorySpace< Space , Space >
+struct VerifyExecutionCanAccessMemorySpace< Space, Space >
 {
   enum {value = 1};
   KOKKOS_INLINE_FUNCTION static void verify(void) {}
@@ -183,33 +192,33 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
-    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
 
 #define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
-    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
+    Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
 
 //----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 
 namespace Kokkos {
   void fence();
 }
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template< class Functor
         , class Policy
         , class EnableFunctor = void
-	      , class EnablePolicy = void
+        , class EnablePolicy = void
         >
 struct FunctorPolicyExecutionSpace;
 
@@ -220,18 +229,18 @@ struct FunctorPolicyExecutionSpace;
 ///
 /// This is an implementation detail of parallel_for.  Users should
 /// skip this and go directly to the nonmember function parallel_for.
-template< class FunctorType , class ExecPolicy , class ExecutionSpace =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelFor ;
+template< class FunctorType, class ExecPolicy, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelFor;
 
 /// \class ParallelReduce
 /// \brief Implementation detail of parallel_reduce.
 ///
 /// This is an implementation detail of parallel_reduce.  Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelReduce ;
+template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelReduce;
 
 /// \class ParallelScan
 /// \brief Implementation detail of parallel_scan.
@@ -239,10 +248,12 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
 /// This is an implementation detail of parallel_scan.  Users should
 /// skip this and go directly to the documentation of the nonmember
 /// template function Kokkos::parallel_scan.
-template< class FunctorType , class ExecPolicy , class ExecutionSapce =
-          typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
-        > class ParallelScan ;
+template< class FunctorType, class ExecPolicy, class ExecutionSpace =
+          typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
+        > class ParallelScan;
 
-}}
-#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+} // namespace Impl
+
+} // namespace Kokkos
 
+#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
index afccdb6c5246b8a9778346d2db9065eb68ab7db0..433cac5e518cfbb40a413e1b5984994d54bfacbd 100644
--- a/lib/kokkos/core/src/Kokkos_Cuda.hpp
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -62,7 +62,6 @@
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
-#include <KokkosExp_MDRangePolicy.hpp>
 
 /*--------------------------------------------------------------------------*/
 
@@ -295,6 +294,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
 #include <Cuda/Kokkos_Cuda_Task.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
 //----------------------------------------------------------------------------
 
 #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index d6bf8dcdf4520224fe238ec7eb3cc90754bd3838..fc39ce0e5bc04c4a9f2c6ee91580dbc43a45d8ef 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,14 +44,16 @@
 #ifndef KOKKOS_HBWSPACE_HPP
 #define KOKKOS_HBWSPACE_HPP
 
-
 #include <Kokkos_HostSpace.hpp>
 
 /*--------------------------------------------------------------------------*/
+
 #ifdef KOKKOS_ENABLE_HBWSPACE
 
 namespace Kokkos {
+
 namespace Experimental {
+
 namespace Impl {
 
 /// \brief Initialize lock array for arbitrary size atomics.
@@ -67,7 +69,7 @@ void init_lock_array_hbw_space();
 /// This function tries to aquire the lock for the hash value derived
 /// from the provided ptr. If the lock is successfully aquired the
 /// function returns true. Otherwise it returns false.
-bool lock_address_hbw_space(void* ptr);
+bool lock_address_hbw_space( void* ptr );
 
 /// \brief Release lock for the address
 ///
@@ -75,13 +77,16 @@ bool lock_address_hbw_space(void* ptr);
 /// from the provided ptr. This function should only be called
 /// after previously successfully aquiring a lock with
 /// lock_address.
-void unlock_address_hbw_space(void* ptr);
+void unlock_address_hbw_space( void* ptr );
 
 } // namespace Impl
-} // neamspace Experimental
+
+} // namespace Experimental
+
 } // namespace Kokkos
 
 namespace Kokkos {
+
 namespace Experimental {
 
 /// \class HBWSpace
@@ -91,10 +96,9 @@ namespace Experimental {
 /// memory means the usual CPU-accessible memory.
 class HBWSpace {
 public:
-
   //! Tag this class as a kokkos memory space
-  typedef HBWSpace  memory_space ;
-  typedef size_t     size_type ;
+  typedef HBWSpace  memory_space;
+  typedef size_t     size_type;
 
   /// \typedef execution_space
   /// \brief Default execution space for this memory space.
@@ -103,21 +107,25 @@ public:
   /// useful for things like initializing a View (which happens in
   /// parallel using the View's default execution space).
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_PTHREAD )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_SERIAL )
-  typedef Kokkos::Serial   execution_space ;
+  typedef Kokkos::Serial    execution_space;
 #else
-#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
 #endif
 
   //! This memory space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
   /*--------------------------------*/
   /* Functions unique to the HBWSpace */
@@ -129,72 +137,73 @@ public:
 
   /**\brief  Default memory space instance */
   HBWSpace();
-  HBWSpace( const HBWSpace & rhs ) = default ;
-  HBWSpace & operator = ( const HBWSpace & ) = default ;
-  ~HBWSpace() = default ;
+  HBWSpace( const HBWSpace & rhs ) = default;
+  HBWSpace & operator = ( const HBWSpace & ) = default;
+  ~HBWSpace() = default;
 
   /**\brief  Non-default memory space instance to choose allocation mechansim, if available */
 
-  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
 
   explicit
   HBWSpace( const AllocationMechanism & );
 
   /**\brief  Allocate untracked memory in the space */
-  void * allocate( const size_t arg_alloc_size ) const ;
+  void * allocate( const size_t arg_alloc_size ) const;
 
   /**\brief  Deallocate untracked memory in the space */
-  void deallocate( void * const arg_alloc_ptr 
-                 , const size_t arg_alloc_size ) const ;
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name();
 
 private:
 
-  AllocationMechanism  m_alloc_mech ;
+  AllocationMechanism  m_alloc_mech;
   static constexpr const char* m_name = "HBW";
-  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
 
 } // namespace Experimental
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
-  : public SharedAllocationRecord< void , void >
+class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
+  : public SharedAllocationRecord< void, void >
 {
 private:
 
-  friend Kokkos::Experimental::HBWSpace ;
+  friend Kokkos::Experimental::HBWSpace;
 
-  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  typedef SharedAllocationRecord< void, void >  RecordBase;
 
-  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
-  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
 
   static void deallocate( RecordBase * );
 
   /**\brief  Root record for tracked allocations from this HBWSpace instance */
-  static RecordBase s_root_record ;
+  static RecordBase s_root_record;
 
-  const Kokkos::Experimental::HBWSpace m_space ;
+  const Kokkos::Experimental::HBWSpace m_space;
 
 protected:
 
   ~SharedAllocationRecord();
-  SharedAllocationRecord() = default ;
+  SharedAllocationRecord() = default;
 
-  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace        & arg_space
-                        , const std::string              & arg_label
-                        , const size_t                     arg_alloc_size
-                        , const RecordBase::function_type  arg_dealloc = & deallocate
+  SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
+                        , const std::string                    & arg_label
+                        , const size_t                           arg_alloc_size
+                        , const RecordBase::function_type        arg_dealloc = & deallocate
                         );
 
 public:
@@ -206,23 +215,23 @@ public:
     }
 
   KOKKOS_INLINE_FUNCTION static
-  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace &  arg_space
-                                   , const std::string       &  arg_label
-                                   , const size_t               arg_alloc_size
+  SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace & arg_space
+                                   , const std::string                    & arg_label
+                                   , const size_t                           arg_alloc_size
                                    )
     {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+      return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
 #else
-      return (SharedAllocationRecord *) 0 ;
+      return (SharedAllocationRecord *) 0;
 #endif
     }
 
   /**\brief  Allocate tracked memory in the space */
   static
   void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
-                         , const std::string & arg_label
-                         , const size_t arg_alloc_size );
+                         , const std::string                    & arg_label
+                         , const size_t                           arg_alloc_size );
 
   /**\brief  Reallocate tracked memory in the space */
   static
@@ -233,88 +242,93 @@ public:
   static
   void deallocate_tracked( void * const arg_alloc_ptr );
 
-
   static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
 
-  static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
+  static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
 };
 
 } // namespace Impl
-} // namespace Kokkos
 
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
-static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
 
 template<>
-struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
+struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
   enum { assignable = true };
   enum { accessible = true };
   enum { deepcopy   = true };
 };
 
 template<>
-struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
+struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
   enum { assignable = false };
   enum { accessible = true };
   enum { deepcopy   = true };
 };
 
-}}
+} // namespace Impl
+
+} // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Impl {
 
+namespace Impl {
 
-template<class ExecutionSpace>
-struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
-template<class ExecutionSpace>
-struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
-template<class ExecutionSpace>
-struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
 {
   enum { value = true };
   inline static void verify( void ) { }
@@ -322,7 +336,7 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experime
 };
 
 template<>
-struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
 {
   enum { value = true };
   inline static void verify( void ) { }
@@ -330,8 +344,9 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kok
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 #endif
-#endif /* #define KOKKOS_HBWSPACE_HPP */
 
+#endif // #define KOKKOS_HBWSPACE_HPP
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
index e79de462bfe354fe5f7eb77100cdcc4e7aca2aef..82006665ce0a6a4ba37ae88ad8e7456d4c75101a 100644
--- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -60,6 +60,7 @@
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
 
 /// \brief Initialize lock array for arbitrary size atomics.
@@ -83,9 +84,10 @@ bool lock_address_host_space(void* ptr);
 /// from the provided ptr. This function should only be called
 /// after previously successfully aquiring a lock with
 /// lock_address.
-void unlock_address_host_space(void* ptr);
+void unlock_address_host_space( void* ptr );
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 namespace Kokkos {
@@ -97,10 +99,9 @@ namespace Kokkos {
 /// memory means the usual CPU-accessible memory.
 class HostSpace {
 public:
-
   //! Tag this class as a kokkos memory space
-  typedef HostSpace  memory_space ;
-  typedef size_t     size_type ;
+  typedef HostSpace  memory_space;
+  typedef size_t     size_type;
 
   /// \typedef execution_space
   /// \brief Default execution space for this memory space.
@@ -109,21 +110,25 @@ public:
   /// useful for things like initializing a View (which happens in
   /// parallel using the View's default execution space).
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_OPENMP )
-  typedef Kokkos::OpenMP   execution_space ;
+  typedef Kokkos::OpenMP    execution_space;
 #elif defined( KOKKOS_ENABLE_PTHREAD )
-  typedef Kokkos::Threads  execution_space ;
+  typedef Kokkos::Threads   execution_space;
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  typedef Kokkos::Qthreads  execution_space;
 #elif defined( KOKKOS_ENABLE_SERIAL )
-  typedef Kokkos::Serial   execution_space ;
+  typedef Kokkos::Serial    execution_space;
 #else
-#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
 #endif
 
   //! This memory space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
   /*--------------------------------*/
   /* Functions unique to the HostSpace */
@@ -135,61 +140,57 @@ public:
 
   /**\brief  Default memory space instance */
   HostSpace();
-  HostSpace( HostSpace && rhs ) = default ;
-  HostSpace( const HostSpace & rhs ) = default ;
-  HostSpace & operator = ( HostSpace && ) = default ;
-  HostSpace & operator = ( const HostSpace & ) = default ;
-  ~HostSpace() = default ;
+  HostSpace( HostSpace && rhs ) = default;
+  HostSpace( const HostSpace & rhs ) = default;
+  HostSpace & operator = ( HostSpace && ) = default;
+  HostSpace & operator = ( const HostSpace & ) = default;
+  ~HostSpace() = default;
 
   /**\brief  Non-default memory space instance to choose allocation mechansim, if available */
 
-  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+  enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
 
   explicit
   HostSpace( const AllocationMechanism & );
 
   /**\brief  Allocate untracked memory in the space */
-  void * allocate( const size_t arg_alloc_size ) const ;
+  void * allocate( const size_t arg_alloc_size ) const;
 
   /**\brief  Deallocate untracked memory in the space */
-  void deallocate( void * const arg_alloc_ptr 
-                 , const size_t arg_alloc_size ) const ;
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
   static constexpr const char* name();
 
 private:
-
-  AllocationMechanism  m_alloc_mech ;
+  AllocationMechanism  m_alloc_mech;
   static constexpr const char* m_name = "Host";
-  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
+  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
 };
 
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
-namespace Impl {
 
-static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
+namespace Impl {
 
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
 
 template< typename S >
 struct HostMirror {
 private:
-
   // If input execution space can access HostSpace then keep it.
   // Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
   enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
-    < typename S::execution_space::memory_space , Kokkos::HostSpace >
-      ::accessible };
+                      < typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
 
   // If HostSpace can access memory space then keep it.
   // Example:  Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
   enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
-    < Kokkos::HostSpace , typename S::memory_space >::accessible };
+                      < Kokkos::HostSpace, typename S::memory_space >::accessible };
 
 public:
 
@@ -202,42 +203,41 @@ public:
                         , typename S::memory_space >
         , Kokkos::HostSpace
         >::type
-    >::type  Space ;
+    >::type  Space;
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-class SharedAllocationRecord< Kokkos::HostSpace , void >
-  : public SharedAllocationRecord< void , void >
+class SharedAllocationRecord< Kokkos::HostSpace, void >
+  : public SharedAllocationRecord< void, void >
 {
 private:
+  friend Kokkos::HostSpace;
 
-  friend Kokkos::HostSpace ;
-
-  typedef SharedAllocationRecord< void , void >  RecordBase ;
+  typedef SharedAllocationRecord< void, void >  RecordBase;
 
-  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
-  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
 
   static void deallocate( RecordBase * );
 
   /**\brief  Root record for tracked allocations from this HostSpace instance */
-  static RecordBase s_root_record ;
+  static RecordBase s_root_record;
 
-  const Kokkos::HostSpace m_space ;
+  const Kokkos::HostSpace m_space;
 
 protected:
-
   ~SharedAllocationRecord();
-  SharedAllocationRecord() = default ;
+  SharedAllocationRecord() = default;
 
   SharedAllocationRecord( const Kokkos::HostSpace        & arg_space
                         , const std::string              & arg_label
@@ -249,22 +249,23 @@ public:
 
   inline
   std::string get_label() const
-    {
-      return std::string( RecordBase::head()->m_label );
-    }
+  {
+    return std::string( RecordBase::head()->m_label );
+  }
 
   KOKKOS_INLINE_FUNCTION static
   SharedAllocationRecord * allocate( const Kokkos::HostSpace &  arg_space
                                    , const std::string       &  arg_label
                                    , const size_t               arg_alloc_size
                                    )
-    {
+  {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+    return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
 #else
-      return (SharedAllocationRecord *) 0 ;
+    return (SharedAllocationRecord *) 0;
 #endif
-    }
+  }
+   
 
   /**\brief  Allocate tracked memory in the space */
   static
@@ -281,37 +282,37 @@ public:
   static
   void deallocate_tracked( void * const arg_alloc_ptr );
 
-
   static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
 
-  static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
+  static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
+
 namespace Impl {
 
-template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
+template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
 
-template<class ExecutionSpace>
-struct DeepCopy<HostSpace,HostSpace,ExecutionSpace> {
-  DeepCopy( void * dst , const void * src , size_t n ) {
-    memcpy( dst , src , n );
+template< class ExecutionSpace >
+struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
+  DeepCopy( void * dst, const void * src, size_t n ) {
+    memcpy( dst, src, n );
   }
-  DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
+
+  DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
     exec.fence();
-    memcpy( dst , src , n );
+    memcpy( dst, src, n );
   }
 };
 
 } // namespace Impl
-} // namespace Kokkos
-
 
-#endif /* #define KOKKOS_HOSTSPACE_HPP */
+} // namespace Kokkos
 
+#endif // #define KOKKOS_HOSTSPACE_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 52845b9e093bcc6cd363b144ac59df0bda8bb124..c138b08c94a5a9f93e7faeb067283a221486cb4a 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -45,22 +45,20 @@
 #define KOKKOS_MACROS_HPP
 
 //----------------------------------------------------------------------------
-/** Pick up configure/build options via #define macros:
+/** Pick up configure / build options via #define macros:
  *
  *  KOKKOS_ENABLE_CUDA                Kokkos::Cuda execution and memory spaces
  *  KOKKOS_ENABLE_PTHREAD             Kokkos::Threads execution space
- *  KOKKOS_ENABLE_QTHREAD             Kokkos::Qthread execution space
- *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP  execution space
- *  KOKKOS_ENABLE_HWLOC               HWLOC library is available
- *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK    insert array bounds checks, is expensive!
- *
- *  KOKKOS_ENABLE_MPI                 negotiate MPI/execution space interactions
- *
- *  KOKKOS_ENABLE_CUDA_UVM             Use CUDA UVM for Cuda memory space
+ *  KOKKOS_ENABLE_QTHREADS            Kokkos::Qthreads execution space
+ *  KOKKOS_ENABLE_OPENMP              Kokkos::OpenMP execution space
+ *  KOKKOS_ENABLE_HWLOC               HWLOC library is available.
+ *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK  Insert array bounds checks, is expensive!
+ *  KOKKOS_ENABLE_MPI                 Negotiate MPI/execution space interactions.
+ *  KOKKOS_ENABLE_CUDA_UVM            Use CUDA UVM for Cuda memory space.
  */
 
 #ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
-#include <KokkosCore_config.h>
+  #include <KokkosCore_config.h>
 #endif
 
 #include <impl/Kokkos_OldMacros.hpp>
@@ -86,7 +84,7 @@
  *  KOKKOS_ENABLE_INTEL_ATOMICS
  *  KOKKOS_ENABLE_OPENMP_ATOMICS
  *
- *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
+ *  A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
  *
  *  Macros for marking functions to run in an execution space:
  *
@@ -98,64 +96,63 @@
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
+  // Compiling with a CUDA compiler.
+  //
+  //  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+  //    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+  //
+  //  When generating device code the __CUDA_ARCH__ macro is defined as:
+  //    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
+
+  #include <cuda_runtime.h>
+  #include <cuda.h>
+
+  #if !defined( CUDA_VERSION )
+    #error "#include <cuda.h> did not define CUDA_VERSION."
+  #endif
 
-/*  Compiling with a CUDA compiler.
- *
- *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
- *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
- *
- *  When generating device code the __CUDA_ARCH__ macro is defined as:
- *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
- */
+  #if ( CUDA_VERSION < 7000 )
+    // CUDA supports C++11 in device code starting with version 7.0.
+    // This includes auto type and device code internal lambdas.
+    #error "Cuda version 7.0 or greater required."
+  #endif
 
-#include <cuda_runtime.h>
-#include <cuda.h>
+  #if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
+    // Compiling with CUDA compiler for device code.
+    #error "Cuda device capability >= 3.0 is required."
+  #endif
 
-#if ! defined( CUDA_VERSION )
-#error "#include <cuda.h> did not define CUDA_VERSION"
-#endif
+  #ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+    #if ( CUDA_VERSION < 7050 )
+      // CUDA supports C++11 lambdas generated in host code to be given
+      // to the device starting with version 7.5. But the release candidate (7.5.6)
+      // still identifies as 7.0.
+      #error "Cuda version 7.5 or greater required for host-to-device Lambda support."
+    #endif
 
-#if ( CUDA_VERSION < 7000 )
-// CUDA supports C++11 in device code starting with
-// version 7.0. This includes auto type and device code internal
-// lambdas.
-#error "Cuda version 7.0 or greater required"
-#endif
+    #if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
+      #define KOKKOS_LAMBDA [=]__device__
+    #else
+      #define KOKKOS_LAMBDA [=]__host__ __device__
 
-#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
-/*  Compiling with CUDA compiler for device code. */
-#error "Cuda device capability >= 3.0 is required"
-#endif
+      #if defined( KOKKOS_ENABLE_CXX1Z )
+        #define KOKKOS_CLASS_LAMBDA        [=,*this] __host__ __device__
+      #endif
+    #endif
 
-#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
-#if ( CUDA_VERSION < 7050 )
-  // CUDA supports C++11 lambdas generated in host code to be given
-  // to the device starting with version 7.5. But the release candidate (7.5.6)
-  // still identifies as 7.0
-  #error "Cuda version 7.5 or greater required for host-to-device Lambda support"
-#endif
-#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__)
-  #define KOKKOS_LAMBDA [=]__device__
-#else
-  #define KOKKOS_LAMBDA [=]__host__ __device__
-  #if defined( KOKKOS_ENABLE_CXX1Z )
-    #define KOKKOS_CLASS_LAMBDA        [=,*this] __host__ __device__
+    #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
   #endif
-#endif
-#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
-#endif
-#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) */
+#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
 
-
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
    // Cuda version 8.0 still needs the functor wrapper
-   #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__)
+   #if /* ( CUDA_VERSION < 8000 ) && */  defined( __NVCC__ )
       #define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
    #endif
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* Language info: C++, CUDA, OPENMP */
+//----------------------------------------------------------------------------
+// Language info: C++, CUDA, OPENMP
 
 #if defined( KOKKOS_ENABLE_CUDA )
   // Compiling Cuda code to 'ptx'
@@ -163,20 +160,17 @@
   #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
   #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
   #define KOKKOS_FUNCTION              __device__  __host__
-#endif /* #if defined( __CUDA_ARCH__ ) */
+#endif // #if defined( KOKKOS_ENABLE_CUDA )
 
 #if defined( _OPENMP )
+  //  Compiling with OpenMP.
+  //  The value of _OPENMP is an integer value YYYYMM
+  //  where YYYY and MM are the year and month designation
+  //  of the supported OpenMP API version.
+#endif // #if defined( _OPENMP )
 
-  /*  Compiling with OpenMP.
-   *  The value of _OPENMP is an integer value YYYYMM
-   *  where YYYY and MM are the year and month designation
-   *  of the supported OpenMP API version.
-   */
-
-#endif /* #if defined( _OPENMP ) */
-
-/*--------------------------------------------------------------------------*/
-/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
+//----------------------------------------------------------------------------
+// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
 
 #if defined( __NVCC__ )
   // NVIDIA compiler is being used.
@@ -184,29 +178,28 @@
   // Host code is compiled again with another compiler.
   // Device code is compile to 'ptx'.
   #define KOKKOS_COMPILER_NVCC __NVCC__
-
 #else
-#if ! defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
-    #if !defined (KOKKOS_ENABLE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either
-    // CUDA (including version 6.5) does not support giving lambdas as
-    // arguments to global functions. Thus its not currently possible
-    // to dispatch lambdas from the host.
-    #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
+  #if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+    #if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
+      // CUDA (including version 6.5) does not support giving lambdas as
+      // arguments to global functions. Thus it's not currently possible
+      // to dispatch lambdas from the host.
+      #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
     #endif
   #endif
-#endif /* #if defined( __NVCC__ ) */
+#endif // #if defined( __NVCC__ )
 
-#if !defined (KOKKOS_LAMBDA)
+#if !defined( KOKKOS_LAMBDA )
   #define KOKKOS_LAMBDA [=]
 #endif
 
-#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
   #define KOKKOS_CLASS_LAMBDA [=,*this]
 #endif
 
-//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
+//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
 
-/* Intel compiler for host code */
+// Intel compiler for host code.
 
 #if defined( __INTEL_COMPILER )
   #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
@@ -218,7 +211,7 @@
   #define KOKKOS_COMPILER_INTEL __ECC
 #endif
 
-/* CRAY compiler for host code */
+// CRAY compiler for host code
 #if defined( _CRAYC )
   #define KOKKOS_COMPILER_CRAYC _CRAYC
 #endif
@@ -234,50 +227,53 @@
   #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
 #endif
 
-#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
+#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
   #define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
 #endif
 
-#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
   #define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
+
   #if ( 472 > KOKKOS_COMPILER_GNU )
     #error "Compiling with GCC version earlier than 4.7.2 is not supported."
   #endif
 #endif
 
-#if defined( __PGIC__ ) && ! defined( __GNUC__ )
+#if defined( __PGIC__ ) && !defined( __GNUC__ )
   #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
+
   #if ( 1540 > KOKKOS_COMPILER_PGI )
     #error "Compiling with PGI version earlier than 15.4 is not supported."
   #endif
 #endif
 
-//#endif /* #if ! defined( __CUDA_ARCH__ ) */
+//#endif // #if !defined( __CUDA_ARCH__ )
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-/* Intel compiler macros */
+//----------------------------------------------------------------------------
+// Intel compiler macros
 
 #if defined( KOKKOS_COMPILER_INTEL )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
-  #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
+  #if ( __INTEL_COMPILER > 1400 )
+    #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
+  #endif
+
   #define KOKKOS_RESTRICT __restrict__
 
   #ifndef KOKKOS_ALIGN
-  #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
+    #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
   #endif
 
   #ifndef KOKKOS_ALIGN_PTR
-  #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
+    #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
   #endif
 
   #ifndef KOKKOS_ALIGN_SIZE
-  #define KOKKOS_ALIGN_SIZE 64
+    #define KOKKOS_ALIGN_SIZE 64
   #endif
 
   #if ( 1400 > KOKKOS_COMPILER_INTEL )
@@ -287,12 +283,13 @@
       #warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
     #endif
   #endif
-  #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
+
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
     #define KOKKOS_ENABLE_ASM 1
   #endif
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-    #if !defined (_WIN32)
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #if !defined( _WIN32 )
       #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
     #else
       #define KOKKOS_FORCEINLINE_FUNCTION inline
@@ -302,192 +299,170 @@
   #if defined( __MIC__ )
     // Compiling for Xeon Phi
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* Cray compiler macros */
+//----------------------------------------------------------------------------
+// Cray compiler macros
 
 #if defined( KOKKOS_COMPILER_CRAYC )
-
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* IBM Compiler macros */
+//----------------------------------------------------------------------------
+// IBM Compiler macros
 
 #if defined( KOKKOS_COMPILER_IBM )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* CLANG compiler macros */
+//----------------------------------------------------------------------------
+// CLANG compiler macros
 
 #if defined( KOKKOS_COMPILER_CLANG )
-
   //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
-/* GNU Compiler macros */
+//----------------------------------------------------------------------------
+// GNU Compiler macros
 
 #if defined( KOKKOS_COMPILER_GNU )
-
   //#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   //#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   //#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
 
-  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #if !defined( KOKKOS_FORCEINLINE_FUNCTION )
     #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
   #endif
 
-  #if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \
-      ( defined( __amd64 ) || \
-        defined( __amd64__ ) || \
-        defined( __x86_64 ) || \
-        defined( __x86_64__ ) )
+  #if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
+      ( defined( __amd64 ) || defined( __amd64__ ) || \
+        defined( __x86_64 ) || defined( __x86_64__ ) )
     #define KOKKOS_ENABLE_ASM 1
   #endif
-
 #endif
 
-/*--------------------------------------------------------------------------*/
+//----------------------------------------------------------------------------
 
 #if defined( KOKKOS_COMPILER_PGI )
-
   #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
   //#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
   #define KOKKOS_ENABLE_PRAGMA_VECTOR 1
   //#define KOKKOS_ENABLE_PRAGMA_SIMD 1
-
 #endif
 
-/*--------------------------------------------------------------------------*/
+//----------------------------------------------------------------------------
 
 #if defined( KOKKOS_COMPILER_NVCC )
-
-  #if defined(__CUDA_ARCH__ )
+  #if defined( __CUDA_ARCH__ )
     #define KOKKOS_ENABLE_PRAGMA_UNROLL 1
   #endif
-
 #endif
 
 //----------------------------------------------------------------------------
-/** Define function marking macros if compiler specific macros are undefined: */
+// Define function marking macros if compiler specific macros are undefined:
 
-#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
-#define KOKKOS_FORCEINLINE_FUNCTION  inline
+#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
+  #define KOKKOS_FORCEINLINE_FUNCTION  inline
 #endif
 
-#if ! defined( KOKKOS_INLINE_FUNCTION )
-#define KOKKOS_INLINE_FUNCTION  inline
+#if !defined( KOKKOS_INLINE_FUNCTION )
+  #define KOKKOS_INLINE_FUNCTION  inline
 #endif
 
-#if ! defined( KOKKOS_FUNCTION )
-#define KOKKOS_FUNCTION /**/
+#if !defined( KOKKOS_FUNCTION )
+  #define KOKKOS_FUNCTION /**/
 #endif
 
-
 //----------------------------------------------------------------------------
-///** Define empty macro for restrict if necessary: */
+// Define empty macro for restrict if necessary:
 
-#if ! defined(KOKKOS_RESTRICT)
-#define KOKKOS_RESTRICT
+#if !defined( KOKKOS_RESTRICT )
+  #define KOKKOS_RESTRICT
 #endif
 
 //----------------------------------------------------------------------------
-/** Define Macro for alignment: */
-#if ! defined KOKKOS_ALIGN_SIZE
-#define KOKKOS_ALIGN_SIZE 16
-#endif
+// Define Macro for alignment:
 
-#if ! defined(KOKKOS_ALIGN)
-#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
+#if !defined KOKKOS_ALIGN_SIZE
+  #define KOKKOS_ALIGN_SIZE 16
 #endif
 
-#if ! defined(KOKKOS_ALIGN_PTR)
-#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
+#if !defined( KOKKOS_ALIGN )
+  #define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
 #endif
 
-//----------------------------------------------------------------------------
-/** Determine the default execution space for parallel dispatch.
- *  There is zero or one default execution space specified.
- */
-
-#if 1 < ( ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
-          ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
-
-#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
-
+#if !defined( KOKKOS_ALIGN_PTR )
+  #define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
 #endif
 
-/** If default is not specified then chose from enabled execution spaces.
- *  Priority: CUDA, OPENMP, THREADS, SERIAL
- */
-#if   defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-#elif defined ( KOKKOS_ENABLE_CUDA )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
-#elif defined ( KOKKOS_ENABLE_OPENMP )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
-#elif defined ( KOKKOS_ENABLE_PTHREAD )
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+//----------------------------------------------------------------------------
+// Determine the default execution space for parallel dispatch.
+// There is zero or one default execution space specified.
+
+#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
+          ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
+  #error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
+#endif
+
+// If default is not specified then choose from enabled execution spaces.
+// Priority: CUDA, OPENMP, THREADS, QTHREADS (currently disabled below), SERIAL
+#if   defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
+//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
+#elif defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined( KOKKOS_ENABLE_OPENMP )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+#elif defined( KOKKOS_ENABLE_PTHREAD )
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+//#elif defined( KOKKOS_ENABLE_QTHREADS )
+//  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
 #else
-#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+  #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
 #endif
 
 //----------------------------------------------------------------------------
-/** Determine for what space the code is being compiled: */
+// Determine for what space the code is being compiled:
 
-#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_ENABLE_CUDA)
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
 #else
-#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
 #endif
 
-//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 #if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
     ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
-#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
-#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
-#endif
+  #if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
+    #define KOKKOS_ENABLE_POSIX_MEMALIGN 1
+  #endif
 #endif
 
 //----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-/**Enable Profiling by default**/
+// Enable Profiling by default
 
 #ifndef KOKKOS_ENABLE_PROFILING
-#define KOKKOS_ENABLE_PROFILING 1
+  #define KOKKOS_ENABLE_PROFILING 1
 #endif
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #ifndef KOKKOS_MACROS_HPP */
-
+#endif // #ifndef KOKKOS_MACROS_HPP
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index 2d45926e762acd61ba7f308a80c2d7f922267ffe..eadad10b4991db1e98410f8eafcd77ad9bc87db0 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -1294,6 +1294,7 @@ public:
   KOKKOS_INLINE_FUNCTION
   size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
 
+  KOKKOS_INLINE_FUNCTION
   size_t get_mem_size() const { return m_data_size; }
 
 private:
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
index a337d1a9d4a02fcdaae38a6f402301d1a6a9ec03..c0c43b92f4d72f4fb6ae5ba95dc5270887f1cd32 100644
--- a/lib/kokkos/core/src/Kokkos_OpenMP.hpp
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -66,7 +66,6 @@
 #include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Tags.hpp>
 
-#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
@@ -196,6 +195,7 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMP_Task.hpp>
 
+#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 
 #endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp
index 83436826f4aded7131802662327d6b80c5b5c785..067767f2f83f1739fb3a40bd800300c2078c3b28 100644
--- a/lib/kokkos/core/src/Kokkos_Pair.hpp
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@@ -78,16 +78,14 @@ struct pair
   /// This calls the default constructors of T1 and T2.  It won't
   /// compile if those default constructors are not defined and
   /// public.
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair()
-    : first(), second()
-  {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair() = default ;
 
   /// \brief Constructor that takes both elements of the pair.
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type const& f, second_type const& s)
     : first(f), second(s)
   {}
@@ -97,7 +95,7 @@ struct pair
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -107,7 +105,7 @@ struct pair
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const volatile pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -183,7 +181,7 @@ struct pair<T1&, T2&>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type f, second_type s)
     : first(f), second(s)
   {}
@@ -193,7 +191,7 @@ struct pair<T1&, T2&>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -247,7 +245,7 @@ struct pair<T1, T2&>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type const& f, second_type s)
     : first(f), second(s)
   {}
@@ -257,7 +255,7 @@ struct pair<T1, T2&>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -311,7 +309,7 @@ struct pair<T1&, T2>
   ///
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(first_type f, second_type const& s)
     : first(f), second(s)
   {}
@@ -321,7 +319,7 @@ struct pair<T1&, T2>
   /// This calls the copy constructors of T1 and T2.  It won't compile
   /// if those copy constructors are not defined and public.
   template <class U, class V>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,V> &p)
     : first(p.first), second(p.second)
   {}
@@ -366,31 +364,31 @@ bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 
 //! Inequality operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(lhs==rhs); }
 
 //! Less-than operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
 
 //! Less-than-or-equal-to operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(rhs<lhs); }
 
 //! Greater-than operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return rhs<lhs; }
 
 //! Greater-than-or-equal-to operator for Kokkos::pair.
 template <class T1, class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 { return !(lhs<rhs); }
 
@@ -399,7 +397,7 @@ bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
 /// This is a "nonmember constructor" for Kokkos::pair.  It works just
 /// like std::make_pair.
 template <class T1,class T2>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 pair<T1,T2> make_pair (T1 x, T2 y)
 { return ( pair<T1,T2>(x,y) ); }
 
@@ -460,23 +458,21 @@ struct pair<T1,void>
   first_type  first;
   enum { second = 0 };
 
-  KOKKOS_FORCEINLINE_FUNCTION
-  pair()
-    : first()
-  {}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
+  pair() = default ;
 
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(const first_type & f)
     : first(f)
   {}
 
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair(const first_type & f, int)
     : first(f)
   {}
 
   template <class U>
-  KOKKOS_FORCEINLINE_FUNCTION
+  KOKKOS_FORCEINLINE_FUNCTION constexpr
   pair( const pair<U,void> &p)
     : first(p.first)
   {}
@@ -495,32 +491,32 @@ struct pair<T1,void>
 //
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return lhs.first==rhs.first; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(lhs==rhs); }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return lhs.first<rhs.first; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(rhs<lhs); }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return rhs<lhs; }
 
 template <class T1>
-KOKKOS_FORCEINLINE_FUNCTION
+KOKKOS_FORCEINLINE_FUNCTION constexpr
 bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 { return !(lhs<rhs); }
 
@@ -528,3 +524,4 @@ bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
 
 
 #endif //KOKKOS_PAIR_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
index 64b1502bcc1932338a16bfcb1604eb1887d85cce..e412e608b28ca52f7d7888ea5fc37af721c5b10c 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -52,13 +52,14 @@
 #include <Kokkos_View.hpp>
 #include <Kokkos_ExecPolicy.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #include <typeinfo>
 #endif
 
 #include <impl/Kokkos_Tags.hpp>
 #include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
 #ifdef KOKKOS_DEBUG
@@ -175,7 +176,7 @@ void parallel_for( const ExecPolicy  & policy
                  , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
                  )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
      	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@@ -185,10 +186,10 @@ void parallel_for( const ExecPolicy  & policy
     Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
     Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
     Kokkos::Impl::shared_allocation_tracking_release_and_enable();
-   
+
    closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
         Kokkos::Profiling::endParallelFor(kpID);
      }
@@ -207,20 +208,20 @@ void parallel_for( const size_t        work_count
       execution_space ;
   typedef RangePolicy< execution_space > policy ;
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
   	Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
-    
+
   Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
   Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
   Kokkos::Impl::shared_allocation_tracking_release_and_enable();
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelFor(kpID);
      }
@@ -417,7 +418,7 @@ void parallel_scan( const ExecutionPolicy & policy
                   , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
                   )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@@ -430,7 +431,7 @@ void parallel_scan( const ExecutionPolicy & policy
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelScan(kpID);
      }
@@ -450,20 +451,20 @@ void parallel_scan( const size_t        work_count
 
   typedef Kokkos::RangePolicy< execution_space > policy ;
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   uint64_t kpID = 0;
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
      }
 #endif
-    
+
   Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
   Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
   Kokkos::Impl::shared_allocation_tracking_release_and_enable();
 
   closure.execute();
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
      if(Kokkos::Profiling::profileLibraryLoaded()) {
 	Kokkos::Profiling::endParallelScan(kpID);
      }
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index a3649b4422dc7f581b38f2866f2bacb63b93b631..900dce19fe52b538228fbb2a82cb649f5313ec43 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -1094,7 +1094,7 @@ namespace Impl {
         const PolicyType& policy,
         const FunctorType& functor,
         ReturnType& return_value) {
-          #if (KOKKOS_ENABLE_PROFILING)
+          #if defined(KOKKOS_ENABLE_PROFILING)
             uint64_t kpID = 0;
             if(Kokkos::Profiling::profileLibraryLoaded()) {
               Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
@@ -1116,7 +1116,7 @@ namespace Impl {
           Kokkos::Impl::shared_allocation_tracking_release_and_enable();
           closure.execute();
 
-          #if (KOKKOS_ENABLE_PROFILING)
+          #if defined(KOKKOS_ENABLE_PROFILING)
             if(Kokkos::Profiling::profileLibraryLoaded()) {
               Kokkos::Profiling::endParallelReduce(kpID);
             }
diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthreads.hpp
similarity index 72%
rename from lib/kokkos/core/src/Kokkos_Qthread.hpp
rename to lib/kokkos/core/src/Kokkos_Qthreads.hpp
index c58518b0654bb3267a12041a2ab7fef4e2375972..0507552c3f95e7fb63527603c7123a19daee2b14 100644
--- a/lib/kokkos/core/src/Kokkos_Qthread.hpp
+++ b/lib/kokkos/core/src/Kokkos_Qthreads.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,57 +36,75 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-#ifndef KOKKOS_QTHREAD_HPP
-#define KOKKOS_QTHREAD_HPP
+#ifndef KOKKOS_QTHREADS_HPP
+#define KOKKOS_QTHREADS_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#ifdef KOKKOS_ENABLE_QTHREADS
+
+// Defines to enable experimental Qthreads functionality.
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
 
 #include <cstddef>
 #include <iosfwd>
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Layout.hpp>
-#include <Kokkos_MemoryTraits.hpp>
+
 #include <Kokkos_HostSpace.hpp>
-#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+//#include <Kokkos_MemoryTraits.hpp>
+//#include <Kokkos_ExecPolicy.hpp>
+//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
+#include <Kokkos_Layout.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
-class QthreadExec ;
+
+class QthreadsExec;
+
 } // namespace Impl
+
 } // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 
-/** \brief  Execution space supported by Qthread */
-class Qthread {
+/** \brief  Execution space supported by Qthreads */
+class Qthreads {
 public:
   //! \name Type declarations that all Kokkos devices must provide.
   //@{
 
   //! Tag this class as an execution space
-  typedef Qthread                  execution_space ;
-  typedef Kokkos::HostSpace        memory_space ;
+  typedef Qthreads                 execution_space;
+  typedef Kokkos::HostSpace        memory_space;
   //! This execution space preferred device_type
-  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef Kokkos::Device< execution_space, memory_space > device_type;
 
-  typedef Kokkos::LayoutRight      array_layout ;
-  typedef memory_space::size_type  size_type ;
+  typedef Kokkos::LayoutRight      array_layout;
+  typedef memory_space::size_type  size_type;
 
-  typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
+  typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
 
   //@}
   /*------------------------------------------------------------------------*/
 
   /** \brief  Initialization will construct one or more instances */
-  static Qthread & instance( int = 0 );
+  static Qthreads & instance( int = 0 );
 
   /** \brief  Set the execution space to a "sleep" state.
    *
@@ -100,14 +118,14 @@ public:
   bool sleep();
 
   /** \brief  Wake from the sleep state.
-   * 
+   *
    *  \return True if enters or is in the "ready" state.
    *          False if functions are currently executing.
    */
   static bool wake();
 
   /** \brief Wait until all dispatched functions to complete.
-   * 
+   *
    *  The parallel_for or parallel_reduce dispatch of a functor may
    *  return asynchronously, before the functor completes.  This
    *  method does not return until all dispatched functors on this
@@ -128,26 +146,24 @@ public:
   static void finalize();
 
   /** \brief Print configuration information to the given output stream. */
-  static void print_configuration( std::ostream & , const bool detail = false );
+  static void print_configuration( std::ostream &, const bool detail = false );
 
-  int shepherd_size() const ;
-  int shepherd_worker_size() const ;
+  int shepherd_size() const;
+  int shepherd_worker_size() const;
 };
 
-/*--------------------------------------------------------------------------*/
-
 } // namespace Kokkos
 
-/*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
+
 namespace Impl {
 
 template<>
-struct MemorySpaceAccess 
-  < Kokkos::Qthread::memory_space
-  , Kokkos::Qthread::scratch_memory_space
+struct MemorySpaceAccess
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
   >
 {
   enum { assignable = false };
@@ -157,27 +173,26 @@ struct MemorySpaceAccess
 
 template<>
 struct VerifyExecutionCanAccessMemorySpace
-  < Kokkos::Qthread::memory_space
-  , Kokkos::Qthread::scratch_memory_space
+  < Kokkos::Qthreads::memory_space
+  , Kokkos::Qthreads::scratch_memory_space
   >
 {
   enum { value = true };
-  inline static void verify( void ) { }
-  inline static void verify( const void * ) { }
+  inline static void verify( void ) {}
+  inline static void verify( const void * ) {}
 };
 
 } // namespace Impl
+
 } // namespace Kokkos
 
 /*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
-#include <Kokkos_Parallel.hpp>
-#include <Qthread/Kokkos_QthreadExec.hpp>
-#include <Qthread/Kokkos_Qthread_Parallel.hpp>
 
-#endif /* #define KOKKOS_QTHREAD_HPP */
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
+#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
+//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
+//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
+#endif // #ifdef KOKKOS_ENABLE_QTHREADS
 
+#endif // #ifndef KOKKOS_QTHREADS_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index f26253591007774c6d1aeb70bce6210896fea56f..72710e81679863bfc3c5e680663cf0feda2b5868 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -56,6 +56,8 @@
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
@@ -138,30 +140,15 @@ public:
   static void initialize( unsigned threads_count = 1 ,
                           unsigned use_numa_count = 0 ,
                           unsigned use_cores_per_numa = 0 ,
-                          bool allow_asynchronous_threadpool = false) {
-    (void) threads_count;
-    (void) use_numa_count;
-    (void) use_cores_per_numa;
-    (void) allow_asynchronous_threadpool;
-
-    // Init the array of locks used for arbitrarily sized atomics
-    Impl::init_lock_array_host_space();
-    #if (KOKKOS_ENABLE_PROFILING)
-      Kokkos::Profiling::initialize();
-    #endif
-  }
+                          bool allow_asynchronous_threadpool = false);
 
-  static int is_initialized() { return 1 ; }
+  static int is_initialized();
 
   /** \brief  Return the maximum amount of concurrency.  */
   static int concurrency() {return 1;};
 
   //! Free any resources being consumed by the device.
-  static void finalize() {
-    #if (KOKKOS_ENABLE_PROFILING)
-      Kokkos::Profiling::finalize();
-    #endif
-  }
+  static void finalize();
 
   //! Print configuration information to the given output stream.
   static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@@ -177,10 +164,6 @@ public:
   inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
 
   //--------------------------------------------------------------------------
-
-  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
-
-  //--------------------------------------------------------------------------
 };
 
 } // namespace Kokkos
@@ -192,7 +175,7 @@ namespace Kokkos {
 namespace Impl {
 
 template<>
-struct MemorySpaceAccess 
+struct MemorySpaceAccess
   < Kokkos::Serial::memory_space
   , Kokkos::Serial::scratch_memory_space
   >
@@ -213,22 +196,6 @@ struct VerifyExecutionCanAccessMemorySpace
   inline static void verify( const void * ) { }
 };
 
-namespace SerialImpl {
-
-struct Sentinel {
-
-  void *   m_scratch ;
-  unsigned m_reduce_end ;
-  unsigned m_shared_end ;
-
-  Sentinel();
-  ~Sentinel();
-  static Sentinel & singleton();
-};
-
-inline
-unsigned align( unsigned n );
-}
 } // namespace Impl
 } // namespace Kokkos
 
@@ -238,89 +205,26 @@ unsigned align( unsigned n );
 namespace Kokkos {
 namespace Impl {
 
-class SerialTeamMember {
-private:
-  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
-  const scratch_memory_space  m_space ;
-  const int                   m_league_rank ;
-  const int                   m_league_size ;
-
-  SerialTeamMember & operator = ( const SerialTeamMember & );
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_scratch(int) const
-    { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & thread_scratch(int) const
-    { return m_space ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes );
 
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+HostThreadTeamData * serial_get_thread_team_data();
 
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(const ValueType& , const int& ) const {}
-
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION
-  ValueType team_reduce( const ValueType & value , const JoinOp & ) const
-    {
-      return value ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-    {
-      const Type tmp = global_accum ? *global_accum : Type(0) ;
-      if ( global_accum ) { *global_accum += value ; }
-      return tmp ;
-    }
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
-    { return Type(0); }
-
-  //----------------------------------------
-  // Execution space specific:
+} /* namespace Impl */
+} /* namespace Kokkos */
 
-  SerialTeamMember( int arg_league_rank
-                  , int arg_league_size
-                  , int arg_shared_size
-                  );
-};
 
-} // namespace Impl
+namespace Kokkos {
+namespace Impl {
 
 /*
  * < Kokkos::Serial , WorkArgTag >
  * < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
  *
  */
-namespace Impl {
 template< class ... Properties >
 class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
 {
@@ -441,14 +345,11 @@ public:
     return p;
   };
 
-  typedef Impl::SerialTeamMember  member_type ;
+  typedef Impl::HostThreadTeamMember< Kokkos::Serial >  member_type ;
 };
 } /* namespace Impl */
 } /* namespace Kokkos */
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
-
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
 /* Parallel patterns for Kokkos::Serial with RangePolicy */
@@ -521,11 +422,12 @@ private:
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -535,34 +437,25 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( i , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
       const TagType t{} ;
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( t , i , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -570,10 +463,29 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
 
-      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class HostViewType >
@@ -587,7 +499,7 @@ public:
     : m_functor( arg_functor )
     , m_policy( arg_policy )
     , m_reducer( InvalidType() )
-    , m_result_ptr( arg_result_view.ptr_on_device() )
+    , m_result_ptr( arg_result_view.data() )
     {
       static_assert( Kokkos::is_view< HostViewType >::value
         , "Kokkos::Serial reduce result must be a View" );
@@ -623,11 +535,13 @@ private:
 
   typedef Kokkos::RangePolicy< Traits ... > Policy ;
   typedef typename Policy::work_tag                                  WorkTag ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -635,10 +549,8 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
-      reference_type update = ValueInit::init( m_functor , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( i , update , true );
@@ -648,11 +560,9 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( reference_type update ) const
     {
       const TagType t{} ;
-      reference_type update = ValueInit::init( m_functor , ptr );
-
       const typename Policy::member_type e = m_policy.end();
       for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
         m_functor( t , i , update , true );
@@ -664,9 +574,22 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type)
-        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
-      this-> template exec< WorkTag >( ptr );
+      const size_t pool_reduce_size = Analysis::value_size( m_functor );
+      const size_t team_reduce_size  = 0 ; // Never shrinks
+      const size_t team_shared_size  = 0 ; // Never shrinks
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      reference_type update =
+        ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
+
+      this-> template exec< WorkTag >( update );
     }
 
   inline
@@ -696,6 +619,8 @@ class ParallelFor< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
   typedef typename Policy::member_type                       Member ;
 
@@ -706,21 +631,21 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec() const
+  exec( HostThreadTeamData & data ) const
     {
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( Member(ileague,m_league,m_shared) );
+        m_functor( Member(data,ileague,m_league) );
       }
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec() const
+  exec( HostThreadTeamData & data ) const
     {
       const TagType t{} ;
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( t , Member(ileague,m_league,m_shared) );
+        m_functor( t , Member(data,ileague,m_league) );
       }
     }
 
@@ -729,15 +654,28 @@ public:
   inline
   void execute() const
     {
-      Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
-      this-> template exec< typename Policy::work_tag >();
+      const size_t pool_reduce_size  = 0 ; // Never shrinks
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      this->template exec< typename Policy::work_tag >( data );
     }
 
   ParallelFor( const FunctorType & arg_functor
              , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_league(  arg_policy.league_size() )
-    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
     { }
 };
 
@@ -752,18 +690,22 @@ class ParallelReduce< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
+
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef typename Policy::member_type                       Member ;
   typedef typename Policy::work_tag                          WorkTag ;
 
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const int          m_league ;
@@ -774,33 +716,23 @@ private:
   template< class TagType >
   inline
   typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( HostThreadTeamData & data , reference_type update ) const
     {
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( Member(ileague,m_league,m_shared) , update );
+        m_functor( Member(data,ileague,m_league) , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class TagType >
   inline
   typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec( pointer_type ptr ) const
+  exec( HostThreadTeamData & data , reference_type update ) const
     {
       const TagType t{} ;
 
-      reference_type update = ValueInit::init(  ReducerConditional::select(m_functor , m_reducer) , ptr );
-
       for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
-        m_functor( t , Member(ileague,m_league,m_shared) , update );
+        m_functor( t , Member(data,ileague,m_league) , update );
       }
-
-      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
-        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
 public:
@@ -808,10 +740,31 @@ public:
   inline
   void execute() const
     {
-      pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
-           ( ValueTraits::value_size(  ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
+      const size_t pool_reduce_size  =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      const size_t team_reduce_size  = TEAM_REDUCE_SIZE ;
+      const size_t team_shared_size  = m_shared ;
+      const size_t thread_local_size = 0 ; // Never shrinks
+
+      serial_resize_thread_team_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
+
 
-      this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
+      HostThreadTeamData & data = *serial_get_thread_team_data();
+
+      pointer_type ptr =
+        m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
+
+      reference_type update =
+        ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      this-> template exec< WorkTag >( data , update );
+
+      Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
+        final(  ReducerConditional::select(m_functor , m_reducer) , ptr );
     }
 
   template< class ViewType >
@@ -825,8 +778,10 @@ public:
     : m_functor( arg_functor )
     , m_league( arg_policy.league_size() )
     , m_reducer( InvalidType() )
-    , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
+    , m_result_ptr( arg_result.data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
     {
       static_assert( Kokkos::is_view< ViewType >::value
         , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@@ -838,13 +793,15 @@ public:
 
   inline
   ParallelReduce( const FunctorType & arg_functor
-    , Policy       arg_policy
-    , const ReducerType& reducer )
-  : m_functor( arg_functor )
-  , m_league(  arg_policy.league_size() )
-  , m_reducer( reducer )
-  , m_result_ptr(  reducer.result_view().data() )
-  , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_league(  arg_policy.league_size() )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    , m_shared( arg_policy.scratch_size(0) +
+                arg_policy.scratch_size(1) +
+                FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
   {
   /*static_assert( std::is_same< typename ViewType::memory_space
                           , Kokkos::HostSpace >::value
@@ -858,261 +815,6 @@ public:
 
 /*--------------------------------------------------------------------------*/
 /*--------------------------------------------------------------------------*/
-/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
-
-namespace Kokkos {
-namespace Impl {
-
-template<typename iType>
-struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
-  typedef iType index_type;
-  const iType begin ;
-  const iType end ;
-  enum {increment = 1};
-  const SerialTeamMember& thread;
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
-    : begin(0)
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
-    : begin( arg_begin )
-    , end(   arg_end)
-    , thread( arg_thread )
-    {}
-};
-
-  template<typename iType>
-  struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
-    typedef iType index_type;
-    enum {start = 0};
-    const iType end;
-    enum {increment = 1};
-
-    KOKKOS_INLINE_FUNCTION
-    ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
-      end( count )
-    {}
-  };
-
-} // namespace Impl
-
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
-TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count );
-}
-
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::SerialTeamMember >
-TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const iType2 & end )
-{
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) );
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
-  ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  result = ValueType();
-
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-
-  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-
-  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
-}
-
-} //namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-  #pragma ivdep
-  #endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
- * Depending on the target execution space the operator might be called twice: once with final=false
- * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
- * "i" needs to be added to val no matter whether final==true or not. In a serial execution
- * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
- * to the final sum value over all vector lanes.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
-  typedef typename ValueTraits::value_type value_type ;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,scan_val,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-}
-
-//----------------------------------------------------------------------------
 
 #include <impl/Kokkos_Serial_Task.hpp>
 
diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
index e4271aa18814160f58fde909b619c78cc25761fa..e25039d236d68544cecf3dc968f853179e94a52d 100644
--- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -82,6 +82,15 @@ class Future ;
 template< typename Space >
 class TaskScheduler ;
 
+template< typename Space >
+void wait( TaskScheduler< Space > const & );
+
+template< typename Space >
+struct is_scheduler : public std::false_type {};
+
+template< typename Space >
+struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
+
 } // namespace Kokkos
 
 #include <impl/Kokkos_TaskQueue.hpp>
@@ -109,9 +118,6 @@ namespace Impl {
 template< typename Space , typename ResultType , typename FunctorType >
 class TaskBase ;
 
-template< typename Space >
-class TaskExec ;
-
 } // namespace Impl
 } // namespace Kokkos
 
@@ -312,6 +318,19 @@ public:
     }
 };
 
+// Trait: true for a Future whose execution_space is ExecSpace (any Future when ExecSpace is void)
+template< typename , typename ExecSpace = void >
+struct is_future : public std::false_type {};
+
+template< typename Arg1 , typename Arg2 , typename ExecSpace >
+struct is_future< Future<Arg1,Arg2> , ExecSpace >
+  : public std::integral_constant
+      < bool ,
+      ( std::is_same< ExecSpace , void >::value ||
+        std::is_same< ExecSpace
+                    , typename Future<Arg1,Arg2>::execution_space >::value )
+      > {};
+
 } // namespace Kokkos
 
 //----------------------------------------------------------------------------
@@ -319,18 +338,59 @@ public:
 
 namespace Kokkos {
 
-enum TaskType { TaskTeam   = Impl::TaskBase<void,void,void>::TaskTeam
-              , TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
+enum class TaskPriority : int { High    = 0
+                              , Regular = 1
+                              , Low     = 2 };
 
-enum TaskPriority { TaskHighPriority    = 0
-                  , TaskRegularPriority = 1
-                  , TaskLowPriority     = 2 };
+} // namespace Kokkos
 
-template< typename Space >
-void wait( TaskScheduler< Space > const & );
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< int TaskEnum , typename DepFutureType >
+struct TaskPolicyData
+{
+  using execution_space = typename DepFutureType::execution_space ;
+  using scheduler_type  = TaskScheduler< execution_space > ;
+
+  enum : int { m_task_type = TaskEnum };
+
+  scheduler_type const * m_scheduler ;
+  DepFutureType  const   m_dependence ;
+  int                    m_priority ;
+
+  TaskPolicyData() = delete ;
+  TaskPolicyData( TaskPolicyData && ) = default ;
+  TaskPolicyData( TaskPolicyData const & ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
+  TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( DepFutureType             && arg_future
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( 0 )
+    , m_dependence( arg_future )
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicyData( scheduler_type       const & arg_scheduler
+                , Kokkos::TaskPriority const & arg_priority )
+    : m_scheduler( & arg_scheduler )
+    , m_dependence()
+    , m_priority( static_cast<int>( arg_priority ) )
+    {}
+};
 
+} // namespace Impl
 } // namespace Kokkos
 
+//----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
 namespace Kokkos {
@@ -348,52 +408,13 @@ private:
   queue_type * m_queue ;
 
   //----------------------------------------
-  // Process optional arguments to spawn and respawn functions
-
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const ) {}
-
-  // TaskTeam or TaskSingle
-  template< typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , TaskType const & arg
-             , Options const & ... opts )
-    {
-      task->m_task_type = arg ;
-      assign( task , opts ... );
-    }
-
-  // TaskHighPriority or TaskRegularPriority or TaskLowPriority
-  template< typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , TaskPriority const & arg
-             , Options const & ... opts )
-    {
-      task->m_priority = arg ;
-      assign( task , opts ... );
-    }
-
-  // Future for a dependence
-  template< typename A1 , typename A2 , typename ... Options >
-  KOKKOS_INLINE_FUNCTION static
-  void assign( task_base * const task
-             , Future< A1 , A2 > const & arg
-             , Options const & ... opts )
-    {
-      task->add_dependence( arg.m_task );
-      assign( task , opts ... );
-    }
-
-  //----------------------------------------
 
 public:
 
-  using execution_policy = TaskScheduler ;
   using execution_space  = ExecSpace ;
   using memory_space     = typename queue_type::memory_space ;
-  using member_type      = Kokkos::Impl::TaskExec< ExecSpace > ;
+  using member_type      =
+    typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
 
   KOKKOS_INLINE_FUNCTION
   TaskScheduler() : m_track(), m_queue(0) {}
@@ -460,18 +481,13 @@ public:
 
   //----------------------------------------
 
-  /**\brief  A task spawns a task with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   *  3) Team or Serial
-   */
-  template< typename FunctorType , typename ... Options >
-  KOKKOS_FUNCTION
-  Future< typename FunctorType::value_type , ExecSpace >
-  task_spawn( FunctorType const & arg_functor
-            , Options const & ... arg_options
-            ) const
+  template< int TaskEnum , typename DepFutureType , typename FunctorType >
+  KOKKOS_FUNCTION static
+  Kokkos::Future< typename FunctorType::value_type , execution_space >
+  spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+       , typename task_base::function_type                    arg_function
+       , FunctorType                                       && arg_functor
+       )
     {
       using value_type  = typename FunctorType::value_type ;
       using future_type = Future< value_type , execution_space > ;
@@ -479,11 +495,21 @@ public:
                                         , value_type
                                         , FunctorType > ;
 
+      queue_type * const queue =
+        arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
+        arg_policy.m_dependence.m_task
+          ? arg_policy.m_dependence.m_task->m_queue
+          : (queue_type*) 0 );
+
+      if ( 0 == queue ) {
+        Kokkos::abort("Kokkos spawn given null Future" );
+      }
+
       //----------------------------------------
       // Give single-thread back-ends an opportunity to clear
       // queue of ready tasks before allocating a new task
 
-      m_queue->iff_single_thread_recursive_execute();
+      queue->iff_single_thread_recursive_execute();
 
       //----------------------------------------
 
@@ -491,176 +517,129 @@ public:
 
       // Allocate task from memory pool
       f.m_task =
-        reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
+        reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
 
       if ( f.m_task ) {
 
         // Placement new construction
-        new ( f.m_task ) task_type( arg_functor );
-
-        // Reference count starts at two
-        // +1 for matching decrement when task is complete
-        // +1 for future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = sizeof(task_type);
-
-        assign( f.m_task , arg_options... );
-
-        // Spawning from within the execution space so the
-        // apply function pointer is guaranteed to be valid
-        f.m_task->m_apply = task_type::apply ;
-
-        m_queue->schedule( f.m_task );
-        // this task may be updated or executed at any moment
+        // Reference count starts at two:
+        //   +1 for the matching decrement when task is complete
+        //   +1 for the future
+        new ( f.m_task )
+          task_type( arg_function
+                   , queue
+                   , arg_policy.m_dependence.m_task /* dependence */
+                   , 2                              /* reference count */
+                   , int(sizeof(task_type))         /* allocation size */
+                   , int(arg_policy.m_task_type)
+                   , int(arg_policy.m_priority)
+                   , std::move(arg_functor) );
+
+        // The dependence (if any) is processed immediately
+        // within the schedule function, as such the dependence's
+        // reference count does not need to be incremented for
+        // the assignment.
+
+        queue->schedule_runnable( f.m_task );
+        // This task may be updated or executed at any moment,
+        // even during the call to 'schedule'.
       }
 
       return f ;
     }
 
-  /**\brief  The host process spawns a task with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   *  3) Team or Serial
-   */
-  template< typename FunctorType , typename ... Options >
-  inline
-  Future< typename FunctorType::value_type , ExecSpace >
-  host_spawn( FunctorType const & arg_functor
-            , Options const & ... arg_options
-            ) const
+  template< typename FunctorType , typename A1 , typename A2 >
+  KOKKOS_FUNCTION static
+  void
+  respawn( FunctorType         * arg_self
+         , Future<A1,A2> const & arg_dependence
+         , TaskPriority  const & arg_priority
+         )
     {
+      // Precondition: task is in Executing state
+
       using value_type  = typename FunctorType::value_type ;
-      using future_type = Future< value_type , execution_space > ;
       using task_type   = Impl::TaskBase< execution_space
                                         , value_type
                                         , FunctorType > ;
 
-      if ( m_queue == 0 ) {
-        Kokkos::abort("Kokkos::TaskScheduler not initialized");
-      }
+      task_type * const task = static_cast< task_type * >( arg_self );
 
-      future_type f ;
+      task->m_priority = static_cast<int>(arg_priority);
 
-      // Allocate task from memory pool
-      f.m_task =
-        reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
-
-      if ( f.m_task ) {
-
-        // Placement new construction
-        new( f.m_task ) task_type( arg_functor );
-
-        // Reference count starts at two:
-        // +1 to match decrement when task completes
-        // +1 for the future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = sizeof(task_type);
-
-        assign( f.m_task , arg_options... );
-
-        // Potentially spawning outside execution space so the
-        // apply function pointer must be obtained from execution space.
-        // Required for Cuda execution space function pointer.
-        m_queue->template proc_set_apply< FunctorType >( & f.m_task->m_apply );
+      task->add_dependence( arg_dependence.m_task );
 
-        m_queue->schedule( f.m_task );
-      }
-      return f ;
+      // Postcondition: task is in Executing-Respawn state
     }
 
+  //----------------------------------------
   /**\brief  Return a future that is complete
    *         when all input futures are complete.
    */
   template< typename A1 , typename A2 >
-  KOKKOS_FUNCTION
-  Future< ExecSpace >
-  when_all( int narg , Future< A1 , A2 > const * const arg ) const
+  KOKKOS_FUNCTION static
+  Future< execution_space >
+  when_all( Future< A1 , A2 > const arg[] , int narg )
     {
-      static_assert
-        ( std::is_same< execution_space
-                      , typename Future< A1 , A2 >::execution_space
-                      >::value
-        , "Future must have same execution space" );
-
-      using future_type = Future< ExecSpace > ;
-      using task_base   = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
+      using future_type = Future< execution_space > ;
+      using task_base   = Kokkos::Impl::TaskBase< execution_space , void , void > ;
 
       future_type f ;
 
-      size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
-
-      f.m_task =
-        reinterpret_cast< task_base * >( m_queue->allocate( size ) );
+      if ( narg ) {
 
-      if ( f.m_task ) {
-
-        new( f.m_task ) task_base();
-
-        // Reference count starts at two:
-        // +1 to match decrement when task completes
-        // +1 for the future
-        f.m_task->m_queue      = m_queue ;
-        f.m_task->m_ref_count  = 2 ;
-        f.m_task->m_alloc_size = size ;
-        f.m_task->m_dep_count  = narg ;
-        f.m_task->m_task_type  = task_base::Aggregate ;
-
-        task_base ** const dep = f.m_task->aggregate_dependences();
-
-        // Assign dependences to increment their reference count
-        // The futures may be destroyed upon returning from this call
-        // so increment reference count to track this assignment.
+        queue_type * queue = 0 ;
 
         for ( int i = 0 ; i < narg ; ++i ) {
-          task_base * const t = dep[i] = arg[i].m_task ;
+          task_base * const t = arg[i].m_task ;
           if ( 0 != t ) {
+            // Increment reference count to track subsequent assignment.
             Kokkos::atomic_increment( &(t->m_ref_count) );
+            if ( queue == 0 ) {
+              queue = t->m_queue ;
+            }
+            else if ( queue != t->m_queue ) {
+              Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
+            }
           }
         }
 
-        m_queue->schedule( f.m_task );
-        // this when_all may be processed at any moment
-      }
+        if ( queue != 0 ) {
 
-      return f ;
-    }
+          size_t const size  = sizeof(task_base) + narg * sizeof(task_base*);
 
-  /**\brief  An executing task respawns itself with options
-   *
-   *  1) High, Normal, or Low priority
-   *  2) With or without dependence
-   */
-  template< class FunctorType , typename ... Options >
-  KOKKOS_FUNCTION
-  void respawn( FunctorType * task_self
-              , Options const & ... arg_options ) const
-    {
-      using value_type  = typename FunctorType::value_type ;
-      using task_type   = Impl::TaskBase< execution_space
-                                        , value_type
-                                        , FunctorType > ;
+          f.m_task =
+            reinterpret_cast< task_base * >( queue->allocate( size ) );
 
-      task_type * const task = static_cast< task_type * >( task_self );
+          if ( f.m_task ) {
 
-      // Reschedule task with no dependences.
-      m_queue->reschedule( task );
+            // Reference count starts at two:
+            // +1 to match decrement when task completes
+            // +1 for the future
+            new( f.m_task ) task_base( queue
+                                     , 2     /* reference count */
+                                     , size  /* allocation size */
+                                     , narg  /* dependence count */
+                                     );
 
-      // Dependences, if requested, are added here through parsing the arguments.
-      assign( task , arg_options... );
-    }
+            // Assign dependences, reference counts were already incremented
 
-  //----------------------------------------
+            task_base ** const dep = f.m_task->aggregate_dependences();
 
-  template< typename S >
-  friend
-  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+            for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
+
+            queue->schedule_aggregate( f.m_task );
+            // this when_all may be processed at any moment
+          }
+        }
+      }
+
+      return f ;
+    }
 
   //----------------------------------------
 
-  inline
+  KOKKOS_INLINE_FUNCTION
   int allocation_capacity() const noexcept
     { return m_queue->m_memory.get_mem_size(); }
 
@@ -676,12 +655,192 @@ public:
   long allocated_task_count_accum() const noexcept
     { return m_queue->m_accum_alloc ; }
 
+  //----------------------------------------
+
+  template< typename S >
+  friend
+  void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
+
 };
 
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Construct a TaskTeam execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskTeam( T            const & arg
+        , TaskPriority const & arg_priority = TaskPriority::Regular
+        )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskTeam argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+// Construct a TaskSingle execution policy
+
+template< typename T >
+Kokkos::Impl::TaskPolicyData
+  < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+  , typename std::conditional< Kokkos::is_future< T >::value , T ,
+    typename Kokkos::Future< typename T::execution_space > >::type
+  >
+KOKKOS_INLINE_FUNCTION
+TaskSingle( T            const & arg
+          , TaskPriority const & arg_priority = TaskPriority::Regular
+          )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos TaskSingle argument must be Future or TaskScheduler" );
+
+  return
+    Kokkos::Impl::TaskPolicyData
+      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
+      , typename std::conditional< Kokkos::is_future< T >::value , T ,
+        typename Kokkos::Future< typename T::execution_space > >::type
+      >( arg , arg_priority );
+}
+
+//----------------------------------------------------------------------------
+
+/**\brief  A host control thread spawns a task.
+ *  arg_policy  : result of TaskTeam(...) or TaskSingle(...); carries a
+ *                scheduler or a dependence, plus the task priority.
+ *  arg_functor : task body; handed on to the new task via std::move.
+ *  Returns the Future produced by TaskScheduler<exec_space>::spawn.
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos host_spawn requires TaskTeam or TaskSingle" );
+
+  // The caller may be on the host while the task runs on Cuda, so the
+  // apply pointer is queried through the execution space specialization.
+  typename task_type::function_type const ptr =
+    Kokkos::Impl::TaskQueueSpecialization< exec_space >::
+      template get_function_pointer< task_type >();
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
+
+/**\brief  A task spawns a task with options
+ *
+ *  1) Team or Serial
+ *  2) With scheduler or dependence
+ *  3) High, Normal, or Low priority
+ */
+template< int TaskEnum
+        , typename DepFutureType
+        , typename FunctorType >
+Future< typename FunctorType::value_type
+      , typename DepFutureType::execution_space >
+KOKKOS_INLINE_FUNCTION
+task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
+          , FunctorType                                       && arg_functor
+          )
+{
+  using exec_space = typename DepFutureType::execution_space ;
+  using scheduler  = TaskScheduler< exec_space > ;
+
+  typedef Impl::TaskBase< exec_space
+                        , typename FunctorType::value_type
+                        , FunctorType
+                        > task_type ;
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
+    defined( KOKKOS_ENABLE_CUDA )
+
+  static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
+               , "Error calling Kokkos::task_spawn for Cuda space within Host code" );
+
+#endif
+
+  static_assert( TaskEnum == task_type::TaskTeam ||
+                 TaskEnum == task_type::TaskSingle
+               , "Kokkos task_spawn requires TaskTeam or TaskSingle" );
+  // Spawning from within the execution space, so apply's address is valid as-is.
+  typename task_type::function_type const ptr = task_type::apply ;
+
+  return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
+}
+
+/**\brief  A task respawns itself with options
+ *
+ *  1) With scheduler or dependence
+ *  2) High, Normal, or Low priority
+ */
+template< typename FunctorType , typename T >
+void
+KOKKOS_INLINE_FUNCTION
+respawn( FunctorType         * arg_self
+       , T             const & arg
+       , TaskPriority  const & arg_priority = TaskPriority::Regular
+       )
+{
+  static_assert( Kokkos::is_future<T>::value ||
+                 Kokkos::is_scheduler<T>::value
+               , "Kokkos respawn argument must be Future or TaskScheduler" );
+  // NOTE(review): the member respawn in this patch takes a Future; confirm a scheduler 'arg' is accepted.
+  TaskScheduler< typename T::execution_space >::
+    respawn( arg_self , arg , arg_priority );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename A1 , typename A2 >
+KOKKOS_INLINE_FUNCTION
+Future< typename Future< A1 , A2 >::execution_space >
+when_all( Future< A1 , A2 > const arg[]
+        , int                     narg
+        )
+{
+  return TaskScheduler< typename Future<A1,A2>::execution_space >::
+    when_all( arg , narg );
+}
+
+//----------------------------------------------------------------------------
+// Wait for all runnable tasks to complete
+
 template< typename ExecSpace >
 inline
-void wait( TaskScheduler< ExecSpace > const & policy )
-{ policy.m_queue->execute(); }
+void wait( TaskScheduler< ExecSpace > const & scheduler )
+{ scheduler.m_queue->execute(); }
 
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
index aca482b427a11a21ecc5d71dddfffb715438fa85..8aa968d0535f1f6c32ac170a73d2ec60d018d824 100644
--- a/lib/kokkos/core/src/Kokkos_Threads.hpp
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -230,4 +230,3 @@ struct VerifyExecutionCanAccessMemorySpace
 #endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
 #endif /* #define KOKKOS_THREADS_HPP */
 
-
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 316f61fd4d9fcd4c7ce4ec37592659deef006bce..0668f89c86e040e5dd1017fc3c3f0a233e9affa3 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -31,23 +31,23 @@ KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
 CONDITIONAL_COPIES =
 
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-	KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
-	CONDITIONAL_COPIES += copy-cuda
+  KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+  CONDITIONAL_COPIES += copy-cuda
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
-	CONDITIONAL_COPIES += copy-threads
+  KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+  CONDITIONAL_COPIES += copy-threads
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
-	CONDITIONAL_COPIES += copy-qthread
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+  KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
+  CONDITIONAL_COPIES += copy-qthreads
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-	KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
-	CONDITIONAL_COPIES += copy-openmp
+  KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+  CONDITIONAL_COPIES += copy-openmp
 endif
 
 ifeq ($(KOKKOS_OS),CYGWIN)
@@ -60,6 +60,12 @@ ifeq ($(KOKKOS_OS),Darwin)
   COPY_FLAG =
 endif
 
+ifeq ($(KOKKOS_DEBUG),"no")
+  KOKKOS_DEBUG_CMAKE = OFF
+else
+  KOKKOS_DEBUG_CMAKE = ON
+endif
+
 messages: 
 	echo "Start Build"
 
@@ -91,6 +97,7 @@ build-makefile-kokkos:
 	echo "" >> Makefile.kokkos
 	echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos
+	echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos
 	echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
@@ -107,7 +114,55 @@ build-makefile-kokkos:
 		> Makefile.kokkos.tmp
 	mv -f Makefile.kokkos.tmp Makefile.kokkos
 
-build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
+build-cmake-kokkos: # Write kokkos.cmake, recording this build's settings as CMake cache variables.
+	rm -f kokkos.cmake
+	echo "#Global Settings used to generate this library" >> kokkos.cmake
+	echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake
+	echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake
+	echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake
+	echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?\")" >> kokkos.cmake
+	echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos third-party libraries list\")" >> kokkos.cmake
+	echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake
+	echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake
+	echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake
+	echo "if(NOT DEFINED ENV{CXX})" >> kokkos.cmake
+	echo '  message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX) ; make sure this is the behavior you intended to be.")' >> kokkos.cmake
+	echo "endif()" >> kokkos.cmake
+	echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake
+	echo "  set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake
+	echo "else()" >> kokkos.cmake
+	echo '  set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
+	echo "endif()" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
+	echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
+	echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	echo "#Variables used in application Makefiles" >> kokkos.cmake
+	echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake
+	echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake
+	echo "" >> kokkos.cmake
+	sed \
+		-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
+		-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
+		-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
+		-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \
+		> kokkos.cmake.tmp
+	mv -f kokkos.cmake.tmp kokkos.cmake
+
+build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
 
 mkdir: 
 	mkdir -p $(PREFIX)
@@ -124,9 +179,9 @@ copy-threads: mkdir
 	mkdir -p $(PREFIX)/include/Threads
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
 
-copy-qthread: mkdir
-	mkdir -p $(PREFIX)/include/Qthread
-	cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
+copy-qthreads: mkdir
+	mkdir -p $(PREFIX)/include/Qthreads
+	cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
 
 copy-openmp: mkdir
 	mkdir -p $(PREFIX)/include/OpenMP
@@ -137,6 +192,7 @@ install: mkdir $(CONDITIONAL_COPIES) build-lib
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
 	cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
 	cp $(COPY_FLAG) Makefile.kokkos $(PREFIX)
+	cp $(COPY_FLAG) kokkos.cmake $(PREFIX)
 	cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
 	cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
index a61791ca9c7be2779820b5ed96db1aec02644654..ecacffb77331c9d14134dc2dcc9a8eafabbc175f 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -46,7 +46,6 @@
 
 #include <omp.h>
 #include <iostream>
-#include <Kokkos_Parallel.hpp>
 #include <OpenMP/Kokkos_OpenMPexec.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
@@ -107,58 +106,41 @@ private:
 
 public:
 
-  inline void execute() const {
-    this->template execute_schedule<typename Policy::schedule_type::type>();
-  }
-
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
-    execute_schedule() const
+  inline void execute() const
     {
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-        ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
-      }
-/* END #pragma omp parallel */
-    }
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
 
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-    execute_schedule() const
-    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
 
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        std::pair<int64_t,int64_t> range(0,0);
 
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
+        do {
 
-        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
-        exec.reset_steal_target();
-        #pragma omp barrier
-        
-        long work_index = exec.get_work_index();
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
 
-        while(work_index != -1) {
-          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
-          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
-          ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
-          work_index = exec.get_work_index();
-        }
+          ParallelFor::template
+            exec_range< WorkTag >( m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin() );
 
+        } while ( is_dynamic && 0 <= range.first );
       }
-/* END #pragma omp parallel */
+      // END #pragma omp parallel
     }
 
   inline
@@ -193,17 +175,18 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
 
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
   typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
   // Static Assert WorkTag void if ReducerType not InvalidType
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -247,92 +230,70 @@ private:
 
 public:
 
-  inline void execute() const {
-    this->template execute_schedule<typename Policy::schedule_type::type>();
-  }
-
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
-    execute_schedule() const
+  inline void execute() const
     {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+      enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
+                                      , Kokkos::Dynamic >::value };
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_bytes =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
+
+      OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        ParallelReduce::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
-      }
-/* END #pragma omp parallel */
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-      // Reduction:
+        data.set_work_partition( m_policy.end() - m_policy.begin()
+                               , m_policy.chunk_size() );
 
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+        if ( is_dynamic ) {
+          // Make sure work partition is set before stealing
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
 
-      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
-      }
+        reference_type update =
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                         , data.pool_reduce_local() );
 
-      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+        std::pair<int64_t,int64_t> range(0,0);
 
-      if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        do {
 
-        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
-      }
-    }
+          range = is_dynamic ? data.get_work_stealing_chunk()
+                             : data.get_work_partition();
 
-  template<class Schedule>
-  inline
-  typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-    execute_schedule() const
-    {
-      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
-      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+          ParallelReduce::template
+            exec_range< WorkTag >( m_functor
+                                 , range.first  + m_policy.begin()
+                                 , range.second + m_policy.begin()
+                                 , update );
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
-
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-
-        exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
-        exec.reset_steal_target();
-        #pragma omp barrier
-
-        long work_index = exec.get_work_index();
-
-        reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
-        while(work_index != -1) {
-          const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
-          const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
-          ParallelReduce::template exec_range< WorkTag >
-            ( m_functor , begin,end
-            , update );
-          work_index = exec.get_work_index();
-        }
+        } while ( is_dynamic && 0 <= range.first );
       }
-/* END #pragma omp parallel */
+// END #pragma omp parallel
 
       // Reduction:
 
-      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
 
       for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
       }
 
       Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
 
       if ( m_result_ptr ) {
-        const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
 
         for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
@@ -394,17 +355,18 @@ private:
 
   typedef Kokkos::RangePolicy< Traits ... > Policy ;
 
+  typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
+
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
 
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
   typedef Kokkos::Impl::FunctorValueOps<    FunctorType, WorkTag > ValueOps ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType   m_functor ;
   const Policy        m_policy ;
@@ -452,53 +414,63 @@ public:
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
 
-      OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
+      const int    value_count       = Analysis::value_count( m_functor );
+      const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
+
+      OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                    , 0 // team_reduce_bytes
+                                    , 0 // team_shared_bytes
+                                    , 0 // thread_local_bytes
+                                    );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        const pointer_type ptr =
-          pointer_type( exec.scratch_reduce() ) +
-          ValueTraits::value_count( m_functor );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+
+        const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
+
+        reference_type update_sum =
+          ValueInit::init( m_functor , data.pool_reduce_local() );
+
         ParallelScan::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueInit::init( m_functor , ptr ) , false );
-      }
-/* END #pragma omp parallel */
+          ( m_functor , range.begin() , range.end() , update_sum , false );
 
-      {
-        const unsigned thread_count = OpenMPexec::pool_size();
-        const unsigned value_count  = ValueTraits::value_count( m_functor );
+        if ( data.pool_rendezvous() ) {
 
-        pointer_type ptr_prev = 0 ;
+          pointer_type ptr_prev = 0 ;
 
-        for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+          const int n = data.pool_size();
 
-          pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
+          for ( int i = 0 ; i < n ; ++i ) {
 
-          if ( ptr_prev ) {
-            for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
-            ValueJoin::join( m_functor , ptr + value_count , ptr );
-          }
-          else {
-            ValueInit::init( m_functor , ptr );
+            pointer_type ptr = (pointer_type)
+              data.pool_member(i)->pool_reduce_local();
+
+            if ( i ) {
+              for ( int j = 0 ; j < value_count ; ++j ) {
+                ptr[j+value_count] = ptr_prev[j+value_count] ;
+              }
+              ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
+            }
+            else {
+              ValueInit::init( m_functor , ptr + value_count );
+            }
+
+            ptr_prev = ptr ;
           }
 
-          ptr_prev = ptr ;
+          data.pool_rendezvous_release();
         }
-      }
 
-#pragma omp parallel
-      {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
-        const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
-        const pointer_type ptr = pointer_type( exec.scratch_reduce() );
+        reference_type update_base =
+          ValueOps::reference
+            ( ((pointer_type)data.pool_reduce_local()) + value_count );
+
         ParallelScan::template exec_range< WorkTag >
-          ( m_functor , range.begin() , range.end()
-          , ValueOps::reference( ptr ) , true );
+          ( m_functor , range.begin() , range.end() , update_base , true );
       }
 /* END #pragma omp parallel */
+
     }
 
   //----------------------------------------
@@ -530,55 +502,59 @@ class ParallelFor< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
-  typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
   const int          m_shmem_size ;
 
-  template< class TagType, class Schedule >
+  template< class TagType >
   inline static
-  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
-  exec_team( const FunctorType & functor , Member member )
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( member );
-      }
-    }
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
 
-  template< class TagType, class Schedule >
-  inline static
-  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
-  exec_team( const FunctorType & functor , Member member )
-    {
-      const TagType t{} ;
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( t , member );
-      }
-    }
+        functor( Member( data, r , league_size ) );
 
-  template< class TagType, class Schedule >
-  inline static
-  typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
-  exec_team( const FunctorType & functor , Member member )
-    {
-      #pragma omp barrier
-      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
-        functor( member );
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
-  template< class TagType, class Schedule >
+
+  template< class TagType >
   inline static
-  typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
-  exec_team( const FunctorType & functor , Member member )
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      #pragma omp barrier
-      const TagType t{} ;
-      for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
-        functor( t , member );
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
@@ -587,31 +563,75 @@ public:
   inline
   void execute() const
     {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
       OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
 
-      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+      const size_t pool_reduce_size = 0 ; // Never shrinks
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
 
-      OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
+      OpenMPexec::resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
 
 #pragma omp parallel
       {
-        ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
-          ( m_functor
-          , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
+
+        const int active = data.organize_team( m_policy.team_size() );
+
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
+
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
+        }
+
+        if ( active ) {
+
+          std::pair<int64_t,int64_t> range(0,0);
+
+          do {
+
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelFor::template exec_team< WorkTag >
+              ( m_functor , data
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        }
+
+        data.disband_team();
       }
-/* END #pragma omp parallel */
+// END #pragma omp parallel
     }
 
+
   inline
   ParallelFor( const FunctorType & arg_functor ,
                const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
     {}
 };
 
+//----------------------------------------------------------------------------
 
 template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
@@ -622,20 +642,26 @@ class ParallelReduce< FunctorType
 {
 private:
 
+  enum { TEAM_REDUCE_SIZE = 512 };
+
   typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... >         Policy ;
 
-  typedef typename Policy::work_tag     WorkTag ;
-  typedef typename Policy::member_type  Member ;
+  typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
+
+  typedef typename Policy::work_tag             WorkTag ;
+  typedef typename Policy::schedule_type::type  SchedTag ;
+  typedef typename Policy::member_type          Member ;
+
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
+                            , FunctorType, ReducerType> ReducerConditional;
 
-  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
   typedef typename ReducerConditional::type ReducerTypeFwd;
 
-  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag >  ValueTraits ;
   typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag >  ValueInit ;
   typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag >  ValueJoin ;
 
-  typedef typename ValueTraits::pointer_type    pointer_type ;
-  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef typename Analysis::pointer_type    pointer_type ;
+  typedef typename Analysis::reference_type  reference_type ;
 
   const FunctorType  m_functor ;
   const Policy       m_policy ;
@@ -645,22 +671,48 @@ private:
 
   template< class TagType >
   inline static
-  typename std::enable_if< std::is_same< TagType , void >::value >::type
-  exec_team( const FunctorType & functor , Member member , reference_type update )
+  typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( member , update );
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
+
   template< class TagType >
   inline static
-  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
-  exec_team( const FunctorType & functor , Member member , reference_type update )
+  typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
+  exec_team( const FunctorType & functor
+           , HostThreadTeamData & data
+           , reference_type     & update
+           , const int league_rank_begin
+           , const int league_rank_end
+           , const int league_size )
     {
-      const TagType t{} ;
-      for ( ; member.valid_static() ; member.next_static() ) {
-        functor( t , member , update );
+      const TagType t{};
+
+      for ( int r = league_rank_begin ; r < league_rank_end ; ) {
+
+        functor( t , Member( data, r , league_size ) , update );
+
+        if ( ++r < league_rank_end ) {
+          // Don't allow team members to lap one another
+          // so that they don't overwrite shared memory.
+          if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
+        }
       }
     }
 
@@ -669,44 +721,89 @@ public:
   inline
   void execute() const
     {
+      enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
+
       OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+      const size_t pool_reduce_size =
+        Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
 
-      const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+      const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
+      const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
+      const size_t thread_local_size = 0 ; // Never shrinks
 
-      OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
+      OpenMPexec::resize_thread_data( pool_reduce_size
+                                    , team_reduce_size
+                                    , team_shared_size
+                                    , thread_local_size );
 
 #pragma omp parallel
       {
-        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        HostThreadTeamData & data = *OpenMPexec::get_thread_data();
 
-        ParallelReduce::template exec_team< WorkTag >
-          ( m_functor
-          , Member( exec , m_policy , m_shmem_size, 0 )
-          , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
-      }
-/* END #pragma omp parallel */
+        const int active = data.organize_team( m_policy.team_size() );
 
-      {
-        const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
-
-        int max_active_threads = OpenMPexec::pool_size();
-        if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
-          max_active_threads = m_policy.league_size()* m_policy.team_size();
+        if ( active ) {
+          data.set_work_partition( m_policy.league_size()
+                                 , ( 0 < m_policy.chunk_size()
+                                   ? m_policy.chunk_size()
+                                   : m_policy.team_iter() ) );
+        }
 
-        for ( int i = 1 ; i < max_active_threads ; ++i ) {
-          ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+        if ( is_dynamic ) {
+          // Must synchronize to make sure each team has set its
+          // partition before beginning the work stealing loop.
+          if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
         }
 
-        Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+        if ( active ) {
+          reference_type update =
+            ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
+
+          std::pair<int64_t,int64_t> range(0,0);
 
-        if ( m_result_ptr ) {
-          const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
+          do {
 
-          for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
+            range = is_dynamic ? data.get_work_stealing_chunk()
+                               : data.get_work_partition();
+
+            ParallelReduce::template exec_team< WorkTag >
+              ( m_functor , data , update
+              , range.first , range.second , m_policy.league_size() );
+
+          } while ( is_dynamic && 0 <= range.first );
+        } else {
+          ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
+                           , data.pool_reduce_local() );
         }
+
+        data.disband_team();
+      }
+// END #pragma omp parallel
+
+      // Reduction:
+
+      const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
+                       , ptr
+                       , OpenMPexec::get_thread_data(i)->pool_reduce_local() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
+
+      if ( m_result_ptr ) {
+        const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
+
+        for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
       }
     }
 
+  //----------------------------------------
+
   template< class ViewType >
   inline
   ParallelReduce( const FunctorType  & arg_functor ,
@@ -720,7 +817,10 @@ public:
     , m_policy(  arg_policy )
     , m_reducer( InvalidType() )
     , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) +
+                    arg_policy.scratch_size(1) +
+                    FunctorTeamShmemSize< FunctorType >
+                      ::value( arg_functor , arg_policy.team_size() ) )
     {}
 
   inline
@@ -731,7 +831,10 @@ public:
   , m_policy(  arg_policy )
   , m_reducer( reducer )
   , m_result_ptr(  reducer.result_view().data() )
-  , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+  , m_shmem_size( arg_policy.scratch_size(0) +
+                  arg_policy.scratch_size(1) +
+                  FunctorTeamShmemSize< FunctorType >
+                    ::value( arg_functor , arg_policy.team_size() ) )
   {
   /*static_assert( std::is_same< typename ViewType::memory_space
                           , Kokkos::HostSpace >::value
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
index 5b3e9873e17bc360f28a8338b7b59b69cf627ec3..9144d8c2799a7db81af0886aafcff1ebcd828833 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
@@ -46,6 +46,7 @@
 #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
 
 #include <impl/Kokkos_TaskQueue_impl.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -55,231 +56,214 @@ namespace Impl {
 
 template class TaskQueue< Kokkos::OpenMP > ;
 
-//----------------------------------------------------------------------------
-
-TaskExec< Kokkos::OpenMP >::
-TaskExec()
-  : m_self_exec( 0 )
-  , m_team_exec( 0 )
-  , m_sync_mask( 0 )
-  , m_sync_value( 0 )
-  , m_sync_step( 0 )
-  , m_group_rank( 0 )
-  , m_team_rank( 0 )
-  , m_team_size( 1 )
-{
-}
-
-TaskExec< Kokkos::OpenMP >::
-TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
-  : m_self_exec( & arg_exec )
-  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
-  , m_sync_mask( 0 )
-  , m_sync_value( 0 )
-  , m_sync_step( 0 )
-  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
-  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
-  , m_team_size(  arg_team_size )
-{
-  // This team spans
-  //    m_self_exec->pool_rev( team_size * group_rank )
-  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
-
-  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
-
-  sync[0] = int64_t(0) ;
-  sync[1] = int64_t(0) ;
-
-  for ( int i = 0 ; i < m_team_size ; ++i ) {
-    m_sync_value |= int64_t(1) << (8*i);
-    m_sync_mask  |= int64_t(3) << (8*i);
-  }
+class HostThreadTeamDataSingleton : private HostThreadTeamData {
+private:
+
+  HostThreadTeamDataSingleton() : HostThreadTeamData()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      const size_t num_pool_reduce_bytes  =   32 ;
+      const size_t num_team_reduce_bytes  =   32 ;
+      const size_t num_team_shared_bytes  = 1024 ;
+      const size_t num_thread_local_bytes = 1024 ;
+      const size_t alloc_bytes =
+        HostThreadTeamData::scratch_size( num_pool_reduce_bytes
+                                        , num_team_reduce_bytes
+                                        , num_team_shared_bytes
+                                        , num_thread_local_bytes );
+
+      HostThreadTeamData::scratch_assign
+        ( space.allocate( alloc_bytes )
+        , alloc_bytes
+        , num_pool_reduce_bytes
+        , num_team_reduce_bytes
+        , num_team_shared_bytes
+        , num_thread_local_bytes );
+    }
+
+  ~HostThreadTeamDataSingleton()
+    {
+      Kokkos::OpenMP::memory_space space ;
+      space.deallocate( HostThreadTeamData::scratch_buffer()
+                      , HostThreadTeamData::scratch_bytes() );
+    }
+
+public:
+
+  static HostThreadTeamData & singleton()
+    {
+      static HostThreadTeamDataSingleton s ;
+      return s ;
+    }
+};
 
-  Kokkos::memory_fence();
-}
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+//----------------------------------------------------------------------------
 
-void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
+void TaskQueueSpecialization< Kokkos::OpenMP >::execute
+  ( TaskQueue< Kokkos::OpenMP > * const queue )
 {
-  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
-    Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
-  }
+  using execution_space = Kokkos::OpenMP ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
-  // Use team shared memory to synchronize.
-  // Alternate memory locations between barriers to avoid a sequence
-  // of barriers overtaking one another.
+  static task_root_type * const end =
+    (task_root_type *) task_root_type::EndTag ;
 
-  int64_t volatile * const sync =
-    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+  HostThreadTeamData & team_data_single =
+    HostThreadTeamDataSingleton::singleton();
 
-  // This team member sets one byte within the sync variable
-  int8_t volatile * const sync_self =
-   ((int8_t *) sync) + m_team_rank ;
+  const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
+  // const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
 
 #if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
+fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
 fflush(stdout);
 #endif
 
-  *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
 
-  while ( m_sync_value != *sync ); // wait for team to arrive
+#pragma omp parallel
+  {
+    Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
 
-#if 0
-fprintf( stdout
-       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
-       , m_group_rank
-       , m_team_rank
-       , m_sync_step
-       , m_sync_value
-       , *sync
-       );
-fflush(stdout);
-#endif
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to ensure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
 
-  ++m_sync_step ;
+    if ( self.organize_team( team_size ) ) {
 
-  if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
-    m_sync_value ^= m_sync_mask ;
-    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
-  }
-}
+      Member single_exec( team_data_single );
+      Member team_exec( self );
 
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       );
+fflush(stdout);
 #endif
 
-//----------------------------------------------------------------------------
-
-void TaskQueueSpecialization< Kokkos::OpenMP >::execute
-  ( TaskQueue< Kokkos::OpenMP > * const queue )
-{
-  using execution_space = Kokkos::OpenMP ;
-  using queue_type      = TaskQueue< execution_space > ;
-  using task_root_type  = TaskBase< execution_space , void , void > ;
-  using PoolExec        = Kokkos::Impl::OpenMPexec ;
-  using Member          = TaskExec< execution_space > ;
+      // Loop until all queues are empty and no tasks in flight
 
-  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+      task_root_type * task = 0 ;
 
-  // Required:  team_size <= 8
+      do {
+        // Each team lead attempts to acquire either a thread team task
+        // or a single thread task for the team.
 
-  const int team_size = PoolExec::pool_size(2); // Threads per core
-  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+        if ( 0 == team_exec.team_rank() ) {
 
-  if ( 8 < team_size ) {
-    Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
-  }
+          bool leader_loop = false ;
 
-#pragma omp parallel
-  {
-    PoolExec & self = *PoolExec::get_thread_omp();
+          do {
 
-    Member single_exec ;
-    Member team_exec( self , team_size );
+            if ( 0 != task && end != task ) {
+              // team member #0 completes the previously executed task,
+              // completion may delete the task
+              queue->complete( task ); 
+            }
 
-    // Team shared memory
-    task_root_type * volatile * const task_shared =
-      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+            // If 0 == m_ready_count then set task = 0
 
-// Barrier across entire OpenMP thread pool to insure initialization
-#pragma omp barrier
+            task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
 
-    // Loop until all queues are empty and no tasks in flight
+            // Attempt to acquire a task
+            // Loop by priority and then type
+            for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+              for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+                task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
+              }
+            }
 
-    do {
+            // If tasks are still executing
+            // and no task could be acquired
+            // then continue this leader loop
+            leader_loop = end == task ;
 
-      task_root_type * task = 0 ;
+            if ( ( ! leader_loop ) &&
+                 ( 0 != task ) &&
+                 ( task_root_type::TaskSingle == task->m_task_type ) ) {
 
-      // Each team lead attempts to acquire either a thread team task
-      // or a single thread task for the team.
+              // if a single thread task then execute now
 
-      if ( 0 == team_exec.team_rank() ) {
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , int64_t(task)
+       );
+fflush(stdout);
+#endif
 
-        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+              (*task->m_apply)( task , & single_exec );
 
-        // Loop by priority and then type
-        for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
-          for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-            task = queue_type::pop_task( & queue->m_ready[i][j] );
-          }
+              leader_loop = true ;
+            }
+          } while ( leader_loop );
         }
-      }
-
-      // Team lead broadcast acquired task to team members:
-
-      if ( 1 < team_exec.team_size() ) {
-
-        if ( 0 == team_exec.team_rank() ) *task_shared = task ;
-
-        // Fence to be sure task_shared is stored before the barrier
-        Kokkos::memory_fence();
 
-        // Whole team waits for every team member to reach this statement
-        team_exec.team_barrier();
+        // Team lead either found 0 == m_ready_count or a team task
+        // Team lead broadcast acquired task:
 
-        // Fence to be sure task_shared is stored
-        Kokkos::memory_fence();
+        team_exec.team_broadcast( task , 0);
 
-        task = *task_shared ;
-      }
+        if ( 0 != task ) { // Thread Team Task
 
 #if 0
-fprintf( stdout
-       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
-       , team_exec.m_group_rank
-       , team_exec.m_team_rank
-       , uintptr_t(task_shared)
-       , uintptr_t(task)
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       , int64_t(task)
        );
 fflush(stdout);
 #endif
 
-      if ( 0 == task ) break ; // 0 == m_ready_count
-
-      if ( end == task ) {
-        // All team members wait for whole team to reach this statement.
-        // Is necessary to prevent task_shared from being updated
-        // before it is read by all threads.
-        team_exec.team_barrier();
-      }
-      else if ( task_root_type::TaskTeam == task->m_task_type ) {
-        // Thread Team Task
-        (*task->m_apply)( task , & team_exec );
+          (*task->m_apply)( task , & team_exec );
 
-        // The m_apply function performs a barrier
-
-        if ( 0 == team_exec.team_rank() ) {
-          // team member #0 completes the task, which may delete the task
-          queue->complete( task ); 
+          // The m_apply function performs a barrier
         }
-      }
-      else {
-        // Single Thread Task
+      } while( 0 != task );
 
-        if ( 0 == team_exec.team_rank() ) {
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
+       , self.pool_rank()
+       , self.pool_size()
+       , team_exec.team_rank()
+       , team_exec.team_size()
+       , team_exec.league_rank()
+       , team_exec.league_size()
+       );
+fflush(stdout);
+#endif
 
-          (*task->m_apply)( task , & single_exec );
+    }
 
-          queue->complete( task ); 
-        }
+    self.disband_team();
+
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
+       , self.pool_rank()
+       , self.pool_size()
+       );
+fflush(stdout);
+#endif
 
-        // All team members wait for whole team to reach this statement.
-        // Not necessary to complete the task.
-        // Is necessary to prevent task_shared from being updated
-        // before it is read by all threads.
-        team_exec.team_barrier();
-      }
-    } while(1);
   }
 // END #pragma omp parallel
 
+#if 0
+fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
+fflush(stdout);
+#endif
+
 }
 
 void TaskQueueSpecialization< Kokkos::OpenMP >::
@@ -289,13 +273,16 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
   using execution_space = Kokkos::OpenMP ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   if ( 1 == omp_get_num_threads() ) {
 
     task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-    Member single_exec ;
+    HostThreadTeamData & team_data_single =
+      HostThreadTeamDataSingleton::singleton();
+
+    Member single_exec( team_data_single );
 
     task_root_type * task = end ;
 
@@ -306,7 +293,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
       // Loop by priority and then type
       for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
         for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-          task = queue_type::pop_task( & queue->m_ready[i][j] );
+          task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
         }
       }
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
index 15dbb77c26c7432497417b0b27508b00d3d717af..3cfdf790bfb75165b936ce547828fd7f248f0b00 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp
@@ -60,6 +60,7 @@ public:
   using execution_space = Kokkos::OpenMP ;
   using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
   using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
 
   // Must specify memory space
   using memory_space = Kokkos::HostSpace ;
@@ -70,296 +71,19 @@ public:
   // Must provide task queue execution function
   static void execute( queue_type * const );
 
-  // Must provide mechanism to set function pointer in
-  // execution space from the host process.
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( task_base_type::function_type * ptr )
-    {
-      using TaskType = TaskBase< Kokkos::OpenMP
-                               , typename FunctorType::value_type
-                               , FunctorType
-                               > ;
-       *ptr = TaskType::apply ;
-    }
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
 };
 
 extern template class TaskQueue< Kokkos::OpenMP > ;
 
-//----------------------------------------------------------------------------
-
-template<>
-class TaskExec< Kokkos::OpenMP >
-{
-private:
-
-  TaskExec( TaskExec && ) = delete ;
-  TaskExec( TaskExec const & ) = delete ;
-  TaskExec & operator = ( TaskExec && ) = delete ;
-  TaskExec & operator = ( TaskExec const & ) = delete ;
-
-
-  using PoolExec = Kokkos::Impl::OpenMPexec ;
-
-  friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
-  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
-
-  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure 
-  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
-  int64_t          m_sync_mask ;
-  int64_t mutable  m_sync_value ;
-  int     mutable  m_sync_step ;
-  int              m_group_rank ; ///< Which "team" subset of thread pool
-  int              m_team_rank ;  ///< Which thread within a team
-  int              m_team_size ;
-
-  TaskExec();
-  TaskExec( PoolExec & arg_exec , int arg_team_size );
-
-  void team_barrier_impl() const ;
-
-public:
-
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-  void * team_shared() const
-    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
-
-  int team_shared_size() const
-    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
-
-  /**\brief  Whole team enters this function call
-   *         before any teeam member returns from
-   *         this function call.
-   */
-  void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
-#else
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
-  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
-#endif
-
-  KOKKOS_INLINE_FUNCTION
-  int team_rank() const { return m_team_rank ; }
-
-  KOKKOS_INLINE_FUNCTION
-  int team_size() const { return m_team_size ; }
-};
-
 }} /* namespace Kokkos::Impl */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
-TeamThreadRange
-  ( Impl::TaskExec< Kokkos::OpenMP > & thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
-}
-
-template<typename iType1, typename iType2>
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::TaskExec< Kokkos::OpenMP > >
-TeamThreadRange
-  ( Impl:: TaskExec< Kokkos::OpenMP > & thread, const iType1 & begin, const iType2 & end )
-{
-  typedef typename std::common_type<iType1, iType2>::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::OpenMP > >(thread, begin, end);
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
-ThreadVectorRange
-  ( Impl::TaskExec< Kokkos::OpenMP > & thread
-  , const iType & count )
-{
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
-}
-
-/** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team.
- * This functionality requires C++11 support.
-*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for
-  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
-  , const Lambda& lambda
-  )
-{
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i);
-  }
-}
-
-template<typename iType, class Lambda, typename ValueType>
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
-  , const Lambda& lambda
-  , ValueType& initialized_result)
-{
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i, result);
-  }
-
-  if ( 1 < loop_boundaries.thread.team_size() ) {
-
-    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-
-    loop_boundaries.thread.team_barrier();
-    shared[team_rank] = result;
-
-    loop_boundaries.thread.team_barrier();
-
-    // reduce across threads to thread 0
-    if (team_rank == 0) {
-      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
-        shared[0] += shared[i];
-      }
-    }
-
-    loop_boundaries.thread.team_barrier();
-
-    // broadcast result
-    initialized_result = shared[0];
-  }
-  else {
-    initialized_result = result ;
-  }
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i, result);
-  }
-
-  if ( 1 < loop_boundaries.thread.team_size() ) {
-    ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-
-    loop_boundaries.thread.team_barrier();
-    shared[team_rank] = result;
-
-    loop_boundaries.thread.team_barrier();
-
-    // reduce across threads to thread 0
-    if (team_rank == 0) {
-      for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
-        join(shared[0], shared[i]);
-      }
-    }
-
-    loop_boundaries.thread.team_barrier();
-
-    // broadcast result
-    initialized_result = shared[0];
-  }
-  else {
-    initialized_result = result ;
-  }
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-}
-
-template< typename ValueType, typename iType, class Lambda >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda)
-{
-  ValueType accum = 0 ;
-  ValueType val, local_total;
-  ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
-  int team_size = loop_boundaries.thread.team_size();
-  int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
-
-  // Intra-member scan
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-
-  shared[team_rank] = accum;
-  loop_boundaries.thread.team_barrier();
-
-  // Member 0 do scan on accumulated totals
-  if (team_rank == 0) {
-    for( iType i = 1; i < team_size; i+=1) {
-      shared[i] += shared[i-1];
-    }
-    accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
-  }
-
-  loop_boundaries.thread.team_barrier();
-
-  // Inter-member scan adding in accumulated totals
-  if (team_rank != 0) { accum = shared[team_rank-1]; }
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
-   const Lambda & lambda)
-{
-}
-
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
 
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
index 34cf581a4796feb2e8b3d8a3f57343148ac955d9..2d50c6e54886087deea707d0dbb155566ed51428 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@@ -86,7 +86,7 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
 int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
 
-OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 
 void OpenMPexec::verify_is_process( const char * const label )
 {
@@ -113,67 +113,110 @@ void OpenMPexec::verify_initialized( const char * const label )
 
 }
 
-void OpenMPexec::clear_scratch()
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void OpenMPexec::clear_thread_data()
 {
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
+
+  const int old_alloc_bytes =
+    m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
+
+  Kokkos::HostSpace space ;
+
 #pragma omp parallel
   {
-    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
-    if ( m_pool[ rank_rev ] ) {
-      Record * const r = Record::get_record( m_pool[ rank_rev ] );
-      m_pool[ rank_rev ] = 0 ;
-      Record::decrement( r );
+    const int rank = m_map_rank[ omp_get_thread_num() ];
+
+    if ( 0 != m_pool[rank] ) {
+
+      m_pool[rank]->disband_pool();
+
+      space.deallocate( m_pool[rank] , old_alloc_bytes );
+
+      m_pool[rank] = 0 ;
     }
   }
 /* END #pragma omp parallel */
 }
 
-void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
+void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
 {
-  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
-  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
+  const size_t member_bytes =
+    sizeof(int64_t) *
+    HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
 
-  const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
-  const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
+  HostThreadTeamData * root = m_pool[0] ;
 
-  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
-  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  const size_t old_pool_reduce  = root ? root->pool_reduce_bytes() : 0 ;
+  const size_t old_team_reduce  = root ? root->team_reduce_bytes() : 0 ;
+  const size_t old_team_shared  = root ? root->team_shared_bytes() : 0 ;
+  const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
+  const size_t old_alloc_bytes  = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
 
-  // Requesting allocation and old allocation is too small:
+  // Allocate if any of the old allocations is too small:
 
-  const bool allocate = ( old_reduce_size < reduce_size ) ||
-                        ( old_thread_size < thread_size );
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
 
   if ( allocate ) {
-    if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
-    if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
-  }
 
-  const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
-  const int    pool_size  = m_pool_topo[0] ;
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
 
-  if ( allocate ) {
+    const size_t alloc_bytes =
+      member_bytes +
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    const int pool_size = omp_get_max_threads();
 
-    clear_scratch();
+    Kokkos::HostSpace space ;
 
 #pragma omp parallel
     {
-      const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-      const int rank     = pool_size - ( rank_rev + 1 );
+      const int rank = m_map_rank[ omp_get_thread_num() ];
 
-      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+      if ( 0 != m_pool[rank] ) {
 
-      Record * const r = Record::allocate( Kokkos::HostSpace()
-                                         , "openmp_scratch"
-                                         , alloc_size );
+        m_pool[rank]->disband_pool();
 
-      Record::increment( r );
+        space.deallocate( m_pool[rank] , old_alloc_bytes );
+      }
+
+      void * const ptr = space.allocate( alloc_bytes );
 
-      m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
+      m_pool[ rank ] = new( ptr ) HostThreadTeamData();
 
-      new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
+      m_pool[ rank ]->
+        scratch_assign( ((char *)ptr) + member_bytes
+                      , alloc_bytes
+                      , pool_reduce_bytes
+                      , team_reduce_bytes
+                      , team_shared_bytes
+                      , thread_local_bytes );
     }
 /* END #pragma omp parallel */
+
+    HostThreadTeamData::organize_pool( m_pool , pool_size );
   }
 }
 
@@ -197,14 +240,14 @@ void OpenMP::initialize( unsigned thread_count ,
   // Before any other call to OMP query the maximum number of threads
   // and save the value for re-initialization unit testing.
 
-  //Using omp_get_max_threads(); is problematic in conjunction with
-  //Hwloc on Intel (essentially an initial call to the OpenMP runtime
-  //without a parallel region before will set a process mask for a single core
-  //The runtime will than bind threads for a parallel region to other cores on the
-  //entering the first parallel region and make the process mask the aggregate of
-  //the thread masks. The intend seems to be to make serial code run fast, if you
-  //compile with OpenMP enabled but don't actually use parallel regions or so
-  //static int omp_max_threads = omp_get_max_threads();
+  // Using omp_get_max_threads(); is problematic in conjunction with
+  // Hwloc on Intel (essentially an initial call to the OpenMP runtime
+  // without a parallel region before will set a process mask for a single core).
+  // The runtime will then bind threads for a parallel region to other cores on
+  // entering the first parallel region and make the process mask the aggregate of
+  // the thread masks. The intent seems to be to make serial code run fast if you
+  // compile with OpenMP enabled but don't actually use parallel regions.
+  // static int omp_max_threads = omp_get_max_threads();
   int nthreads = 0;
   #pragma omp parallel
   {
@@ -268,8 +311,6 @@ void OpenMP::initialize( unsigned thread_count ,
         // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
         // Call to 'new' may not be thread safe as well.
 
-        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
-
         const unsigned omp_rank    = omp_get_thread_num();
         const unsigned thread_r    = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
                                    ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
@@ -286,7 +327,19 @@ void OpenMP::initialize( unsigned thread_count ,
       Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
       Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
 
-      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
+      // New, unified host thread team data:
+      {
+        size_t pool_reduce_bytes  =   32 * thread_count ;
+        size_t team_reduce_bytes  =   32 * thread_count ;
+        size_t team_shared_bytes  = 1024 * thread_count ;
+        size_t thread_local_bytes = 1024 ;
+
+        Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
+                                            , team_reduce_bytes
+                                            , team_shared_bytes
+                                            , thread_local_bytes
+                                            );
+      }
     }
   }
 
@@ -309,7 +362,7 @@ void OpenMP::initialize( unsigned thread_count ,
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -321,7 +374,8 @@ void OpenMP::finalize()
   Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
   Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
 
-  Impl::OpenMPexec::clear_scratch();
+  // New, unified host thread team data:
+  Impl::OpenMPexec::clear_thread_data();
 
   Impl::OpenMPexec::m_pool_topo[0] = 0 ;
   Impl::OpenMPexec::m_pool_topo[1] = 0 ;
@@ -333,7 +387,7 @@ void OpenMP::finalize()
     hwloc::unbind_this_thread();
   }
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
index 63f7234da3a81a5e040f76e264377156cf024bb0..39ace3131927d8071c50fc44dedb046bf598f0de 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@@ -44,13 +44,22 @@
 #ifndef KOKKOS_OPENMPEXEC_HPP
 #define KOKKOS_OPENMPEXEC_HPP
 
+#include <Kokkos_OpenMP.hpp>
+
 #include <impl/Kokkos_Traits.hpp>
-#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
+
 #include <iostream>
 #include <sstream>
 #include <fstream>
+
+#include <omp.h>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
 namespace Kokkos {
 namespace Impl {
 
@@ -60,41 +69,19 @@ namespace Impl {
 class OpenMPexec {
 public:
 
+  friend class Kokkos::OpenMP ;
+
   enum { MAX_THREAD_COUNT = 4096 };
 
 private:
 
-  static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
-
   static int          m_pool_topo[ 4 ];
   static int          m_map_rank[ MAX_THREAD_COUNT ];
 
-  friend class Kokkos::OpenMP ;
-
-  int const  m_pool_rank ;
-  int const  m_pool_rank_rev ;
-  int const  m_scratch_exec_end ;
-  int const  m_scratch_reduce_end ;
-  int const  m_scratch_thread_end ;
-
-  int volatile  m_barrier_state ;
-
-  // Members for dynamic scheduling
-  // Which thread am I stealing from currently
-  int m_current_steal_target;
-  // This thread's owned work_range
-  Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
-  // Team Offset if one thread determines work_range for others
-  long m_team_work_index;
+  static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
 
-  // Is this thread stealing (i.e. its owned work_range is exhausted
-  bool m_stealing;
-
-  OpenMPexec();
-  OpenMPexec( const OpenMPexec & );
-  OpenMPexec & operator = ( const OpenMPexec & );
-
-  static void clear_scratch();
+  static
+  void clear_thread_data();
 
 public:
 
@@ -108,47 +95,9 @@ public:
   inline static
   int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
 
-  inline static
-  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
-
-  inline int pool_rank() const { return m_pool_rank ; }
-  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
-
-  inline long team_work_index() const { return m_team_work_index ; }
-
-  inline int scratch_reduce_size() const
-    { return m_scratch_reduce_end - m_scratch_exec_end ; }
-
-  inline int scratch_thread_size() const
-    { return m_scratch_thread_end - m_scratch_reduce_end ; }
-
-  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
-  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
-
-  inline
-  void state_wait( int state )
-    { Impl::spinwait( m_barrier_state , state ); }
-
-  inline
-  void state_set( int state ) { m_barrier_state = state ; }
-
-  ~OpenMPexec() {}
-
-  OpenMPexec( const int arg_poolRank
-            , const int arg_scratch_exec_size
-            , const int arg_scratch_reduce_size
-            , const int arg_scratch_thread_size )
-    : m_pool_rank( arg_poolRank )
-    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
-    , m_scratch_exec_end( arg_scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end   + arg_scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
-    , m_barrier_state(0)
-    {}
-
   static void finalize();
 
-  static void initialize( const unsigned  team_count ,
+  static void initialize( const unsigned team_count ,
                           const unsigned threads_per_team ,
                           const unsigned numa_count ,
                           const unsigned cores_per_numa );
@@ -156,133 +105,20 @@ public:
   static void verify_is_process( const char * const );
   static void verify_initialized( const char * const );
 
-  static void resize_scratch( size_t reduce_size , size_t thread_size );
 
-  inline static
-  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
+  static
+  void resize_thread_data( size_t pool_reduce_bytes
+                         , size_t team_reduce_bytes
+                         , size_t team_shared_bytes
+                         , size_t thread_local_bytes );
 
-  /* Dynamic Scheduling related functionality */
-  // Initialize the work range for this thread
-  inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
-    m_work_range.first = (begin+chunk_size-1)/chunk_size;
-    m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
-  }
-
-  // Claim and index from this thread's range from the beginning
-  inline long get_work_index_begin () {
-    Kokkos::pair<long,long> work_range_new = m_work_range;
-    Kokkos::pair<long,long> work_range_old = work_range_new;
-    if(work_range_old.first>=work_range_old.second)
-      return -1;
-
-    work_range_new.first+=1;
-
-    bool success = false;
-    while(!success) {
-      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
-      success = ( (work_range_new == work_range_old) ||
-                  (work_range_new.first>=work_range_new.second));
-      work_range_old = work_range_new;
-      work_range_new.first+=1;
-    }
-    if(work_range_old.first<work_range_old.second)
-      return work_range_old.first;
-    else
-      return -1;
-  }
-
-  // Claim and index from this thread's range from the end
-  inline long get_work_index_end () {
-    Kokkos::pair<long,long> work_range_new = m_work_range;
-    Kokkos::pair<long,long> work_range_old = work_range_new;
-    if(work_range_old.first>=work_range_old.second)
-      return -1;
-    work_range_new.second-=1;
-    bool success = false;
-    while(!success) {
-      work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
-      success = ( (work_range_new == work_range_old) ||
-                  (work_range_new.first>=work_range_new.second) );
-      work_range_old = work_range_new;
-      work_range_new.second-=1;
-    }
-    if(work_range_old.first<work_range_old.second)
-      return work_range_old.second-1;
-    else
-      return -1;
-  }
-
-  // Reset the steal target
-  inline void reset_steal_target() {
-    m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
-    m_stealing = false;
-  }
-
-  // Reset the steal target
-  inline void reset_steal_target(int team_size) {
-    m_current_steal_target = (m_pool_rank_rev+team_size);
-    if(m_current_steal_target>=m_pool_topo[0])
-      m_current_steal_target = 0;//m_pool_topo[0]-1;
-    m_stealing = false;
-  }
-
-  // Get a steal target; start with my-rank + 1 and go round robin, until arriving at this threads rank
-  // Returns -1 fi no active steal target available
-  inline int get_steal_target() {
-    while(( m_pool[m_current_steal_target]->m_work_range.second <=
-            m_pool[m_current_steal_target]->m_work_range.first  ) &&
-          (m_current_steal_target!=m_pool_rank) ) {
-      m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
-    }
-    if(m_current_steal_target == m_pool_rank)
-      return -1;
-    else
-      return m_current_steal_target;
-  }
-
-  inline int get_steal_target(int team_size) {
-
-    while(( m_pool[m_current_steal_target]->m_work_range.second <=
-            m_pool[m_current_steal_target]->m_work_range.first  ) &&
-          (m_current_steal_target!=m_pool_rank_rev) ) {
-      if(m_current_steal_target + team_size < m_pool_topo[0])
-        m_current_steal_target = (m_current_steal_target+team_size);
-      else
-        m_current_steal_target = 0;
-    }
-
-    if(m_current_steal_target == m_pool_rank_rev)
-      return -1;
-    else
-      return m_current_steal_target;
-  }
-
-  inline long steal_work_index (int team_size = 0) {
-    long index = -1;
-    int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
-    while ( (steal_target != -1) && (index == -1)) {
-      index = m_pool[steal_target]->get_work_index_end();
-      if(index == -1)
-        steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
-    }
-    return index;
-  }
-
-  // Get a work index. Claim from owned range until its exhausted, then steal from other thread
-  inline long get_work_index (int team_size = 0) {
-    long work_index = -1;
-    if(!m_stealing) work_index = get_work_index_begin();
-
-    if( work_index == -1) {
-      memory_fence();
-      m_stealing = true;
-      work_index = steal_work_index(team_size);
-    }
-    m_team_work_index = work_index;
-    memory_fence();
-    return work_index;
-  }
+  inline static
+  HostThreadTeamData * get_thread_data() noexcept
+    { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
 
+  inline static
+  HostThreadTeamData * get_thread_data( int i ) noexcept
+    { return m_pool[i]; }
 };
 
 } // namespace Impl
@@ -294,356 +130,6 @@ public:
 namespace Kokkos {
 namespace Impl {
 
-class OpenMPexecTeamMember {
-public:
-
-  enum { TEAM_REDUCE_SIZE = 512 };
-
-  /** \brief  Thread states for team synchronization */
-  enum { Active = 0 , Rendezvous = 1 };
-
-  typedef Kokkos::OpenMP                         execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-  Impl::OpenMPexec    & m_exec ;
-  scratch_memory_space  m_team_shared ;
-  int                   m_team_scratch_size[2] ;
-  int                   m_team_base_rev ;
-  int                   m_team_rank_rev ;
-  int                   m_team_rank ;
-  int                   m_team_size ;
-  int                   m_league_rank ;
-  int                   m_league_end ;
-  int                   m_league_size ;
-
-  int                   m_chunk_size;
-  int                   m_league_chunk_end;
-  Impl::OpenMPexec    & m_team_lead_exec ;
-  int                   m_invalid_thread;
-  int                   m_team_alloc;
-
-  // Fan-in team threads, root of the fan-in which does not block returns true
-  inline
-  bool team_fan_in() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-
-        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
-      }
-
-      if ( m_team_rank_rev ) {
-        m_exec.state_set( Rendezvous );
-        memory_fence();
-        m_exec.state_wait( Rendezvous );
-      }
-
-      return 0 == m_team_rank_rev ;
-    }
-
-  inline
-  void team_fan_out() const
-    {
-      memory_fence();
-      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
-        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
-        memory_fence();
-      }
-    }
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_shmem() const
-    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
-
-  KOKKOS_INLINE_FUNCTION
-  const execution_space::scratch_memory_space& thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    {}
-#else
-    {
-      if ( 1 < m_team_size && !m_invalid_thread) {
-        team_fan_in();
-        team_fan_out();
-      }
-    }
-#endif
-
-  template<class ValueType>
-  KOKKOS_INLINE_FUNCTION
-  void team_broadcast(ValueType& value, const int& thread_id) const
-  {
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { }
-#else
-    // Make sure there is enough scratch space:
-    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
-                         , ValueType , void >::type type ;
-
-    type volatile * const shared_value =
-      ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-
-    if ( team_rank() == thread_id ) *shared_value = value;
-    memory_fence();
-    team_barrier(); // Wait for 'thread_id' to write
-    value = *shared_value ;
-    team_barrier(); // Wait for team members to read
-#endif
-  }
-
-  template< class ValueType, class JoinOp >
-  KOKKOS_INLINE_FUNCTION ValueType
-    team_reduce( const ValueType & value
-               , const JoinOp & op_in ) const
-  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ValueType(); }
-  #else
-    {
-      memory_fence();
-      typedef ValueType value_type;
-      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
-  #endif
-#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
-                           , value_type , void >::type type ;
-
-      type * const local_value = ((type*) m_exec.scratch_thread());
-
-      // Set this thread's contribution
-      *local_value = value ;
-
-      // Fence to make sure the base team member has access:
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        type * const team_value  = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
-
-        // Join to the team value:
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
-        }
-        memory_fence();
-
-        // The base team member may "lap" the other team members,
-        // copy to their local value before proceeding.
-        for ( int i = 1 ; i < m_team_size ; ++i ) {
-          *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
-        }
-
-        // Fence to make sure all team members have access
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *((type volatile const *)local_value);
-    }
-#endif
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename ArgType >
-  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return ArgType(); }
-#else
-    {
-      // Make sure there is enough scratch space:
-      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
-
-      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
-
-      *work_value = value ;
-
-      memory_fence();
-
-      if ( team_fan_in() ) {
-        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
-        // m_team_base[0]                 == highest ranking team member
-        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
-        //
-        // 1) copy from lower to higher rank, initialize lowest rank to zero
-        // 2) prefix sum from lowest to highest rank, skipping lowest rank
-
-        type accum = 0 ;
-
-        if ( global_accum ) {
-          for ( int i = m_team_size ; i-- ; ) {
-            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-            accum += val ;
-          }
-          accum = atomic_fetch_add( global_accum , accum );
-        }
-
-        for ( int i = m_team_size ; i-- ; ) {
-          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
-          const type offset = accum ;
-          accum += val ;
-          val = offset ;
-        }
-
-        memory_fence();
-      }
-
-      team_fan_out();
-
-      return *work_value ;
-    }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-    { return this-> template team_scan<Type>( value , 0 ); }
-
-  //----------------------------------------
-  // Private for the driver
-
-private:
-
-  typedef execution_space::scratch_memory_space space ;
-
-public:
-
-  template< class ... Properties >
-  inline
-  OpenMPexecTeamMember( Impl::OpenMPexec & exec
-                      , const TeamPolicyInternal< OpenMP, Properties ...> & team
-                      , const int shmem_size_L1
-                      , const int shmem_size_L2
-                      )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
-    , m_team_base_rev(0)
-    , m_team_rank_rev(0)
-    , m_team_rank(0)
-    , m_team_size( team.team_size() )
-    , m_league_rank(0)
-    , m_league_end(0)
-    , m_league_size( team.league_size() )
-    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
-    , m_league_chunk_end(0)
-    , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
-    , m_team_alloc( team.team_alloc())
-    {
-      const int pool_rank_rev        = m_exec.pool_rank_rev();
-      const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
-      const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
-      const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
-      const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
-            int league_iter_end      = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
-            int league_iter_begin    = league_iter_end - chunks_per_team * m_chunk_size;
-      if (league_iter_begin < 0)     league_iter_begin = 0;
-      if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
-
-      if ((team.team_alloc()>m_team_size)?
-          (pool_team_rank_rev >= m_team_size):
-          (m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
-         )
-        m_invalid_thread = 1;
-      else
-        m_invalid_thread = 0;
-
-      m_team_rank_rev  = pool_team_rank_rev ;
-      if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
-        m_team_base_rev  = team.team_alloc() * pool_league_rank_rev ;
-        m_team_rank_rev  = pool_team_rank_rev ;
-        m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
-        m_league_end     = league_iter_end ;
-        m_league_rank    = league_iter_begin ;
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                               0 );
-      }
-
-      if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
-        m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
-        m_exec.reset_steal_target(m_team_size);
-      }
-    }
-
-  bool valid_static() const
-    {
-      return m_league_rank < m_league_end ;
-    }
-
-  void next_static()
-    {
-      if ( m_league_rank < m_league_end ) {
-        team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                               0);
-      }
-      m_league_rank++;
-    }
-
-  bool valid_dynamic() {
-    if(m_invalid_thread)
-      return false;
-    if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
-      return true;
-    }
-
-    if (  m_team_rank_rev == 0 ) {
-      m_team_lead_exec.get_work_index(m_team_alloc);
-    }
-    team_barrier();
-
-    long work_index = m_team_lead_exec.team_work_index();
-
-    m_league_rank = work_index * m_chunk_size;
-    m_league_chunk_end = (work_index +1 ) * m_chunk_size;
-
-    if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
-
-    if(m_league_rank>=0)
-      return true;
-    return false;
-  }
-
-  void next_dynamic() {
-    if(m_invalid_thread)
-      return;
-
-    if ( m_league_rank < m_league_chunk_end ) {
-      team_barrier();
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
-                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
-                                             0);
-    }
-    m_league_rank++;
-  }
-
-  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
-};
-
 template< class ... Properties >
 class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
 {
@@ -671,8 +157,11 @@ public:
 
   template< class FunctorType >
   inline static
-  int team_size_max( const FunctorType & )
-    { return traits::execution_space::thread_pool_size(1); }
+  int team_size_max( const FunctorType & ) {  // the functor argument is unnamed and unused
+      int pool_size = traits::execution_space::thread_pool_size(1);
+      int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      return pool_size<max_host_team_size?pool_size:max_host_team_size;  // smaller of the two limits
+    }
 
   template< class FunctorType >
   inline static
@@ -702,7 +191,8 @@ private:
                   , const int team_size_request )
     {
       const int pool_size  = traits::execution_space::thread_pool_size(0);
-      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size<max_host_team_size?pool_size:max_host_team_size;
       const int team_grain = traits::execution_space::thread_pool_size(2);
 
       m_league_size = league_size_request ;
@@ -823,7 +313,7 @@ private:
   }
 
 public:
-  typedef Impl::OpenMPexecTeamMember member_type ;
+  typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
 };
 } // namespace Impl
 
@@ -850,216 +340,6 @@ int OpenMP::thread_pool_rank()
 #endif
 }
 
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >
-TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType& count ) {
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, count );
-}
-
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::OpenMPexecTeamMember >
-TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType1& begin, const iType2& end ) {
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, iType(begin), iType(end) );
-}
-
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
-ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
-}
-
-KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
-  return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
-}
-
 } // namespace Kokkos
 
-namespace Kokkos {
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, ValueType& result) {
-
-  result = ValueType();
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-
-  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
-                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-
-  init_result = loop_boundaries.thread.team_reduce(result,join);
-}
-
-} //namespace Kokkos
-
-namespace Kokkos {
-/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
- * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-    loop_boundaries, const Lambda& lambda) {
-  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-  #pragma ivdep
-  #endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
- * val is performed and put into result. This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, ValueType& result) {
-  result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
-  }
-}
-
-/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
- * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
- * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
- * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
- * '1 for *'). This functionality requires C++11 support.*/
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
-
-  ValueType result = init_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  init_result = result;
-}
-
-/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
- *          for each i=0..N-1.
- *
- * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
- * Depending on the target execution space the operator might be called twice: once with final=false
- * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
- * "i" needs to be added to val no matter whether final==true or not. In a serial execution
- * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
- * to the final sum value over all vector lanes.
- * This functionality requires C++11 support.*/
-template< typename iType, class FunctorType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
-      loop_boundaries, const FunctorType & lambda) {
-
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
-  typedef typename ValueTraits::value_type value_type ;
-
-  value_type scan_val = value_type();
-
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    lambda(i,scan_val,true);
-  }
-}
-
-} // namespace Kokkos
-
-namespace Kokkos {
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  lambda();
-}
-
-template<class FunctorType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
-  if(single_struct.team_member.team_rank()==0) lambda();
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  lambda(val);
-}
-
-template<class FunctorType, class ValueType>
-KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
-  if(single_struct.team_member.team_rank()==0) {
-    lambda(val);
-  }
-  single_struct.team_member.team_broadcast(val,0);
-}
-}
-
 #endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
deleted file mode 100644
index b4df5e35bb7897b7e7bdf76acb4f2bc4d9a9fe77..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
+++ /dev/null
@@ -1,511 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-// 
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-// 
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-// 
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
-// ************************************************************************
-//@HEADER
-*/
-
-#include <Kokkos_Core_fwd.hpp>
-
-#if defined( KOKKOS_ENABLE_QTHREAD )
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <iostream>
-#include <sstream>
-#include <utility>
-#include <Kokkos_Qthread.hpp>
-#include <Kokkos_Atomic.hpp>
-#include <impl/Kokkos_Error.hpp>
-
-// Defines to enable experimental Qthread functionality
-
-#define QTHREAD_LOCAL_PRIORITY
-#define CLONED_TASKS
-
-#include <qthread/qthread.h>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
-
-/** s_exec is indexed by the reverse rank of the workers
- *  for faster fan-in / fan-out lookups
- *  [ n - 1 , n - 2 , ... , 0 ]
- */
-QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
-
-int  s_number_shepherds            = 0 ;
-int  s_number_workers_per_shepherd = 0 ;
-int  s_number_workers              = 0 ;
-
-inline
-QthreadExec ** worker_exec()
-{
-  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
-}
-
-const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
-
-int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
-int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
-int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
-
-QthreadExecFunctionPointer volatile s_active_function = 0 ;
-const void               * volatile s_active_function_arg = 0 ;
-
-} /* namespace */
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-
-int Qthread::is_initialized()
-{
-  return Impl::s_number_workers != 0 ;
-}
-
-int Qthread::concurrency()
-{
-  return Impl::s_number_workers_per_shepherd ;
-}
-
-int Qthread::in_parallel()
-{
-  return Impl::s_active_function != 0 ;
-}
-
-void Qthread::initialize( int thread_count )
-{
-  // Environment variable: QTHREAD_NUM_SHEPHERDS
-  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
-  // Environment variable: QTHREAD_HWPAR
-
-  {
-    char buffer[256];
-    snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
-    putenv(buffer);
-  }
-
-  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
-                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
-                       ( thread_count    == qthread_num_workers() );
-
-  bool ok_symmetry = true ;
-
-  if ( ok_init ) {
-    Impl::s_number_shepherds            = qthread_num_shepherds();
-    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
-    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
-
-    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
-      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
-    }
-  }
-
-  if ( ! ok_init || ! ok_symmetry ) {
-    std::ostringstream msg ;
-
-    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
-    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
-    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
-    msg << " : qthread_num_workers = " << qthread_num_workers();
-
-    if ( ! ok_symmetry ) {
-      msg << " : qthread_num_workers_local = {" ;
-      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
-        msg << " " << qthread_num_workers_local(i) ;
-      }
-      msg << " }" ;
-    }
-
-    Impl::s_number_workers   = 0 ;
-    Impl::s_number_shepherds = 0 ;
-    Impl::s_number_workers_per_shepherd = 0 ;
-
-    if ( ok_init ) { qthread_finalize(); }
-
-    Kokkos::Impl::throw_runtime_exception( msg.str() );
-  }
-
-  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
-
-  // Init the array for used for arbitrarily sized atomics
-  Impl::init_lock_array_host_space();
-
-}
-
-void Qthread::finalize()
-{
-  Impl::QthreadExec::clear_workers();
-
-  if ( Impl::s_number_workers ) {
-    qthread_finalize();
-  }
-
-  Impl::s_number_workers    = 0 ;
-  Impl::s_number_shepherds  = 0 ;
-  Impl::s_number_workers_per_shepherd = 0 ;
-}
-
-void Qthread::print_configuration( std::ostream & s , const bool detail )
-{
-  s << "Kokkos::Qthread {"
-    << " num_shepherds(" << Impl::s_number_shepherds << ")"
-    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
-    << " }" << std::endl ;
-}
-
-Qthread & Qthread::instance( int )
-{
-  static Qthread q ;
-  return q ;
-}
-
-void Qthread::fence()
-{
-}
-
-int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
-int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-namespace {
-
-aligned_t driver_exec_all( void * arg )
-{
-  QthreadExec & exec = **worker_exec();
-
-  (*s_active_function)( exec , s_active_function_arg );
-
-/*
-  fprintf( stdout
-         , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
-         , exec.worker_rank()
-         , exec.worker_size()
-         , exec.shepherd_rank()
-         , exec.shepherd_size()
-         , exec.shepherd_worker_rank()
-         , exec.shepherd_worker_size()
-         );
-  fflush(stdout);
-*/
-
-  return 0 ;
-}
-
-aligned_t driver_resize_worker_scratch( void * arg )
-{
-  static volatile int lock_begin = 0 ;
-  static volatile int lock_end   = 0 ;
-
-  QthreadExec ** const exec = worker_exec();
-
-  //----------------------------------------
-  // Serialize allocation for thread safety
-
-  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
-
-  const bool ok = 0 == *exec ;
-
-  if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
-
-  lock_begin = 0 ; // release lock
-
-  if ( ok ) { new( *exec ) QthreadExec(); }
-
-  //----------------------------------------
-  // Wait for all calls to complete to insure that each worker has executed.
-
-  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
-
-  while ( lock_end );
-
-/*
-  fprintf( stdout
-         , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
-         , (**exec).worker_rank()
-         , (**exec).worker_size()
-         , (**exec).shepherd_rank()
-         , (**exec).shepherd_size()
-         , (**exec).shepherd_worker_rank()
-         , (**exec).shepherd_worker_size()
-         );
-  fflush(stdout);
-*/
-
-  //----------------------------------------
-
-  if ( ! ok ) {
-    fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
-    fflush( stderr );
-  }
-
-  return 0 ;
-}
-
-void verify_is_process( const char * const label , bool not_active = false )
-{
-  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
-  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );
-
-  if ( not_process || is_active ) {
-    std::string msg( label );
-    msg.append( " : FAILED" );
-    if ( not_process ) msg.append(" : not called by main process");
-    if ( is_active )   msg.append(" : parallel execution in progress");
-    Kokkos::Impl::throw_runtime_exception( msg );
-  }
-}
-
-}
-
-int QthreadExec::worker_per_shepherd()
-{
-  return s_number_workers_per_shepherd ;
-}
-
-QthreadExec::QthreadExec()
-{
-  const int shepherd_rank        = qthread_shep();
-  const int shepherd_worker_rank = qthread_worker_local(NULL);
-  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
-
-  m_worker_base          = s_exec ;
-  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
-  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
-  m_reduce_end           = s_worker_reduce_end ;
-  m_shepherd_rank        = shepherd_rank ;
-  m_shepherd_size        = s_number_shepherds ;
-  m_shepherd_worker_rank = shepherd_worker_rank ;
-  m_shepherd_worker_size = s_number_workers_per_shepherd ;
-  m_worker_rank          = worker_rank ;
-  m_worker_size          = s_number_workers ;
-  m_worker_state         = QthreadExec::Active ;
-}
-
-void QthreadExec::clear_workers()
-{
-  for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
-    QthreadExec * const exec = s_exec[iwork] ;
-    s_exec[iwork] = 0 ;
-    free( exec );
-  }
-}
-
-void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
-{
-  new( & space )
-    Qthread::scratch_memory_space(
-      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
-      s_worker_shared_end - s_worker_shared_begin
-    );
-}
-
-void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
-{
-  const int exec_all_reduce_alloc = align_alloc( reduce_size );
-  const int shepherd_scan_alloc   = align_alloc( 8 );
-  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
-
-  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
-       s_worker_shared_end < shepherd_shared_end ) {
-
-/*
-  fprintf( stdout , "QthreadExec::resize\n");
-  fflush(stdout);
-*/
-
-    // Clear current worker memory before allocating new worker memory
-    clear_workers();
-
-    // Increase the buffers to an aligned allocation
-    s_worker_reduce_end   = exec_all_reduce_alloc ;
-    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
-    s_worker_shared_end   = shepherd_shared_end ;
-
-    // Need to query which shepherd this main 'process' is running...
- 
-    const int main_shep = qthread_shep();
-
-    // Have each worker resize its memory for proper first-touch
-#if 0
-    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
-      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
-    }}
-#else
-    // If this function is used before the 'qthread.task_policy' unit test
-    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
-    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
-
-      if ( num_clone ) {
-        const int ret = qthread_fork_clones_to_local_priority
-          ( driver_resize_worker_scratch   /* function */
-          , NULL                           /* function data block */
-          , NULL                           /* pointer to return value feb */
-          , jshep                          /* shepherd number */
-          , num_clone - 1                  /* number of instances - 1 */
-          );
-
-        assert(ret == QTHREAD_SUCCESS);
-      }
-    }
-#endif
-
-    driver_resize_worker_scratch( NULL );
-
-    // Verify all workers allocated
-
-    bool ok = true ;
-    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
-
-    if ( ! ok ) {
-      std::ostringstream msg ;
-      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
-      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
-         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
-      }
-      msg << " }" ;
-      Kokkos::Impl::throw_runtime_exception( msg.str() );
-    }
-  }
-}
-
-void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
-{
-  verify_is_process("QthreadExec::exec_all(...)",true);
-
-/*
-  fprintf( stdout , "QthreadExec::exec_all\n");
-  fflush(stdout);
-*/
-
-  s_active_function     = func ;
-  s_active_function_arg = arg ;
-
-  // Need to query which shepherd this main 'process' is running...
- 
-  const int main_shep = qthread_shep();
-
-#if 0
-  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
-    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
-  }}
-#else
-  // If this function is used before the 'qthread.task_policy' unit test
-  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
-  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
-    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
-
-    if ( num_clone ) {
-      const int ret = qthread_fork_clones_to_local_priority
-        ( driver_exec_all   /* function */
-        , NULL              /* function data block */
-        , NULL              /* pointer to return value feb */
-        , jshep             /* shepherd number */
-        , num_clone - 1     /* number of instances - 1 */
-        );
-
-      assert(ret == QTHREAD_SUCCESS);
-    }
-  }
-#endif
-
-  driver_exec_all( NULL );
-
-  s_active_function     = 0 ;
-  s_active_function_arg = 0 ;
-}
-
-void * QthreadExec::exec_all_reduce_result()
-{
-  return s_exec[0]->m_scratch_alloc ;
-}
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-namespace Kokkos {
-namespace Impl {
-
-QthreadTeamPolicyMember::QthreadTeamPolicyMember()
-  : m_exec( **worker_exec() )
-  , m_team_shared(0,0)
-  , m_team_size( 1 )
-  , m_team_rank( 0 )
-  , m_league_size(1)
-  , m_league_end(1)
-  , m_league_rank(0)
-{
-  m_exec.shared_reset( m_team_shared );
-}
-
-QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
-  : m_exec( **worker_exec() )
-  , m_team_shared(0,0)
-  , m_team_size( s_number_workers_per_shepherd )
-  , m_team_rank( m_exec.shepherd_worker_rank() )
-  , m_league_size(1)
-  , m_league_end(1)
-  , m_league_rank(0)
-{
-  m_exec.shared_reset( m_team_shared );
-}
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
-
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
deleted file mode 100644
index f948eb2903b631e82727e670e84339383d5891c9..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
+++ /dev/null
@@ -1,620 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#ifndef KOKKOS_QTHREADEXEC_HPP
-#define KOKKOS_QTHREADEXEC_HPP
-
-#include <impl/Kokkos_spinwait.hpp>
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-//----------------------------------------------------------------------------
-
-class QthreadExec ;
-
-typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
-
-class QthreadExec {
-private:
-
-  enum { Inactive = 0 , Active = 1 };
-
-  const QthreadExec * const * m_worker_base ;
-  const QthreadExec * const * m_shepherd_base ;
-
-  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
-  int     m_reduce_end ;     ///< End of scratch reduction memory
-
-  int     m_shepherd_rank ;
-  int     m_shepherd_size ;
-
-  int     m_shepherd_worker_rank ;
-  int     m_shepherd_worker_size ;
-
-  /*
-   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
-   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
-   */
-  int     m_worker_rank ;
-  int     m_worker_size ;
-
-  int mutable volatile m_worker_state ;
-
-
-  friend class Kokkos::Qthread ;
-
-  ~QthreadExec();
-  QthreadExec( const QthreadExec & );
-  QthreadExec & operator = ( const QthreadExec & );
-
-public:
-
-  QthreadExec();
-
-  /** Execute the input function on all available Qthread workers */
-  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
-
-  //----------------------------------------
-  /** Barrier across all workers participating in the 'exec_all' */
-  void exec_all_barrier() const
-    {
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  /** Barrier across workers within the shepherd with rank < team_rank */
-  void shepherd_barrier( const int team_size ) const
-    {
-      if ( m_shepherd_worker_rank < team_size ) {
-
-        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-        int n , j ;
-
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-        }
-
-        if ( rev_rank ) {
-          m_worker_state = QthreadExec::Inactive ;
-          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-        }
-
-        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-        }
-      }
-    }
-
-  //----------------------------------------
-  /** Reduce across all workers participating in the 'exec_all' */
-  template< class FunctorType , class ReducerType , class ArgTag >
-  inline
-  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
-    {
-      typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
-      typedef typename ReducerConditional::type ReducerTypeFwd;
-      typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        const QthreadExec & fan = *m_worker_base[j];
-
-        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
-
-        ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-  /** Scall across all workers participating in the 'exec_all' */
-  template< class FunctorType , class ArgTag >
-  inline
-  void exec_all_scan( const FunctorType & func ) const
-    {
-      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > ValueInit ;
-      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > ValueJoin ;
-      typedef Kokkos::Impl::FunctorValueOps<    FunctorType , ArgTag > ValueOps ;
-
-      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_worker_base[0] is the
-        // highest ranking thread.
-
-        // Copy from lower ranking to higher ranking worker.
-        for ( int i = 1 ; i < m_worker_size ; ++i ) {
-          ValueOps::copy( func
-                        , m_worker_base[i-1]->m_scratch_alloc
-                        , m_worker_base[i]->m_scratch_alloc
-                        );
-        }
-
-        ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
-
-        // Join from lower ranking to higher ranking worker.
-        // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
-        for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
-          ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
-        }
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
-        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-    }
-
-  //----------------------------------------
-
-  template< class Type>
-  inline
-  volatile Type * shepherd_team_scratch_value() const
-    { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
-
-  template< class Type >
-  inline
-  void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
-    {
-      if ( m_shepherd_base ) {
-        Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
-        memory_fence();
-        shepherd_barrier( team_size );
-        value = *shared_value ;
-      }
-    }
-
-  template< class Type >
-  inline
-  Type shepherd_reduce( const int team_size , const Type & value ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < n ; ++i ) {
-          accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-        for ( int i = 1 ; i < n ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  template< class JoinOp >
-  inline
-  typename JoinOp::value_type
-    shepherd_reduce( const int team_size
-                   , const typename JoinOp::value_type & value
-                   , const JoinOp & op ) const
-    {
-      typedef typename JoinOp::value_type Type ;
-
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
-        }
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  template< class Type >
-  inline
-  Type shepherd_scan( const int team_size
-                    , const Type & value
-                    ,       Type * const global_value = 0 ) const
-    {
-      *shepherd_team_scratch_value<Type>() = value ;
-
-      memory_fence();
-
-      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
-
-      int n , j ;
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
-      }
-
-      if ( rev_rank ) {
-        m_worker_state = QthreadExec::Inactive ;
-        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
-      }
-      else {
-        // Root thread scans across values before releasing threads
-        // Worker data is in reverse order, so m_shepherd_base[0] is the
-        // highest ranking thread.
-
-        // Copy from lower ranking to higher ranking worker.
-
-        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
-        for ( int i = 1 ; i < team_size ; ++i ) {
-          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-          accum += tmp ;
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
-        }
-
-        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
-          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
-
-        // Join from lower ranking to higher ranking worker.
-        for ( int i = team_size ; --i ; ) {
-          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
-        }
-
-        memory_fence();
-      }
-
-      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
-        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
-      }
-
-      return *shepherd_team_scratch_value<Type>();
-    }
-
-  //----------------------------------------
-
-  static inline
-  int align_alloc( int size )
-    {
-      enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */};
-      enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
-      return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
-    }
-
-  void shared_reset( Qthread::scratch_memory_space & );
-
-  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
-
-  static void * exec_all_reduce_result();
-
-  static void resize_worker_scratch( const int reduce_size , const int shared_size );
-  static void clear_workers();
-
-  //----------------------------------------
-
-  inline int worker_rank() const { return m_worker_rank ; }
-  inline int worker_size() const { return m_worker_size ; }
-  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
-  inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
-  inline int shepherd_rank() const { return m_shepherd_rank ; }
-  inline int shepherd_size() const { return m_shepherd_size ; }
-
-  static int worker_per_shepherd();
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-
-namespace Kokkos {
-namespace Impl {
-
-class QthreadTeamPolicyMember {
-private:
-
-  typedef Kokkos::Qthread                        execution_space ;
-  typedef execution_space::scratch_memory_space  scratch_memory_space ;
-
-
-        Impl::QthreadExec   & m_exec ;
-  scratch_memory_space        m_team_shared ;
-  const int                   m_team_size ;
-  const int                   m_team_rank ;
-  const int                   m_league_size ;
-  const int                   m_league_end ;
-        int                   m_league_rank ;
-
-public:
-
-  KOKKOS_INLINE_FUNCTION
-  const scratch_memory_space & team_shmem() const { return m_team_shared ; }
-
-  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
-  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    {}
-#else
-    { m_exec.shepherd_barrier( m_team_size ); }
-#endif
-
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
-#endif
-
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
-#endif
-
-  template< typename JoinOp >
-  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
-    team_reduce( const typename JoinOp::value_type & value
-               , const JoinOp & op ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return typename JoinOp::value_type(); }
-#else
-    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
-   *
-   *  The highest rank thread can compute the reduction total as
-   *    reduction_total = dev.team_scan( value ) + value ;
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
-#endif
-
-  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
-   *          with intra-team non-deterministic ordering accumulation.
-   *
-   *  The global inter-team accumulation value will, at the end of the
-   *  league's parallel execution, be the scan's total.
-   *  Parallel execution ordering of the league's teams is non-deterministic.
-   *  As such the base value for each team's scan operation is similarly
-   *  non-deterministic.
-   */
-  template< typename Type >
-  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
-#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-    { return Type(); }
-#else
-    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
-#endif
-
-  //----------------------------------------
-  // Private driver for task-team parallel
-
-  struct TaskTeam {};
-
-  QthreadTeamPolicyMember();
-  explicit QthreadTeamPolicyMember( const TaskTeam & );
-
-  //----------------------------------------
-  // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }
-
-  // Initialize
-  template< class ... Properties >
-  QthreadTeamPolicyMember( Impl::QthreadExec & exec
-                         , const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
-    : m_exec( exec )
-    , m_team_shared(0,0)
-    , m_team_size(   team.m_team_size )
-    , m_team_rank(   exec.shepherd_worker_rank() )
-    , m_league_size( team.m_league_size )
-    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
-    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
-  {
-    m_exec.shared_reset( m_team_shared );
-  }
-
-  // Continue
-  operator bool () const { return m_league_rank < m_league_end ; }
-
-  // iterate
-  void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
-};
-
-
-template< class ... Properties >
-class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
-  : public PolicyTraits< Properties... >
-{
-private:
-
-  const int m_league_size ;
-  const int m_team_size ;
-  const int m_shepherd_iter ;
-
-public:
-
-  //! Tag this class as a kokkos execution policy
-  typedef TeamPolicyInternal  execution_policy ;
-  typedef Qthread             execution_space ;
-  typedef PolicyTraits< Properties ... >  traits ;
-
-  //----------------------------------------
-
-  template< class FunctorType >
-  inline static
-  int team_size_max( const FunctorType & )
-    { return Qthread::instance().shepherd_worker_size(); }
-
-  template< class FunctorType >
-  static int team_size_recommended( const FunctorType & f )
-    { return team_size_max( f ); }
-
-  template< class FunctorType >
-  inline static
-  int team_size_recommended( const FunctorType & f , const int& )
-    { return team_size_max( f ); }
-
-  //----------------------------------------
-
-  inline int team_size()   const { return m_team_size ; }
-  inline int league_size() const { return m_league_size ; }
-
-  // One active team per shepherd
-  TeamPolicyInternal( Kokkos::Qthread & q
-                    , const int league_size
-                    , const int team_size
-                    , const int /* vector_length */ = 0
-                    )
-    : m_league_size( league_size )
-    , m_team_size( team_size < q.shepherd_worker_size()
-                 ? team_size : q.shepherd_worker_size() )
-    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
-    {
-    }
-
-  // One active team per shepherd
-  TeamPolicyInternal( const int league_size
-                    , const int team_size
-                    , const int /* vector_length */ = 0
-                    )
-    : m_league_size( league_size )
-    , m_team_size( team_size < Qthread::instance().shepherd_worker_size()
-                 ? team_size : Qthread::instance().shepherd_worker_size() )
-    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
-    {
-    }
-
-  typedef Impl::QthreadTeamPolicyMember member_type ;
-
-  friend class Impl::QthreadTeamPolicyMember ;
-};
-
-} /* namespace Impl */
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
-#endif /* #define KOKKOS_QTHREADEXEC_HPP */
-
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b92494084c10763ad60ba458888204bd2bd77a3
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
@@ -0,0 +1,519 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ENABLE_QTHREADS )
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <utility>
+
+#include <Kokkos_Qthreads.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+// Defines to enable experimental Qthreads functionality.
+//#define QTHREAD_LOCAL_PRIORITY
+//#define CLONED_TASKS
+
+//#include <qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1, n - 2, ..., 0 ]
+ */
+QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
+
+int  s_number_shepherds            = 0;
+int  s_number_workers_per_shepherd = 0;
+int  s_number_workers              = 0;
+
+inline
+QthreadsExec ** worker_exec()  // Address of the calling worker's slot within s_exec.
+{
+  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
+}
+
+const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
+
+int s_worker_reduce_end   = 0;  // End of worker reduction memory.
+int s_worker_shared_end   = 0;  // Total of worker scratch memory.
+int s_worker_shared_begin = 0;  // Beginning of worker shared memory.
+
+QthreadsExecFunctionPointer volatile s_active_function     = 0;
+const void                * volatile s_active_function_arg = 0;
+
+} // namespace
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+int Qthreads::is_initialized()
+{
+  return Impl::s_number_workers != 0;  // Nonzero only after a successful initialize(); finalize() resets it to 0.
+}
+
+int Qthreads::concurrency()
+{
+  return Impl::s_number_workers_per_shepherd;  // NOTE(review): per-shepherd count, not the total s_number_workers -- confirm intended.
+}
+
+int Qthreads::in_parallel()
+{
+  return Impl::s_active_function != 0;  // Set by QthreadsExec::exec_all() while it is dispatching, cleared when it returns.
+}
+
+void Qthreads::initialize( int thread_count )
+{
+  // Start the Qthreads runtime with 'thread_count' workers, then allocate per-worker scratch.
+  // Throws a runtime exception unless exactly that many workers come up, evenly split over shepherds.
+  // Related environment variables: QTHREAD_NUM_SHEPHERDS, QTHREAD_NUM_WORKERS_PER_SHEP, QTHREAD_HWPAR.
+
+  {
+    char value[32];
+    snprintf( value, sizeof(value), "%d", thread_count );
+    setenv( "QTHREAD_HWPAR", value, 1 ); // setenv copies; putenv() would retain a pointer to this stack buffer.
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
+
+    for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg;
+
+    msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {";
+      for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
+        msg << " " << qthread_num_workers_local( i );
+      }
+      msg << " }";
+    }
+
+    Impl::s_number_workers              = 0;
+    Impl::s_number_shepherds            = 0;
+    Impl::s_number_workers_per_shepherd = 0;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
+
+  // Initialize the lock array used for arbitrarily sized atomics.
+  Impl::init_lock_array_host_space();
+
+}
+
+void Qthreads::finalize()
+{
+  // Free all worker state, stop the Qthreads runtime if it was started, and reset the bookkeeping.
+  Impl::QthreadsExec::clear_workers();
+  if ( Impl::s_number_workers ) { qthread_finalize(); }
+  // Forget the scratch sizes too; otherwise a later initialize() sees them as already large enough,
+  // skips allocation in resize_worker_scratch(), and leaves every s_exec slot null.
+  Impl::s_worker_reduce_end = Impl::s_worker_shared_begin = Impl::s_worker_shared_end = 0;
+  Impl::s_number_workers              = 0;
+  Impl::s_number_shepherds            = 0;
+  Impl::s_number_workers_per_shepherd = 0;
+}
+
+void Qthreads::print_configuration( std::ostream & s, const bool detail )
+{
+  // One line with the shepherd count and the per-shepherd worker count; 'detail' is not consulted.
+  s << "Kokkos::Qthreads {" << " num_shepherds(" << Impl::s_number_shepherds << ")";
+  s << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")";
+  s << " }" << std::endl;
+}
+
+Qthreads & Qthreads::instance( int )  // The integer argument is ignored.
+{
+  static Qthreads q;  // Single function-local object returned to every caller.
+  return q;
+}
+
+void Qthreads::fence()
+{  // No-op: this backend's fence performs no work.
+}
+
+int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }  // Shepherd count recorded by initialize().
+int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }  // Workers per shepherd recorded by initialize().
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+namespace {
+
+aligned_t driver_exec_all( void * arg )  // Task body run on each worker by exec_all(); 'arg' is unused.
+{
+  QthreadsExec & exec = **worker_exec();  // The calling worker's own QthreadsExec.
+
+  (*s_active_function)( exec, s_active_function_arg );  // Function and argument were published by exec_all().
+
+/* Disabled debug trace:
+  fprintf( stdout
+         , "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , exec.worker_rank()
+         , exec.worker_size()
+         , exec.shepherd_rank()
+         , exec.shepherd_size()
+         , exec.shepherd_worker_rank()
+         , exec.shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  return 0;
+}
+
+aligned_t driver_resize_worker_scratch( void * arg )
+{
+  static volatile int lock_begin = 0;
+  static volatile int lock_end   = 0;
+  // Run once per worker: allocate this worker's QthreadsExec plus scratch and construct it in place.
+  QthreadsExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety.
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
+
+  const bool ok = 0 == *exec; // The slot must be empty; a second call for the same worker is an error.
+
+  if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
+
+  lock_begin = 0; // Release lock.
+
+  if ( ok && 0 != *exec ) { new( *exec ) QthreadsExec(); } // Never construct into a failed allocation.
+
+  //----------------------------------------
+  // Wait for all calls to complete to ensure that each worker has executed.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
+
+  while ( lock_end );
+
+/*
+  fprintf( stdout
+         , "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , (**exec).worker_rank()
+         , (**exec).worker_size()
+         , (**exec).shepherd_rank()
+         , (**exec).shepherd_size()
+         , (**exec).shepherd_worker_rank()
+         , (**exec).shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  //----------------------------------------
+  // Report both failure modes; a slot left null is also caught by resize_worker_scratch().
+  if ( ! ok || 0 == *exec ) {
+    fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
+    fflush( stderr );
+  }
+
+  return 0;
+}
+
+void verify_is_process( const char * const label, bool not_active = false )
+{
+  // Only shepherd 0 / local worker 0 passes; with 'not_active' set, no dispatch may be in flight either.
+  const bool on_main   = 0 == qthread_shep() && 0 == qthread_worker_local( NULL );
+  const bool in_flight = not_active && ( s_active_function || s_active_function_arg );
+  if ( on_main && ! in_flight ) { return; }
+
+  std::string msg( label );
+  msg.append( " : FAILED" );
+  if ( ! on_main ) { msg.append(" : not called by main process"); }
+  if ( in_flight ) { msg.append(" : parallel execution in progress"); }
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+} // namespace
+
+int QthreadsExec::worker_per_shepherd()
+{
+  return s_number_workers_per_shepherd;  // File-local count set by Qthreads::initialize(); 0 when not initialized.
+}
+
+QthreadsExec::QthreadsExec()
+{
+  const int shep  = qthread_shep();                 // Shepherd of the calling worker, per the Qthreads runtime.
+  const int local = qthread_worker_local( NULL );   // Calling worker's index within that shepherd.
+
+  m_shepherd_rank        = shep;
+  m_shepherd_size        = s_number_shepherds;
+  m_shepherd_worker_rank = local;
+  m_shepherd_worker_size = s_number_workers_per_shepherd;
+  m_worker_rank          = shep * s_number_workers_per_shepherd + local;
+  m_worker_size          = s_number_workers;
+  m_worker_state         = QthreadsExec::Active;
+  // Views into the reverse-rank worker table and the scratch bytes that trail this object.
+  m_worker_base          = s_exec;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( s_number_shepherds - ( shep + 1 ) );
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size;
+  m_reduce_end           = s_worker_reduce_end;
+}
+
+void QthreadsExec::clear_workers()
+{
+  for ( QthreadsExec ** slot = s_exec; slot < s_exec + s_number_workers; ++slot ) {
+    QthreadsExec * const stale = *slot;
+    *slot = 0;       // Empty the table entry first, then release the storage it pointed to.
+    free( stale );   // Raw free: ~QthreadsExec() is not run.
+  }
+}
+
+void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
+{
+  // Re-construct 'space' in place over the shared segment of the scratch owned by *m_shepherd_base[0].
+  unsigned char * const scratch = (unsigned char *) (**m_shepherd_base).m_scratch_alloc;
+  const int shared_bytes = s_worker_shared_end - s_worker_shared_begin;
+
+  new( & space ) Qthreads::scratch_memory_space( scratch + s_worker_shared_begin, shared_bytes );
+}
+
+void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
+{
+  const int exec_all_reduce_alloc = align_alloc( reduce_size );
+  const int shepherd_scan_alloc   = align_alloc( 8 );
+  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
+  // Scratch layout is [ reduce | scan | shared ]; reallocate every worker only when a region must grow.
+  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
+       s_worker_shared_end < shepherd_shared_end ) {
+
+/*
+  fprintf( stdout, "QthreadsExec::resize\n");
+  fflush(stdout);
+*/
+
+    // Clear current worker memory before allocating new worker memory.
+    clear_workers();
+
+    // Increase the buffers to an aligned allocation.
+    s_worker_reduce_end   = exec_all_reduce_alloc;
+    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
+    s_worker_shared_end   = shepherd_shared_end;
+
+    // Need to query which shepherd this main 'process' is running on.
+
+    const int main_shep = qthread_shep();
+
+    // Have each worker resize its memory for proper first-touch.
+#if 0
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
+        qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
+      }
+    }
+#else
+    // If this function is used before the 'qthreads.task_policy' unit test,
+    // the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
+    for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+      if ( num_clone ) {
+        const int ret = qthread_fork_clones_to_local_priority
+          ( driver_resize_worker_scratch   // Function
+          , NULL                           // Function data block
+          , NULL                           // Pointer to return value feb
+          , jshep                          // Shepherd number
+          , num_clone - 1                  // Number of instances - 1
+          );
+
+        assert( ret == QTHREAD_SUCCESS );
+      }
+    }
+#endif
+    // The main worker was left out of the clone count above; it runs the driver directly.
+    driver_resize_worker_scratch( NULL );
+
+    // Verify all workers allocated.
+
+    bool ok = true;
+    for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
+
+    if ( ! ok ) {
+      std::ostringstream msg;
+      msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
+      for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
+         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); } // Report rank, not reverse index.
+      }
+      msg << " }";
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+}
+
+void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
+{
+  verify_is_process("QthreadsExec::exec_all(...)",true);
+  // Run 'func( exec, arg )' once on every worker; throws above if not on the main worker or already dispatching.
+/*
+  fprintf( stdout, "QthreadsExec::exec_all\n");
+  fflush(stdout);
+*/
+  // Publish the function and argument that driver_exec_all() invokes on each worker.
+  s_active_function     = func;
+  s_active_function_arg = arg;
+
+  // Need to query which shepherd this main 'process' is running on.
+
+  const int main_shep = qthread_shep();
+
+#if 0
+  for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
+    for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
+      qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
+    }
+  }
+#else
+  // If this function is used before the 'qthreads.task_policy' unit test,
+  // the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
+  for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
+    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
+
+    if ( num_clone ) {
+      const int ret = qthread_fork_clones_to_local_priority
+        ( driver_exec_all   // Function
+        , NULL              // Function data block
+        , NULL              // Pointer to return value feb
+        , jshep             // Shepherd number
+        , num_clone - 1     // Number of instances - 1
+        );
+
+      assert(ret == QTHREAD_SUCCESS);
+    }
+  }
+#endif
+  // The main worker was left out of the clone count above; it runs the function directly.
+  driver_exec_all( NULL );
+  // NOTE(review): nothing here waits for the clones; presumably 'func' ends with an exec_all barrier -- confirm.
+  s_active_function     = 0;
+  s_active_function_arg = 0;
+}
+
+void * QthreadsExec::exec_all_reduce_result()
+{
+  return s_exec[0]->m_scratch_alloc;  // Slot 0 holds the highest-ranked worker (reverse-rank table); its scratch is returned.
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()  // Member of a one-thread team in a league of one.
+  : m_exec( **worker_exec() )  // Bind to the calling worker's QthreadsExec.
+  , m_team_shared( 0, 0 )
+  , m_team_size( 1 )
+  , m_team_rank( 0 )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );  // Re-point m_team_shared at the shepherd's shared scratch segment.
+}
+
+QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )  // Tag argument is unused.
+  : m_exec( **worker_exec() )  // Bind to the calling worker's QthreadsExec.
+  , m_team_shared( 0, 0 )
+  , m_team_size( s_number_workers_per_shepherd )  // The team spans every worker of a shepherd.
+  , m_team_rank( m_exec.shepherd_worker_rank() )
+  , m_league_size( 1 )
+  , m_league_end( 1 )
+  , m_league_rank( 0 )
+{
+  m_exec.shared_reset( m_team_shared );  // Re-point m_team_shared at the shepherd's shared scratch segment.
+}
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..64856eb99e014272fd92f638e2d7f312d3039120
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
@@ -0,0 +1,640 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADSEXEC_HPP
+#define KOKKOS_QTHREADSEXEC_HPP
+
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsExec;
+
+typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
+
+class QthreadsExec {
+private:
+  enum { Inactive = 0, Active = 1 };
+
+  const QthreadsExec * const * m_worker_base;
+  const QthreadsExec * const * m_shepherd_base;
+
+  void  * m_scratch_alloc;  ///< Scratch memory [ reduce, team, shared ]
+  int     m_reduce_end;     ///< End of scratch reduction memory
+
+  int     m_shepherd_rank;
+  int     m_shepherd_size;
+
+  int     m_shepherd_worker_rank;
+  int     m_shepherd_worker_size;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank;
+  int     m_worker_size;
+
+  int mutable volatile m_worker_state;
+
+  friend class Kokkos::Qthreads;
+
+  ~QthreadsExec();
+  QthreadsExec( const QthreadsExec & );
+  QthreadsExec & operator = ( const QthreadsExec & );
+
+public:
+  QthreadsExec();
+
+  /** Execute the input function on all available Qthreads workers. */
+  static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
+
+  /** Barrier across all workers participating in the 'exec_all'. */
+  void exec_all_barrier() const
+  {
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+    // rev_rank is this worker's index in m_worker_base; the worker with rev_rank 0 acts as the root.
+    int n, j;
+    // Fan-in: wait on the state of each partner at rev_rank + n (n = 1, 2, 4, ...) against Active.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+    // Every worker except the root marks itself Inactive and waits until its state is changed back.
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    // Fan-out: set the same partners back to Active.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  /** Barrier across the workers within the shepherd whose shepherd_worker_rank < team_size. */
+  void shepherd_barrier( const int team_size ) const
+  {
+    if ( m_shepherd_worker_rank < team_size ) {
+      // Workers with shepherd_worker_rank >= team_size skip the barrier entirely.
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n, j;
+      // Fan-in over partners at rev_rank + n (n = 1, 2, 4, ...), indexed through m_shepherd_base.
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+      }
+      // Every participant except rev_rank 0 marks itself Inactive and waits to be set Active again.
+      if ( rev_rank ) {
+        m_worker_state = QthreadsExec::Inactive;
+        Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+      }
+      // Fan-out: set the same partners back to Active.
+      for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+      }
+    }
+  }
+
+  /** Reduce across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ReducerType, class ArgTag >
+  inline
+  void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
+  {
+    typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
+    typedef typename ReducerConditional::type ReducerTypeFwd;
+    typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
+    // ReducerConditional chooses between 'func' and 'reduce' based on whether ReducerType is InvalidType.
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+    // Fan-in: after waiting on each partner, join the partner's scratch value into this worker's scratch.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      const QthreadsExec & fan = *m_worker_base[j];
+
+      Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
+
+      ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
+    }
+    // Every worker except rev_rank 0 marks itself Inactive and waits to be set Active again.
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    // Fan-out: set the same partners back to Active.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  /** Scan across all workers participating in the 'exec_all'. */
+  template< class FunctorType, class ArgTag >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+  {
+    typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
+    typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
+    typedef Kokkos::Impl::FunctorValueOps<  FunctorType, ArgTag > ValueOps;
+
+    const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+    int n, j;
+    // Fan-in: wait on each partner's state before the root touches any scratch value.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_worker_base[0] is the
+      // highest ranking thread.
+
+      // Shift: each worker's scratch receives the value of the next lower-ranked worker.
+      for ( int i = 1; i < m_worker_size; ++i ) {
+        ValueOps::copy( func
+                      , m_worker_base[i-1]->m_scratch_alloc
+                      , m_worker_base[i]->m_scratch_alloc
+                      );
+      }
+      // The lowest-ranked worker (last table entry) gets the functor's initial value.
+      ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+      // Join from lower ranking to higher ranking worker.
+      // The last entry was just initialized, so the join into m_worker_base[m_worker_size-2] is skipped.
+      for ( int i = m_worker_size - 1; --i > 0; ) {
+        ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
+      }
+    }
+    // Fan-out: set the same partners back to Active.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
+      m_worker_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+  }
+
+  //----------------------------------------
+
+  template< class Type >
+  inline  // Typed pointer into this worker's scratch at byte offset m_reduce_end.
+  volatile Type * shepherd_team_scratch_value() const
+  { return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
+
+  // Copy 'value' from the worker whose shepherd_worker_rank equals 'team_rank' to every caller in the team.
+  template< class Type >
+  inline void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
+  {
+    if ( m_shepherd_base ) {
+      volatile Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>(); // Accessor returns volatile Type *.
+      if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
+      memory_fence();
+      shepherd_barrier( team_size );  // All participants pass the barrier before reading the value back.
+      value = *shared_value;
+    }
+  }
+
+  template< class Type >
+  inline
+  Type shepherd_reduce( const int team_size, const Type & value ) const
+  {
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+    // Sum 'value' over the team with operator+= and return the total to every caller.
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) { // Bound by team_size: 'n' is the fan-in power of two and can exceed it.
+        accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class JoinOp >
+  inline
+  typename JoinOp::value_type
+  shepherd_reduce( const int team_size
+                 , const typename JoinOp::value_type & value
+                 , const JoinOp & op ) const
+  {
+    typedef typename JoinOp::value_type Type;
+
+    volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
+    *shared_value = value;
+    // Each caller has now published its contribution in its own scratch slot.
+
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+    // Fan-in: wait on each partner's state before the root reads the contributions.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
+      }
+      for ( int i = 1; i < team_size; ++i ) {
+        *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
+      }
+
+      memory_fence();
+    }
+    // Fan-out: set the same partners back to Active; each then returns the total from its own slot.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  template< class Type >
+  inline
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+  {
+    *shepherd_team_scratch_value<Type>() = value;
+    // Exclusive prefix sum of 'value'; offset by the prior *global_value when that pointer is given.
+    memory_fence();
+
+    const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+    int n, j;
+    // Fan-in: wait on each partner's state before the root rewrites the scratch slots.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
+    }
+
+    if ( rev_rank ) {
+      m_worker_state = QthreadsExec::Inactive;
+      Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
+    }
+    else {
+      // Root thread scans across values before releasing threads.
+      // Worker data is in reverse order, so m_shepherd_base[0] is the
+      // highest ranking thread.
+
+      // Shift each value into the next higher-ranked slot while accumulating the team total.
+
+      Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+      for ( int i = 1; i < team_size; ++i ) {
+        const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        accum += tmp;
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
+      }
+      // The lowest rank starts from the previous global value (which is advanced by the team total), or 0.
+      *m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
+        global_value ? atomic_fetch_add( global_value, accum ) : 0;
+
+      // Join from lower ranking to higher ranking worker.
+      for ( int i = team_size; --i; ) {
+        *m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+      }
+
+      memory_fence();
+    }
+    // Fan-out: set the same partners back to Active; each then returns the value left in its own slot.
+    for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
+      m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
+    }
+
+    return *shepherd_team_scratch_value<Type>();
+  }
+
+  //----------------------------------------
+
+  static inline
+  int align_alloc( int size )
+  {
+    // Round 'size' up to the next multiple of the 64-byte allocation grain (a power of two).
+    const int grain_mask = ( 1 << 6 ) - 1;
+    return ( size + grain_mask ) & ~grain_mask;
+  }
+
+  void shared_reset( Qthreads::scratch_memory_space & );
+
+  void * exec_all_reduce_value() const { return m_scratch_alloc; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size, const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank; }
+  inline int worker_size() const { return m_worker_size; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
+  inline int shepherd_rank() const { return m_shepherd_rank; }
+  inline int shepherd_size() const { return m_shepherd_size; }
+
+  static int worker_per_shepherd();
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+namespace Impl {
+
+class QthreadsTeamPolicyMember {  // member_type of the Qthreads TeamPolicyInternal; also walks a range of league ranks
+private:
+  typedef Kokkos::Qthreads                       execution_space;
+  typedef execution_space::scratch_memory_space  scratch_memory_space;
+
+  Impl::QthreadsExec   & m_exec;
+  scratch_memory_space   m_team_shared;
+  const int              m_team_size;
+  const int              m_team_rank;
+  const int              m_league_size;
+  const int              m_league_end;   // exclusive upper bound for m_league_rank (see operator bool)
+        int              m_league_rank;  // the only non-const counter; advanced by next_team()
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_team_shared; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const  // forwards to m_exec.shepherd_barrier( m_team_size )
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  {}  // empty body when KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is not defined
+#else
+  { m_exec.shepherd_barrier( m_team_size ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }  // value-initialized placeholder when the host macro is not defined (same pattern below)
+#else
+  { return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
+#endif
+
+  template< typename JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+  team_reduce( const typename JoinOp::value_type & value
+             , const JoinOp & op ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return typename JoinOp::value_type(); }
+#else
+  { return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with inter-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the league's
+   *  parallel execution, be the scan's total.  Parallel execution ordering of
+   *  the league's teams is non-deterministic.  As such the base value for each
+   *  team's scan operation is similarly non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  { return Type(); }
+#else
+  { return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
+#endif
+
+  //----------------------------------------
+  // Private driver for task-team parallel.
+
+  struct TaskTeam {};
+
+  QthreadsTeamPolicyMember();
+  explicit QthreadsTeamPolicyMember( const TaskTeam & );
+
+  //----------------------------------------
+  // Private for the driver:  for ( member_type i( exec, team ); i; i.next_team() ) { ... }
+
+  // Initialize: league ranks [m_league_rank, m_league_end), at most team.m_shepherd_iter of them.
+  template< class ... Properties >
+  QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
+                          , const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
+    : m_exec( exec )
+    , m_team_shared( 0, 0 )
+    , m_team_size( team.m_team_size )
+    , m_team_rank( exec.shepherd_worker_rank() )
+    , m_league_size( team.m_league_size )
+    , m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
+  {
+    m_exec.shared_reset( m_team_shared );
+  }
+
+  // Continue: true while m_league_rank has not reached m_league_end.
+  operator bool () const { return m_league_rank < m_league_end; }
+
+  // Iterate: step to the next league rank and call shared_reset() on m_team_shared.
+  void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
+};
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
+  : public PolicyTraits< Properties... >
+{
+private:
+  const int m_league_size;
+  const int m_team_size;      // requested size, capped at shepherd_worker_size() by the constructors
+  const int m_shepherd_iter;  // league_size / shepherd_size(), rounded up (see constructors)
+
+public:
+  //! Tag this class as a kokkos execution policy.
+  typedef TeamPolicyInternal              execution_policy;
+  typedef Qthreads                        execution_space;
+  typedef PolicyTraits< Properties ... >  traits;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )  // the functor argument is ignored
+  { return Qthreads::instance().shepherd_worker_size(); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & f )  // same value as team_size_max()
+  { return team_size_max( f ); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & f, const int& )  // the int argument is ignored
+  { return team_size_max( f ); }
+
+  //----------------------------------------
+
+  inline int team_size()   const { return m_team_size; }
+  inline int league_size() const { return m_league_size; }
+
+  // One active team per shepherd.  team_size is capped at q.shepherd_worker_size().
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.  AUTO selects q.shepherd_worker_size() as the team size.
+  TeamPolicyInternal( Kokkos::Qthreads & q
+                    , const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+  {}
+
+  // One active team per shepherd.  Same as above, but queries Qthreads::instance().
+  TeamPolicyInternal( const int league_size
+                    , const int team_size
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
+                 ? team_size : Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
+
+  // TODO: Make sure this is correct.
+  // One active team per shepherd.  AUTO variant that queries Qthreads::instance().
+  TeamPolicyInternal( const int league_size
+                    , const Kokkos::AUTO_t & /* team_size_request */
+                    , const int /* vector_length */ = 0
+                    )
+    : m_league_size( league_size )
+    , m_team_size( Qthreads::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
+  {}
+
+  // TODO: Doesn't do anything yet.  Fix this.
+  /** \brief set chunk_size to a discrete value (currently ignored: an unchanged copy is returned) */
+  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+    TeamPolicyInternal p = *this;
+//    p.m_chunk_size = chunk_size_;
+    return p;
+  }
+
+  typedef Impl::QthreadsTeamPolicyMember member_type;
+
+  friend class Impl::QthreadsTeamPolicyMember;
+};
+
+} // namespace Impl
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif // #define KOKKOS_QTHREADSEXEC_HPP
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
similarity index 86%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
index cb5b18094833a48905293175f6655f08f4596c8c..9f996075403f7cdd06fddfcb60d829dfab64bf0a 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp
@@ -41,8 +41,8 @@
 //@HEADER
 */
 
-#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
-#define KOKKOS_QTHREAD_PARALLEL_HPP
+#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
+#define KOKKOS_QTHREADS_PARALLEL_HPP
 
 #include <vector>
 
@@ -51,7 +51,7 @@
 #include <impl/Kokkos_StaticAssert.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
 
-#include <Qthread/Kokkos_QthreadExec.hpp>
+#include <Qthreads/Kokkos_QthreadsExec.hpp>
 
 //----------------------------------------------------------------------------
 
@@ -63,7 +63,7 @@ namespace Impl {
 template< class FunctorType , class ... Traits >
 class ParallelFor< FunctorType
                  , Kokkos::RangePolicy< Traits ... >
-                 , Kokkos::Qthread
+                 , Kokkos::Qthreads
                  >
 {
 private:
@@ -99,7 +99,7 @@ private:
     }
 
   // Function is called once by every concurrent thread.
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelFor & self = * ((const ParallelFor *) arg );
 
@@ -116,7 +116,7 @@ public:
   inline
   void execute() const
     {
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
 
     }
 
@@ -134,7 +134,7 @@ template< class FunctorType , class ReducerType , class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ... >
                     , ReducerType
-                    , Kokkos::Qthread
+                    , Kokkos::Qthreads
                     >
 {
 private:
@@ -186,7 +186,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelReduce & self = * ((const ParallelReduce *) arg );
 
@@ -205,10 +205,10 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
 
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
 
       Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
 
@@ -246,11 +246,11 @@ public:
 template< class FunctorType , class ... Properties >
 class ParallelFor< FunctorType
                  , TeamPolicy< Properties ... >
-                 , Kokkos::Qthread >
+                 , Kokkos::Qthreads >
 {
 private:
 
-  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
   typedef typename Policy::member_type  Member ;
   typedef typename Policy::work_tag     WorkTag ;
 
@@ -282,7 +282,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelFor & self = * ((const ParallelFor *) arg );
 
@@ -297,10 +297,10 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch
+      QthreadsExec::resize_worker_scratch
         ( /* reduction   memory */ 0
         , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
     }
 
   ParallelFor( const FunctorType & arg_functor ,
@@ -316,12 +316,12 @@ template< class FunctorType , class ReducerType , class ... Properties >
 class ParallelReduce< FunctorType
                     , TeamPolicy< Properties... >
                     , ReducerType
-                    , Kokkos::Qthread
+                    , Kokkos::Qthreads
                     >
 {
 private:
 
-  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
+  typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
 
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
@@ -365,7 +365,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelReduce & self = * ((const ParallelReduce *) arg );
 
@@ -383,13 +383,13 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch
+      QthreadsExec::resize_worker_scratch
         ( /* reduction   memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
         , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
 
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
 
-      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+      const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
 
       Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
 
@@ -429,7 +429,7 @@ public:
 template< class FunctorType , class ... Traits >
 class ParallelScan< FunctorType
                   , Kokkos::RangePolicy< Traits ... >
-                  , Kokkos::Qthread
+                  , Kokkos::Qthreads
                   >
 {
 private:
@@ -474,7 +474,7 @@ private:
       }
     }
 
-  static void exec( QthreadExec & exec , const void * arg )
+  static void exec( QthreadsExec & exec , const void * arg )
   {
     const ParallelScan & self = * ((const ParallelScan *) arg );
 
@@ -497,8 +497,8 @@ public:
   inline
   void execute() const
     {
-      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
-      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
+      QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
+      Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
     }
 
   ParallelScan( const FunctorType & arg_functor
@@ -521,37 +521,37 @@ namespace Kokkos {
 
 template< typename iType >
 KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >
-TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType& count )
+Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
 {
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, count );
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
 }
 
 template< typename iType1, typename iType2 >
 KOKKOS_INLINE_FUNCTION
 Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::QthreadTeamPolicyMember >
-TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
+                                       Impl::QthreadsTeamPolicyMember >
+TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
 {
   typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, iType(begin), iType(end) );
+  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
 }
 
 template<typename iType>
 KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
-  ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
+  ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
 }
 
 KOKKOS_INLINE_FUNCTION
-Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
-  return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
 }
 
 KOKKOS_INLINE_FUNCTION
-Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
-  return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
+  return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
 }
 
 /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
@@ -560,7 +560,7 @@ Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::Qt
  * This functionality requires C++11 support.*/
 template<typename iType, class Lambda>
 KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
     lambda(i);
 }
@@ -571,7 +571,7 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qthrea
  * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
                      const Lambda & lambda, ValueType& result) {
 
   result = ValueType();
@@ -595,7 +595,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
  * '1 for *'). This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
                      const Lambda & lambda, const JoinType& join, ValueType& init_result) {
 
   ValueType result = init_result;
@@ -615,7 +615,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
  * This functionality requires C++11 support.*/
 template<typename iType, class Lambda>
 KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
     loop_boundaries, const Lambda& lambda) {
   #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
   #pragma ivdep
@@ -630,7 +630,7 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qthr
  * val is performed and put into result. This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const Lambda & lambda, ValueType& result) {
   result = ValueType();
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
@@ -652,7 +652,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
  * '1 for *'). This functionality requires C++11 support.*/
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
-void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
 
   ValueType result = init_result;
@@ -679,7 +679,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
  * This functionality requires C++11 support.*/
 template< typename iType, class FunctorType >
 KOKKOS_INLINE_FUNCTION
-void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
       loop_boundaries, const FunctorType & lambda) {
 
   typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
@@ -697,25 +697,25 @@ void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qth
 
 template<class FunctorType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
   lambda();
 }
 
 template<class FunctorType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
   if(single_struct.team_member.team_rank()==0) lambda();
 }
 
 template<class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
   lambda(val);
 }
 
 template<class FunctorType, class ValueType>
 KOKKOS_INLINE_FUNCTION
-void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
   if(single_struct.team_member.team_rank()==0) {
     lambda(val);
   }
@@ -724,4 +724,4 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& singl
 
 } // namespace Kokkos
 
-#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
+#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..614a2c03f03e8c9cfbd15653295a254a350fb25a
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
@@ -0,0 +1,320 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
+
+#include <impl/Kokkos_TaskQueue_impl.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+TaskExec< Kokkos::Qthreads >::TaskExec()  // Default member: null exec pointers and a team of one.
+  : m_self_exec( 0 ),
+    m_team_exec( 0 ),
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( 0 ),
+    m_team_rank( 0 ),
+    m_team_size( 1 )  // team_barrier() skips its body when the team size is 1
+{}
+
+TaskExec< Kokkos::Qthreads >::
+TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )  // Team member built from a pool thread.
+  : m_self_exec( & arg_exec ),
+    m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),  // NOTE(review): indexed by group rank, not team_size * group_rank as the span comment below reads; confirm intended
+    m_sync_mask( 0 ),
+    m_sync_value( 0 ),
+    m_sync_step( 0 ),
+    m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
+    m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
+    m_team_size( arg_team_size )
+{
+  // This team spans
+  //    m_self_exec->pool_rev( team_size * group_rank )
+  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
+
+  int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();  // this thread's own scratch
+
+  sync[0] = int64_t(0) ;  // clear both sync words; team_barrier() alternates between them
+  sync[1] = int64_t(0) ;
+
+  for ( int i = 0 ; i < m_team_size ; ++i ) {  // one byte per team member
+    m_sync_value |= int64_t(1) << (8*i);  // expected arrival token: 0x01 in every member's byte
+    m_sync_mask  |= int64_t(3) << (8*i);  // low two bits of every member's byte
+  }
+
+  Kokkos::memory_fence();
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void TaskExec< Kokkos::Qthreads >::team_barrier() const  // Spin barrier across the m_team_size members of this team.
+{
+  if ( 1 < m_team_size ) {
+
+    if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
+      Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
+    }
+
+    // Use the scratch_reduce memory of m_team_exec to synchronize.
+    // Alternate between two int64_t words from one barrier to the next
+    // to avoid a sequence of barriers overtaking one another.
+
+    int64_t volatile * const sync =
+      ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
+
+    // This team member sets one byte within the sync variable
+    int8_t volatile * const sync_self =
+     ((int8_t *) sync) + m_team_rank ;
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    *sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
+
+    while ( m_sync_value != *sync ); // wait for team to arrive
+
+#if 0
+fprintf( stdout,
+         "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
+         m_group_rank,
+         m_team_rank,
+         m_sync_step,
+         m_sync_value,
+         *sync
+       );
+fflush(stdout);
+#endif
+
+    ++m_sync_step ;  // NOTE(review): written inside a const method, presumably declared mutable; confirm in the header
+
+    if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step, i.e. after both sync words were used
+      m_sync_value ^= m_sync_mask ;  // per byte 1 ^ 3 == 2 and 2 ^ 3 == 1: the arrival token alternates
+      if ( 1000 < m_sync_step ) m_sync_step = 0 ;  // bound the counter; reset lands on an even step, so parity is kept
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::execute  // Runs tasks from `queue` on teams of pool threads until a team lead finds m_ready_count at zero.
+  ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using PoolExec        = Kokkos::Impl::QthreadsExec ;
+  using Member          = TaskExec< execution_space > ;
+
+  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+  // Required:  team_size <= 8  (team_barrier() uses one byte per member inside an int64_t)
+
+  const int team_size = PoolExec::pool_size(2); // Threads per core
+  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA
+
+  if ( 8 < team_size ) {
+    Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
+  }
+
+#pragma omp parallel
+  {
+    PoolExec & self = *PoolExec::get_thread_omp();
+
+    Member single_exec ;  // default-constructed team of one, passed to single-thread tasks
+    Member team_exec( self, team_size );
+
+    // Team shared memory
+    task_root_type * volatile * const task_shared =
+      (task_root_type **) team_exec.m_team_exec->scratch_thread();
+
+// Barrier across entire Qthreads thread pool to ensure initialization
+#pragma omp barrier
+
+    // Loop until all queues are empty and no tasks in flight
+
+    do {
+
+      // Each team lead attempts to acquire either a thread team task
+      // or collection of single thread tasks for the team.
+
+      if ( 0 == team_exec.team_rank() ) {
+
+        task_root_type * tmp =  // 0 makes the team leave the loop; 'end' means keep looking
+          0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
+
+        // Loop by priority and then type
+        for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
+          for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
+            tmp = queue_type::pop_task( & queue->m_ready[i][j] );
+          }
+        }
+
+        *task_shared = tmp ;
+
+        // Fence to be sure task_shared is stored
+        Kokkos::memory_fence();
+      }
+
+      // Whole team waits for every team member to reach this statement
+      team_exec.team_barrier();
+
+      Kokkos::memory_fence();
+
+      task_root_type * const task = *task_shared ;
+
+#if 0
+fprintf( stdout,
+         "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
+         team_exec.m_group_rank,
+         team_exec.m_team_rank,
+         uintptr_t(task_shared),
+         uintptr_t(task)
+       );
+fflush(stdout);
+#endif
+
+      if ( 0 == task ) break ; // 0 == m_ready_count
+
+      if ( end == task ) {  // m_ready_count was nonzero but nothing was popped: barrier, then retry
+        team_exec.team_barrier();
+      }
+      else if ( task_root_type::TaskTeam == task->m_task_type ) {
+        // Thread Team Task
+        (*task->m_apply)( task, & team_exec );
+
+        // The m_apply function performs a barrier
+
+        if ( 0 == team_exec.team_rank() ) {
+          // team member #0 completes the task, which may delete the task
+          queue->complete( task );
+        }
+      }
+      else {
+        // Single Thread Task
+
+        if ( 0 == team_exec.team_rank() ) {
+
+          (*task->m_apply)( task, & single_exec );
+
+          queue->complete( task );
+        }
+
+        // All team members wait for whole team to reach this statement.
+        // Not necessary to complete the task.
+        // Is necessary to prevent task_shared from being updated
+        // before it is read by all threads.
+        team_exec.team_barrier();
+      }
+    } while(1);
+  }
+// END #pragma omp parallel
+
+}
+
+void TaskQueueSpecialization< Kokkos::Qthreads >::
+  iff_single_thread_recursive_execute  // Drains the ready queues on the calling thread, but only in the one-thread case.
+    ( TaskQueue< Kokkos::Qthreads > * const queue )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = TaskQueue< execution_space > ;
+  using task_root_type  = TaskBase< execution_space, void, void > ;
+  using Member          = TaskExec< execution_space > ;
+
+  if ( 1 == omp_get_num_threads() ) {  // the whole body is skipped unless exactly one thread is reported
+
+    task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
+
+    Member single_exec ;  // default-constructed team of one
+
+    task_root_type * task = end ;
+
+    do {
+
+      task = end ;  // restart the search on every pass
+
+      // Loop by priority and then type
+      for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
+        for ( int j = 0 ; j < 2 && end == task ; ++j ) {
+          task = queue_type::pop_task( & queue->m_ready[i][j] );
+        }
+      }
+
+      if ( end == task ) break ;  // every ready queue returned 'end': nothing left to run
+
+      (*task->m_apply)( task, & single_exec );
+
+      queue->complete( task );
+
+    } while(1);
+  }
+}
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
+
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..836452dde93767f172e47d2c19f74498e4dde246
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
+#define KOKKOS_IMPL_QTHREADS_TASK_HPP
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskQueueSpecialization< Kokkos::Qthreads >
+{
+public:
+  // Qthreads-specific hooks for the task queue: type aliases plus static entry points.
+  using execution_space = Kokkos::Qthreads ;
+  using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
+  using task_base_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+
+  // Must specify memory space; this backend uses HostSpace.
+  using memory_space = Kokkos::HostSpace ;
+  // Executes ready tasks on the calling thread, and only when a single thread is in use.
+  static
+  void iff_single_thread_recursive_execute( queue_type * const );
+
+  // Must provide task queue execution function
+  static void execute( queue_type * const );
+
+  // Must provide mechanism to set function pointer in
+  // execution space from the host process: stores the FunctorType task's apply function in *ptr.
+  template< typename FunctorType >
+  static
+  void proc_set_apply( task_base_type::function_type * ptr )
+    {
+      using TaskType = TaskBase< execution_space,
+                                 typename FunctorType::value_type,
+                                 FunctorType
+                               > ;
+       *ptr = TaskType::apply ;
+    }
+};
+
+extern template class TaskQueue< Kokkos::Qthreads > ;
+
+//----------------------------------------------------------------------------
+
+template<>
+class TaskExec< Kokkos::Qthreads >
+{
+private:
+  // Neither copyable nor movable.
+  TaskExec( TaskExec && ) = delete ;
+  TaskExec( TaskExec const & ) = delete ;
+  TaskExec & operator = ( TaskExec && ) = delete ;
+  TaskExec & operator = ( TaskExec const & ) = delete ;
+
+
+  using PoolExec = Kokkos::Impl::QthreadsExec ;
+  // Constructors are private, so only these friends can create an executor.
+  friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
+  friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
+
+  PoolExec * const m_self_exec ;  ///< This thread's thread pool data structure
+  PoolExec * const m_team_exec ;  ///< Team thread's thread pool data structure
+  int64_t          m_sync_mask ;
+  int64_t mutable  m_sync_value ;
+  int     mutable  m_sync_step ;
+  int              m_group_rank ; ///< Which "team" subset of thread pool
+  int              m_team_rank ;  ///< Which thread within a team
+  int              m_team_size ;
+
+  TaskExec();
+  TaskExec( PoolExec & arg_exec, int arg_team_size );
+
+public:
+  // Host builds query m_team_exec and declare a real barrier; other builds get no-op stubs.
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  void * team_shared() const
+    { return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
+  // Size reported by the team executor's scratch_thread_size(), or 0 without a team executor.
+  int team_shared_size() const
+    { return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
+
+  /**\brief  Whole team enters this function call
+   *         before any team member returns from
+   *         this function call.
+   */
+  void team_barrier() const ;
+#else
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+  KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const { return m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const { return m_team_size ; }
+};
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
similarity index 91%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
index 50444177ceaa46218f9757636d46c8a1a0b339bf..aa159cff6a5211d721a7b6beb31a5969851d080d 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.cpp.old
@@ -41,11 +41,11 @@
 //@HEADER
 */
 
-// Experimental unified task-data parallel manycore LDRD
+// Experimental unified task-data parallel manycore LDRD.
 
 #include <Kokkos_Core_fwd.hpp>
 
-#if defined( KOKKOS_ENABLE_QTHREAD )
+#if defined( KOKKOS_ENABLE_QTHREADS )
 
 #include <stdio.h>
 
@@ -56,17 +56,15 @@
 #include <string>
 
 #include <Kokkos_Atomic.hpp>
-#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-//----------------------------------------------------------------------------
-
 namespace Kokkos {
 namespace Experimental {
 namespace Impl {
 
-typedef TaskMember< Kokkos::Qthread , void , void > Task ;
+typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
 
 namespace {
 
@@ -173,16 +171,16 @@ Task::TaskMember( const function_dealloc_type  arg_dealloc
 
 void Task::throw_error_add_dependence() const
 {
-  std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
+  std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
             << " state(" << m_state << ")"
             << " dep_size(" << m_dep_size << ")"
             << std::endl ;
-  throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
+  throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
 }
 
 void Task::throw_error_verify_type()
 {
-  throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
+  throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
 }
 
 //----------------------------------------------------------------------------
@@ -190,7 +188,7 @@ void Task::throw_error_verify_type()
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
 void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
 {
-  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
+  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
   static const char msg_error_count[]       = ": negative reference count" ;
   static const char msg_error_complete[]    = ": destroy task that is not complete" ;
   static const char msg_error_dependences[] = ": destroy task that has dependences" ;
@@ -294,7 +292,7 @@ fflush(stdout);
       assign( & m_dep[i] , 0 );
     }
 
-    // Set qthread FEB to full so that dependent tasks are allowed to execute.
+    // Set Qthreads FEB to full so that dependent tasks are allowed to execute.
     // This 'task' may be deleted immediately following this function call.
     qthread_fill( & m_qfeb );
 
@@ -319,10 +317,10 @@ aligned_t Task::qthread_func( void * arg )
                                         );
 
   if ( task->m_apply_team && ! task->m_apply_single ) {
-    Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
+    Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
 
     // Initialize team size and rank with shephered info
-    Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
+    Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
 
     (*task->m_apply_team)( task , member );
 
@@ -344,7 +342,7 @@ fflush(stdout);
   }
   else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
     // Team hard-wired to one, no cloning
-    Kokkos::Impl::QthreadTeamPolicyMember member ;
+    Kokkos::Impl::QthreadsTeamPolicyMember member ;
     (*task->m_apply_team)( task , member );
     task->closeout();
   }
@@ -384,8 +382,8 @@ void Task::schedule()
   // Increment active task count before spawning.
   Kokkos::atomic_increment( m_active_count );
 
-  // spawn in qthread.  must malloc the precondition array and give to qthread.
-  // qthread will eventually free this allocation so memory will not be leaked.
+  // spawn in Qthreads.  must malloc the precondition array and give to Qthreads.
+  // Qthreads will eventually free this allocation so memory will not be leaked.
 
   // concern with thread safety of malloc, does this need to be guarded?
   aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
@@ -393,7 +391,7 @@ void Task::schedule()
   qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
 
   for ( int i = 0 ; i < m_dep_size ; ++i ) {
-    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
+    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
   }
 
   if ( m_apply_team && ! m_apply_single ) {
@@ -446,7 +444,7 @@ fflush(stdout);
 namespace Kokkos {
 namespace Experimental {
 
-TaskPolicy< Kokkos::Qthread >::
+TaskPolicy< Kokkos::Qthreads >::
 TaskPolicy
   ( const unsigned /* arg_task_max_count */
   , const unsigned /* arg_task_max_size */
@@ -462,7 +460,7 @@ TaskPolicy
 
   if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
     std::ostringstream msg ;
-    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
         << "default_depedence = " << arg_task_default_dependence_capacity
         << " , team_size = " << arg_task_team_size
         << " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
@@ -470,14 +468,14 @@ TaskPolicy
   }
 }
 
-TaskPolicy< Kokkos::Qthread >::member_type &
-TaskPolicy< Kokkos::Qthread >::member_single()
+TaskPolicy< Kokkos::Qthreads >::member_type &
+TaskPolicy< Kokkos::Qthreads >::member_single()
 {
   static member_type s ;
   return s ;
 }
 
-void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
 {
   volatile int * const active_task_count = & policy.m_active_count ;
   while ( *active_task_count ) qthread_yield();
@@ -486,6 +484,5 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
 } // namespace Experimental
 } // namespace Kokkos
 
-#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
-
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
similarity index 90%
rename from lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
rename to lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
index 565dbf7e61716717bdbac0e1b3adf007493cf27d..1e5a4dc593cc6de9fff9d2a762b4f864c6c12e9c 100644
--- a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskPolicy.hpp.old
@@ -43,15 +43,15 @@
 
 // Experimental unified task-data parallel manycore LDRD
 
-#ifndef KOKKOS_QTHREAD_TASKSCHEDULER_HPP
-#define KOKKOS_QTHREAD_TASKSCHEDULER_HPP
+#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
+#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
 
 #include <string>
 #include <typeinfo>
 #include <stdexcept>
 
 //----------------------------------------------------------------------------
-// Defines to enable experimental Qthread functionality
+// Defines to enable experimental Qthreads functionality
 
 #define QTHREAD_LOCAL_PRIORITY
 #define CLONED_TASKS
@@ -63,7 +63,7 @@
 
 //----------------------------------------------------------------------------
 
-#include <Kokkos_Qthread.hpp>
+#include <Kokkos_Qthreads.hpp>
 #include <Kokkos_TaskScheduler.hpp>
 #include <Kokkos_View.hpp>
 
@@ -78,13 +78,13 @@ namespace Experimental {
 namespace Impl {
 
 template<>
-class TaskMember< Kokkos::Qthread , void , void >
+class TaskMember< Kokkos::Qthreads , void , void >
 {
 public:
 
   typedef TaskMember * (* function_verify_type) ( TaskMember * );
   typedef void         (* function_single_type) ( TaskMember * );
-  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
   typedef void         (* function_dealloc_type)( TaskMember * );
 
 private:
@@ -94,7 +94,7 @@ private:
   const function_single_type   m_apply_single ;  ///< Apply function
   const function_team_type     m_apply_team ;    ///< Apply function
   int volatile * const         m_active_count ;  ///< Count of active tasks on this policy
-  aligned_t                    m_qfeb ;          ///< Qthread full/empty bit
+  aligned_t                    m_qfeb ;          ///< Qthreads full/empty bit
   TaskMember ** const          m_dep ;           ///< Dependences
   const int                    m_dep_capacity ;  ///< Capacity of dependences
   int                          m_dep_size ;      ///< Actual count of dependences
@@ -129,7 +129,7 @@ protected :
 
   ~TaskMember();
 
-  // Used by TaskMember< Qthread , ResultType , void >
+  // Used by TaskMember< Qthreads , ResultType , void >
   TaskMember( const function_verify_type   arg_verify
             , const function_dealloc_type  arg_dealloc
             , const function_single_type   arg_apply_single
@@ -139,7 +139,7 @@ protected :
             , const unsigned               arg_dependence_capacity
             );
 
-  // Used for TaskMember< Qthread , void , void >
+  // Used for TaskMember< Qthreads , void , void >
   TaskMember( const function_dealloc_type  arg_dealloc
             , const function_single_type   arg_apply_single
             , const function_team_type     arg_apply_team
@@ -175,15 +175,15 @@ public:
   /*  Inheritence Requirements on task types:
    *    typedef  FunctorType::value_type  value_type ;
    *    class DerivedTaskType
-   *      : public TaskMember< Qthread , value_type , FunctorType >
+   *      : public TaskMember< Qthreads , value_type , FunctorType >
    *      { ... };
-   *    class TaskMember< Qthread , value_type , FunctorType >
-   *      : public TaskMember< Qthread , value_type , void >
+   *    class TaskMember< Qthreads , value_type , FunctorType >
+   *      : public TaskMember< Qthreads , value_type , void >
    *      , public Functor
    *      { ... };
    *  If value_type != void
-   *    class TaskMember< Qthread , value_type , void >
-   *      : public TaskMember< Qthread , void , void >
+   *    class TaskMember< Qthreads , value_type , void >
+   *      : public TaskMember< Qthreads , void , void >
    *
    *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
    *
@@ -300,10 +300,10 @@ public:
   KOKKOS_INLINE_FUNCTION static
   void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
-      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
       //   , public FunctorType
       //   { ... };
 
@@ -316,10 +316,10 @@ public:
   KOKKOS_INLINE_FUNCTION static
   void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
-      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      // TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthreads , ResultType , void >
       //   , public FunctorType
       //   { ... };
 
@@ -333,9 +333,9 @@ public:
   template< class FunctorType , class ResultType >
   KOKKOS_INLINE_FUNCTION static
   void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
-                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
       derived_type & m = * static_cast< derived_type * >( t );
 
@@ -345,9 +345,9 @@ public:
   template< class FunctorType , class ResultType >
   KOKKOS_INLINE_FUNCTION static
   void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
-                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+                 , Kokkos::Impl::QthreadsTeamPolicyMember & member )
     {
-      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+      typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
 
       derived_type & m = * static_cast< derived_type * >( t );
 
@@ -356,7 +356,7 @@ public:
 };
 
 //----------------------------------------------------------------------------
-/** \brief  Base class for tasks with a result value in the Qthread execution space.
+/** \brief  Base class for tasks with a result value in the Qthreads execution space.
  *
  *  The FunctorType must be void because this class is accessed by the
  *  Future class for the task and result value.
@@ -365,8 +365,8 @@ public:
  *  can correctly static_cast from the 'root class' to this class.
  */
 template < class ResultType >
-class TaskMember< Kokkos::Qthread , ResultType , void >
-  : public TaskMember< Kokkos::Qthread , void , void >
+class TaskMember< Kokkos::Qthreads , ResultType , void >
+  : public TaskMember< Kokkos::Qthreads , void , void >
 {
 public:
 
@@ -379,7 +379,7 @@ public:
 
 protected:
 
-  typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
+  typedef TaskMember< Kokkos::Qthreads , void , void >  task_root_type ;
   typedef task_root_type::function_dealloc_type        function_dealloc_type ;
   typedef task_root_type::function_single_type         function_single_type ;
   typedef task_root_type::function_team_type           function_team_type ;
@@ -404,16 +404,16 @@ protected:
 };
 
 template< class ResultType , class FunctorType >
-class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
-  : public TaskMember< Kokkos::Qthread , ResultType , void >
+class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthreads , ResultType , void >
   , public FunctorType
 {
 public:
 
   typedef FunctorType  functor_type ;
 
-  typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
-  typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
+  typedef TaskMember< Kokkos::Qthreads , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Qthreads , ResultType , void >  task_base_type ;
   typedef task_root_type::function_dealloc_type              function_dealloc_type ;
   typedef task_root_type::function_single_type               function_single_type ;
   typedef task_root_type::function_team_type                 function_team_type ;
@@ -447,16 +447,16 @@ public:
 namespace Kokkos {
 namespace Experimental {
 
-void wait( TaskPolicy< Kokkos::Qthread > & );
+void wait( TaskPolicy< Kokkos::Qthreads > & );
 
 template<>
-class TaskPolicy< Kokkos::Qthread >
+class TaskPolicy< Kokkos::Qthreads >
 {
 public:
 
-  typedef Kokkos::Qthread                        execution_space ;
+  typedef Kokkos::Qthreads                        execution_space ;
   typedef TaskPolicy                             execution_policy ;
-  typedef Kokkos::Impl::QthreadTeamPolicyMember  member_type ;
+  typedef Kokkos::Impl::QthreadsTeamPolicyMember  member_type ;
 
 private:
 
@@ -650,7 +650,7 @@ public:
 
   static member_type & member_single();
 
-  friend void wait( TaskPolicy< Kokkos::Qthread > & );
+  friend void wait( TaskPolicy< Kokkos::Qthreads > & );
 };
 
 } /* namespace Experimental */
@@ -660,5 +660,5 @@ public:
 //----------------------------------------------------------------------------
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
+#endif /* #ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP */
 
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..55235cd6d27a9df0e40bd28dff8caa13df94073e
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
@@ -0,0 +1,319 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Manage task allocation, deallocation, and scheduling.
+ *  NOTE(review): this specializes on Kokkos::Qthread, while TaskQueueSpecialization and TaskExec use Kokkos::Qthreads - confirm.
+ *  Task execution is handled here directly for the Qthread implementation.
+ */
+template<>
+class TaskQueue< Kokkos::Qthread > {
+private:
+  // Backend type aliases used throughout the queue.
+  using execution_space = Kokkos::Qthread ;
+  using memory_space    = Kokkos::HostSpace ;
+  using device_type     = Kokkos::Device< execution_space, memory_space > ;
+  using memory_pool     = Kokkos::Experimental::MemoryPool< device_type > ;
+  using task_root_type  = Kokkos::Impl::TaskBase< execution_space, void, void > ;
+  using specialization  = TaskQueueSpecialization< execution_space > ; // needed by the forwarding members below
+  friend class Kokkos::TaskScheduler< execution_space > ;
+
+  struct Destroy {
+    TaskQueue * m_queue ;
+    void destroy_shared_allocation();
+  };
+
+  //----------------------------------------
+
+  enum : int { TASK_STATE_NULL         =  0,  ///<  Does not exist
+               TASK_STATE_CONSTRUCTING =  1,  ///<  Is under construction
+               TASK_STATE_WAITING      =  2,  ///<  Is waiting for execution
+               TASK_STATE_EXECUTING    =  4,  ///<  Is executing
+               TASK_STATE_RESPAWN      =  8,  ///<  Requested respawn
+               TASK_STATE_COMPLETE     = 16   ///<  Execution is complete
+             };
+
+  // Queue is organized as [ priority ][ type ]
+  // NOTE(review): no [priority][type] queue array (nor NumQueue / pop_task) is declared in this class - confirm.
+  memory_pool  m_memory ;
+  unsigned     m_team_size ;   // Number of threads in a team
+  long         m_accum_alloc ; // Accumulated number of allocations
+  int          m_count_alloc ; // Current number of allocations
+  int          m_max_alloc ;   // Maximum number of allocations
+  int          m_ready_count ; // Number of ready or executing
+
+  //----------------------------------------
+
+  ~TaskQueue();
+  TaskQueue() = delete ;
+  TaskQueue( TaskQueue && ) = delete ;
+  TaskQueue( TaskQueue const & ) = delete ;
+  TaskQueue & operator = ( TaskQueue && ) = delete ;
+  TaskQueue & operator = ( TaskQueue const & ) = delete ;
+
+  TaskQueue
+    ( const memory_space & arg_space,
+      unsigned const arg_memory_pool_capacity,
+      unsigned const arg_memory_pool_superblock_capacity_log2
+    );
+
+  // Schedule a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next is the dependence or zero
+  //   Postcondition:
+  //     task->m_next is linked list membership
+  KOKKOS_FUNCTION
+  void schedule( task_root_type * const );
+
+  // Reschedule a task
+  //   Precondition:
+  //     task is in Executing state
+  //     task->m_next == LockTag
+  //   Postcondition:
+  //     task is in Executing-Respawn state
+  //     task->m_next == 0 (no dependence)
+  KOKKOS_FUNCTION
+  void reschedule( task_root_type * );
+
+  // Complete a task
+  //   Precondition:
+  //     task is not executing
+  //     task->m_next == LockTag  =>  task is complete
+  //     task->m_next != LockTag  =>  task is respawn
+  //   Postcondition:
+  //     task->m_wait == LockTag  =>  task is complete
+  //     task->m_wait != LockTag  =>  task is waiting
+  KOKKOS_FUNCTION
+  void complete( task_root_type * );
+
+public:
+
+  // If and only if the execution space is a single thread
+  // then execute ready tasks.
+  KOKKOS_INLINE_FUNCTION
+  void iff_single_thread_recursive_execute()
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      specialization::iff_single_thread_recursive_execute( this );
+#endif
+    }
+  // Forwards to the backend specialization's execute().
+  void execute() { specialization::execute( this ); }
+  // Forwards to specialization::proc_set_apply< FunctorType >.
+  template< typename FunctorType >
+  void proc_set_apply( typename task_root_type::function_type * ptr )
+    {
+      specialization::template proc_set_apply< FunctorType >( ptr );
+    }
+
+  // Assign task pointer with reference counting of assigned tasks
+  template< typename LV, typename RV >
+  KOKKOS_FUNCTION static
+  void assign( TaskBase< execution_space, LV, void > ** const lhs,
+               TaskBase< execution_space, RV, void > *  const rhs )
+    {
+      using task_lhs = TaskBase< execution_space, LV, void > ;
+#if 0
+  {
+    printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
+            uintptr_t( lhs ? *lhs : 0 ),
+            uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
+            int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
+            uintptr_t(rhs),
+            uintptr_t( rhs ? rhs->m_next : 0 ),
+            int( rhs ? rhs->m_task_type : 0 ),
+            int( rhs ? rhs->m_ref_count : 0 )
+          );
+    fflush( stdout );
+  }
+#endif
+      // Drop the reference held through *lhs; 'count' is the value before the decrement.
+      if ( *lhs )
+      {
+        const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
+
+        if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
+          // Reference count is zero and task is complete, deallocate.
+          (*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
+        }
+        else if ( count <= 1 ) {
+          Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
+        }
+
+        // GEM: Should I check that there are no dependences here?  Can the state
+        //      be set to complete while there are still dependences?
+      }
+      // Take a reference on the incoming task before publishing it through *lhs.
+      if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
+
+      // Force write of *lhs
+
+      *static_cast< task_lhs * volatile * >(lhs) = rhs ;
+
+      Kokkos::memory_fence();
+    }
+
+  KOKKOS_FUNCTION
+  size_t allocate_block_size( size_t n ); ///< Actual block size allocated
+
+  KOKKOS_FUNCTION
+  void * allocate( size_t n ); ///< Allocate from the memory pool
+
+  KOKKOS_FUNCTION
+  void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+class TaskBase< Kokkos::Qthread, void, void >
+{
+public:
+  // Task-type and sentinel constants taken from TaskBase< void, void, void >.
+  enum : int16_t   { TaskTeam   = TaskBase< void, void, void >::TaskTeam,
+                     TaskSingle = TaskBase< void, void, void >::TaskSingle,
+                     Aggregate  = TaskBase< void, void, void >::Aggregate };
+
+  enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
+                     EndTag  = TaskBase< void, void, void >::EndTag };
+
+  using execution_space = Kokkos::Qthread ;
+  using queue_type      = TaskQueue< execution_space > ;
+
+  template< typename > friend class Kokkos::TaskScheduler ;
+
+  typedef void (* function_type) ( TaskBase *, void * );
+
+  // NOTE(review): this comment used to claim sizeof(TaskBase) == 48; the members below look larger than that - confirm.
+
+  function_type  m_apply ;       ///< Apply function pointer
+  queue_type   * m_queue ;       ///< Queue in which this task resides
+  TaskBase     * m_dep ;         ///< Dependence
+  int32_t        m_ref_count ;   ///< Reference count
+  int32_t        m_alloc_size ;  ///< Allocation size
+  int32_t        m_dep_count ;   ///< Aggregate's number of dependences
+  int16_t        m_task_type ;   ///< Type of task
+  int16_t        m_priority ;    ///< Priority of runnable task
+  aligned_t      m_qfeb ;        ///< Qthread full/empty bit
+  int            m_state ;       ///< State of the task
+  // Neither copyable nor movable.
+  TaskBase( TaskBase && ) = delete ;
+  TaskBase( const TaskBase & ) = delete ;
+  TaskBase & operator = ( TaskBase && ) = delete ;
+  TaskBase & operator = ( const TaskBase & ) = delete ;
+
+  KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
+  // Not constexpr: the constructor body makes a run-time qthread_empty() call.
+  KOKKOS_INLINE_FUNCTION
+  TaskBase() noexcept
+    : m_apply(0),
+      m_queue(0),
+      m_dep(0),
+      m_ref_count(0),
+      m_alloc_size(0),
+      m_dep_count(0),
+      m_task_type( TaskSingle ),
+      m_priority( 1 /* TaskRegularPriority */ ),
+      m_qfeb(0),
+      m_state( queue_type::TASK_STATE_CONSTRUCTING )
+    {
+      qthread_empty( & m_qfeb ); // Set to full when complete
+    }
+
+  //----------------------------------------
+
+  static aligned_t qthread_func( void * arg );
+  // Address just past this object, viewed as an array of TaskBase pointers.
+  KOKKOS_INLINE_FUNCTION
+  TaskBase ** aggregate_dependences()
+    { return reinterpret_cast<TaskBase**>( this + 1 ); }
+  // True when m_state equals TASK_STATE_RESPAWN (previously declared void while returning this value).
+  KOKKOS_INLINE_FUNCTION
+  bool requested_respawn()
+    { return m_state == queue_type::TASK_STATE_RESPAWN; }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskBase* dep )
+    {
+      // Assign dependence to m_dep.  It will be processed in the subsequent
+      // call to schedule.  Error if the dependence is reset.
+      if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
+        Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
+      }
+
+      if ( 0 != dep ) {
+        // The future may be destroyed upon returning from this call
+        // so increment reference count to track this assignment.
+        Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
+      }
+    }
+
+  using get_return_type = void ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_return_type get() const {}
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4a9190c731c6034724b63094c55967de78caab64
--- /dev/null
+++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
@@ -0,0 +1,436 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ENABLE_TASKPOLICY )
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >  // Explicitly runs the destructor of the queue that m_queue points to.
+void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
+{
+  ( *m_queue ).~TaskQueue();
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >  // Builds the memory pool, records the team size, zeroes the counters.
+TaskQueue< ExecSpace >::TaskQueue
+  ( const TaskQueue< ExecSpace >::memory_space & arg_space,
+    unsigned const arg_memory_pool_capacity,
+    unsigned const arg_memory_pool_superblock_capacity_log2 )
+  : m_memory( arg_space,
+              arg_memory_pool_capacity,
+              arg_memory_pool_superblock_capacity_log2 ),
+    m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),  // value Qthreads reports at construction
+    m_accum_alloc(0),
+    m_count_alloc(0),
+    m_max_alloc(0),
+    m_ready_count(0)
+{}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+TaskQueue< ExecSpace >::~TaskQueue()
+{
+  // Destroying the queue while m_ready_count is nonzero is a fatal error.
+  const bool queue_is_idle = ( m_ready_count == 0 );
+
+  if ( ! queue_is_idle ) Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
+{ // Forwards the query to the memory pool and returns its answer.
+  const size_t block_size = m_memory.allocate_block_size( n ); return block_size ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void * TaskQueue< ExecSpace >::allocate( size_t n )
+{
+  // Ask the memory pool for a block of size n.  A null result is handed
+  // straight back to the caller and leaves the statistics untouched.
+  void * const block = m_memory.allocate(n);
+  if ( ! block ) return block ;
+
+  Kokkos::atomic_increment( & m_accum_alloc );  // counts every successful allocation
+  Kokkos::atomic_increment( & m_count_alloc );  // decremented again by deallocate()
+  if ( m_max_alloc < m_count_alloc ) { m_max_alloc = m_count_alloc ; }  // largest count seen; plain (non-atomic) update
+
+  return block ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
+{
+  this->m_memory.deallocate( p, n );                    // give the block back to the pool first
+  Kokkos::atomic_decrement( & ( this->m_count_alloc ) ); // then drop the outstanding-allocation count
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+#if 0
+  printf( "schedule( 0x%lx { %d %d %d }\n",
+          uintptr_t(task),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count );
+#endif
+  // Mark 'task' as waiting and hand it to Qthreads, either immediately or
+  // gated on the FEB word of the single runnable task it depends on.
+  task->m_state = TASK_STATE_WAITING ;
+
+  if ( task->m_task_type != task_root_type::Aggregate ) {
+    // Scheduling a single or team task.
+
+    // Increment active task count before spawning; complete() decrements it.
+    Kokkos::atomic_increment( & m_ready_count );
+
+    if ( task->m_dep == 0 ) {
+      // Schedule a task with no dependences.
+
+      if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
+        // If more than one shepherd spawn on a shepherd other than this shepherd
+        const int num_shepherd  = qthread_num_shepherds();
+        const int this_shepherd = qthread_shep();
+        int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+        fprintf( stdout,
+                 "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+                 qthread_shep(),
+                 qthread_worker_local(NULL),
+                 reinterpret_cast<unsigned long>(this),
+                 spawn_shepherd,
+                 m_team_size - 1
+               );
+        fflush(stdout);
+#endif
+
+        qthread_spawn_cloneable(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          spawn_shepherd,
+          unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+          m_team_size - 1
+        );
+      }
+      else {
+        qthread_spawn(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          0, // no dependences
+          0, // dependences array
+          NO_SHEPHERD,
+          QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+        );
+      }
+    }
+    else if ( task->m_dep->m_task_type != task_root_type::Aggregate ) {
+      // Schedule a task that waits on one runnable (non-aggregate) task.
+      // Qthreads releases the precondition array with free(), so it must
+      // come from malloc and not from the memory pool.  Slot 0 holds the
+      // precondition count, slot 1 the dependence's FEB word (set to full
+      // when that task completes).
+      // NOTE(review): confirm this array layout against the Qthreads in use.
+      aligned_t ** qprecon = (aligned_t **) malloc( 2 * sizeof(aligned_t *) );
+      if ( 0 == qprecon ) Kokkos::abort("TaskQueue::schedule ERROR: malloc failed");
+      qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(1) );
+      qprecon[1] = & task->m_dep->m_qfeb ;
+      if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
+        // If more than one shepherd spawn on a shepherd other than this shepherd
+        const int num_shepherd  = qthread_num_shepherds();
+        const int this_shepherd = qthread_shep();
+        int spawn_shepherd      = ( this_shepherd + 1 ) % num_shepherd ;
+
+#if 0
+  fprintf( stdout,
+           "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
+           qthread_shep(),
+           qthread_worker_local(NULL),
+           reinterpret_cast<unsigned long>(this),
+           spawn_shepherd,
+           m_team_size - 1
+         );
+  fflush(stdout);
+#endif
+
+        qthread_spawn_cloneable(
+          & task_root_type::qthread_func,
+          task,
+          0,
+          NULL,
+          1,       /* one precondition */
+          qprecon, /* dependences */
+          spawn_shepherd,
+          unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
+          m_team_size - 1
+        );
+      }
+      else {
+        qthread_spawn(
+          & task_root_type::qthread_func, /* function */
+          task,                           /* function argument */
+          0,
+          NULL,
+          1,       /* one precondition */
+          qprecon, /* dependences */
+          NO_SHEPHERD,
+          QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+        );
+      }
+    } // NOTE(review): a dependence that is itself an Aggregate is counted in m_ready_count but never spawned.
+  } else {
+    // GEM: How do I handle an aggregate (when_all) task?
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
+{
+  // On entry the executing task is expected to hold LockTag in m_next.
+  // Swapping m_next to null (no dependence) records the respawn request;
+  // complete() compares m_next against LockTag to detect it.
+  //
+  // Finding any previous value other than LockTag means the task was
+  // already respawned, which is treated as a fatal error.
+  // This function only touches task->m_next.
+
+  task_root_type * const no_dependence = (task_root_type *) 0 ;
+  task_root_type * const lock_tag      = (task_root_type *) task_root_type::LockTag ;
+
+  task_root_type * const previous = Kokkos::atomic_exchange( & task->m_next, no_dependence );
+
+  if ( previous != lock_tag ) Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::complete
+  ( TaskQueue< ExecSpace >::task_root_type * task )
+{
+  // Complete a runnable task that has finished executing
+  // or a when_all task when all of its dependences are complete.
+
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
+
+#if 0
+  printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
+          uintptr_t(task),
+          uintptr_t(task->m_wait),
+          uintptr_t(task->m_next),
+          task->m_task_type,
+          task->m_priority,
+          task->m_ref_count
+        );
+  fflush( stdout );
+#endif
+
+  const bool runnable = task_root_type::Aggregate != task->m_task_type ;
+
+  //----------------------------------------
+
+  if ( runnable && lock != task->m_next ) {
+    // A runnable task has finished executing and requested respawn
+    // (reschedule() replaced LockTag in m_next).  Schedule it again.
+
+    schedule( task );
+  }
+  //----------------------------------------
+  else {
+    // Is either an aggregate or a runnable task that executed
+    // and did not respawn.  Transition this task to complete.
+
+    // If 'task' is an aggregate then any of the runnable tasks that
+    // it depends upon may be attempting to complete this 'task'.
+    // Must only transition a task once to complete status.
+    // This is controlled by atomically locking the wait queue.
+
+    // Stop other tasks from adding themselves to this task's wait queue
+    // by locking the head of this task's wait queue.
+
+    task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
+
+    if ( x != (task_root_type *) lock ) {
+
+      // This thread has transitioned this 'task' to complete.
+      // 'task' is no longer in a queue and is not executing
+      // so decrement the reference count from 'task's creation.
+      // If no other references to this 'task' then it will be deleted.
+
+      TaskQueue::assign( & task, zero );
+
+      // This thread has exclusive access to the wait list so
+      // the concurrency-safe pop_task function is not needed.
+      // Schedule the tasks that have been waiting on the input 'task',
+      // which may have been deleted.
+
+      while ( x != end ) {
+
+        // Set x->m_next = zero  <=  no dependence
+
+        task_root_type * const next =
+          (task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
+
+        schedule( x );
+
+        x = next ;
+      }
+    }
+  }
+
+  if ( runnable ) {
+    // schedule() incremented m_ready_count for this runnable task, and
+    // incremented it again above if the task respawned, so decrement
+    // once here whether respawned or not.
+    Kokkos::atomic_decrement( & m_ready_count );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template<>
+aligned_t
+TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
+{
+  using execution_space = Kokkos::Qthreads ;
+  using task_root_type  = TaskBase< execution_space , void , void > ;
+  using Member          = Kokkos::Impl::QthreadsTeamPolicyMember;
+
+  task_root_type * const task = reinterpret_cast< task_root_type * >( arg ); // 'arg' is the task pointer given to the spawn call
+
+  // The first team member to arrive changes the state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state,
+                                          queue_type::TASK_STATE_WAITING,
+                                          queue_type::TASK_STATE_EXECUTING
+                                        );
+
+  if ( task_root_type::TaskTeam == task->m_task_type )
+  {
+    if ( 1 < task->m_queue->m_team_size ) {
+      // Team task with team size of more than 1.
+      Member::TaskTeam task_team_tag ;
+
+      // Initialize team size and rank with shepherd info
+      Member member( task_team_tag );
+
+      (*task->m_apply)( task , & member );
+
+#if 0
+      fprintf( stdout,
+              "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
+              qthread_shep(),
+              qthread_worker_local(NULL),
+              reinterpret_cast<unsigned long>(task),
+              member.team_rank(),
+              member.team_size()
+            );
+      fflush(stdout);
+#endif
+
+      member.team_barrier();                             // every member has finished applying the task
+      if ( member.team_rank() == 0 ) task->closeout();   // only rank 0 closes out the task
+      member.team_barrier();                             // no member returns before closeout has run
+    }
+    else {
+      // Team task with team size of 1.
+      Member member ;
+      (*task->m_apply)( task , & member );
+      task->closeout();
+    }
+  }
+  else {
+    (*task->m_apply)( task ); // NOTE(review): called with one argument here but two above; confirm m_apply's signature
+    task->closeout();
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+
diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthreads/README
similarity index 99%
rename from lib/kokkos/core/src/Qthread/README
rename to lib/kokkos/core/src/Qthreads/README
index 6e6c86a9efc2680916e2556bda28914833e6749d..e35b1f698ec7ca3e3ee020eeee4445de43023c78 100644
--- a/lib/kokkos/core/src/Qthread/README
+++ b/lib/kokkos/core/src/Qthreads/README
@@ -22,4 +22,3 @@ sh autogen.sh
 # install
 
 make install
-
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
index 0f69be9ed4db6547d52e1c96b735069fb2332081..b1f53489f432ba093ea2222b16c88ee68e005374 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -264,7 +264,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
   const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
 
   for ( int i = 0 ; i < n ; ++i ) {
-    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+    Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
   }
 
   exec.m_pool_state = ThreadsExec::Inactive ;
@@ -308,7 +308,7 @@ void ThreadsExec::fence()
 {
   if ( s_thread_pool_size[0] ) {
     // Wait for the root thread to complete:
-    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+    Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
   }
 
   s_current_function     = 0 ;
@@ -724,7 +724,7 @@ void ThreadsExec::initialize( unsigned thread_count ,
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
   #endif
 }
@@ -777,7 +777,7 @@ void ThreadsExec::finalize()
   s_threads_process.m_pool_fan_size   = 0 ;
   s_threads_process.m_pool_state = ThreadsExec::Inactive ;
 
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
   #endif
 }
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
index 385dd492d0e8cc9417b50dd817538abf4f27246c..a6db02ebac84b96a736519a22a537bdc53ea6b1a 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -187,13 +187,13 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the reduction and broadcast
@@ -229,13 +229,13 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the reduction and broadcast
@@ -264,7 +264,7 @@ public:
 
         ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
 
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
 
         Join::join( f , reduce_memory() , fan.reduce_memory() );
       }
@@ -280,7 +280,7 @@ public:
       const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
 
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
-        Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
       }
     }
 
@@ -312,7 +312,7 @@ public:
         ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
 
         // Wait: Active -> ReductionAvailable (or ScanAvailable)
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
         Join::join( f , work_value , fan.reduce_memory() );
       }
 
@@ -330,8 +330,8 @@ public:
 
           // Wait: Active             -> ReductionAvailable
           // Wait: ReductionAvailable -> ScanAvailable
-          Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
-          Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
+          Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
 
           Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
         }
@@ -342,7 +342,7 @@ public:
 
         // Wait for all threads to complete inclusive scan
         // Wait: ScanAvailable -> Rendezvous
-        Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
       }
 
       //--------------------------------
@@ -350,7 +350,7 @@ public:
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
         // Wait: ReductionAvailable -> ScanAvailable
-        Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
         // Set: ScanAvailable -> Rendezvous
         fan.m_pool_state = ThreadsExec::Rendezvous ;
       }
@@ -377,13 +377,13 @@ public:
       // Wait for all threads to copy previous thread's inclusive scan value
       // Wait for all threads: Rendezvous -> ScanCompleted
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
       }
       if ( rev_rank ) {
         // Set: ScanAvailable -> ScanCompleted
         m_pool_state = ThreadsExec::ScanCompleted ;
         // Wait: ScanCompleted -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
       }
       // Set: ScanCompleted -> Active
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@@ -410,7 +410,7 @@ public:
       // Fan-in reduction with highest ranking thread as the root
       for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
         // Wait: Active -> Rendezvous
-        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
       }
 
       for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@@ -418,7 +418,7 @@ public:
       if ( rev_rank ) {
         m_pool_state = ThreadsExec::Rendezvous ;
         // Wait: Rendezvous -> Active
-        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
       }
       else {
         // Root thread does the thread-scan before releasing threads
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
index b9edb64551f21d96f35a5276b06b501101b4e3e7..701495428193148f0efaf8dbf1cdededabd66460 100644
--- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -49,6 +49,7 @@
 #include <utility>
 #include <impl/Kokkos_spinwait.hpp>
 #include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
 
 #include <Kokkos_Atomic.hpp>
 
@@ -103,13 +104,13 @@ public:
 
       // Wait for fan-in threads
       for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
-        Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
+        Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
       }
 
       // If not root then wait for release
       if ( m_team_rank_rev ) {
         m_exec->state() = ThreadsExec::Rendezvous ;
-        Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
+        Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
       }
 
       return ! m_team_rank_rev ;
@@ -350,6 +351,10 @@ public:
         const int team_rank_rev = pool_rank_rev % team.team_alloc();
         const size_t pool_league_size     = m_exec->pool_size() / team.team_alloc() ;
         const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+        if(pool_league_rank_rev >= pool_league_size) {
+          m_invalid_thread = 1;
+          return;
+        }
         const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
 
         const int pool_num_teams       = m_exec->pool_size()/team.team_alloc();
@@ -505,7 +510,8 @@ private:
            , const int team_size_request )
    {
       const int pool_size  = traits::execution_space::thread_pool_size(0);
-      const int team_max   = traits::execution_space::thread_pool_size(1);
+      const int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      const int team_max   = pool_size<max_host_team_size?pool_size:max_host_team_size;
       const int team_grain = traits::execution_space::thread_pool_size(2);
 
       m_league_size = league_size_request ;
@@ -552,8 +558,12 @@ public:
 
   template< class FunctorType >
   inline static
-  int team_size_max( const FunctorType & )
-    { return traits::execution_space::thread_pool_size(1); }
+  int team_size_max( const FunctorType & ) {
+      int pool_size = traits::execution_space::thread_pool_size(1);
+      int max_host_team_size =  Impl::HostThreadTeamData::max_team_members;
+      return pool_size<max_host_team_size?pool_size:max_host_team_size;
+    }
+
 
   template< class FunctorType >
   static int team_size_recommended( const FunctorType & )
@@ -819,9 +829,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
 #pragma ivdep
 #endif
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    result+=tmp;
+    lambda(i,result);
   }
 }
 
@@ -835,18 +843,14 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
 template< typename iType, class Lambda, typename ValueType, class JoinType >
 KOKKOS_INLINE_FUNCTION
 void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
-      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
 
-  ValueType result = init_result;
 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
 #pragma ivdep
 #endif
   for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
+    lambda(i,result);
   }
-  init_result = result;
 }
 
 /** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c4db3e15ef4593422eca54ab5d295f5469d3a5ad
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
@@ -0,0 +1,2356 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
+
+#include <iostream>
+#include <algorithm>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+// Define KOKKOS_MDRANGE_IVDEP only when aggressive range vectorization is requested, an ivdep feature macro is defined (KOKKOS_ENABLE_PRAGMA_IVDEP, or the older KOKKOS_HAVE_PRAGMA_IVDEP spelling), and __CUDA_ARCH__ is not defined.
+#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && ( defined(KOKKOS_ENABLE_PRAGMA_IVDEP) || defined(KOKKOS_HAVE_PRAGMA_IVDEP) ) && !defined(__CUDA_ARCH__)
+#define KOKKOS_MDRANGE_IVDEP
+#endif
+
+// KOKKOS_ENABLE_IVDEP_MDRANGE is placed in front of tile loops: it is an ivdep pragma when KOKKOS_MDRANGE_IVDEP is defined and empty otherwise.
+#if !defined(KOKKOS_MDRANGE_IVDEP)
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE
+#else
+ #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
+#endif
+
+
+
+namespace Kokkos { namespace Experimental { namespace Impl {
+
+// Temporary switch, defined to 1 here, kept while the new loop macros are being tested; where it is consulted lies outside this section.
+#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
+
+// Loop-header macros (L suffix, presumably LayoutLeft): LOOP_nL opens n nested for-loops over tile[0..n-1] with i(n-1) outermost and i0 innermost; the ivdep hint sits on the i0 loop. No body is supplied: the statement after the macro becomes the loop body.
+#define LOOP_1L(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for( type i0=0; i0<static_cast<type>(tile[0]); ++i0)
+
+#define LOOP_2L(type, tile) \
+  for( type i1=0; i1<static_cast<type>(tile[1]); ++i1) \
+  LOOP_1L(type, tile)
+
+#define LOOP_3L(type, tile) \
+  for( type i2=0; i2<static_cast<type>(tile[2]); ++i2) \
+  LOOP_2L(type, tile)
+
+#define LOOP_4L(type, tile) \
+  for( type i3=0; i3<static_cast<type>(tile[3]); ++i3) \
+  LOOP_3L(type, tile)
+
+#define LOOP_5L(type, tile) \
+  for( type i4=0; i4<static_cast<type>(tile[4]); ++i4) \
+  LOOP_4L(type, tile)
+
+#define LOOP_6L(type, tile) \
+  for( type i5=0; i5<static_cast<type>(tile[5]); ++i5) \
+  LOOP_5L(type, tile)
+
+#define LOOP_7L(type, tile) \
+  for( type i6=0; i6<static_cast<type>(tile[6]); ++i6) \
+  LOOP_6L(type, tile)
+
+#define LOOP_8L(type, tile) \
+  for( type i7=0; i7<static_cast<type>(tile[7]); ++i7) \
+  LOOP_7L(type, tile)
+// Loop-header macros (R suffix, presumably LayoutRight): LOOP_nR opens n nested for-loops over tile[0..n-1] with i0 outermost and i(n-1) innermost; the statement after the macro becomes the loop body.
+// NOTE(review): the ivdep hint comes from LOOP_1R, so for n > 1 it precedes the outermost i0 loop rather than the innermost loop - confirm this is intended.
+#define LOOP_1R(type, tile) \
+  KOKKOS_ENABLE_IVDEP_MDRANGE \
+  for ( type i0=0; i0<static_cast<type>(tile[0]); ++i0 )
+
+#define LOOP_2R(type, tile) \
+  LOOP_1R(type, tile) \
+  for ( type i1=0; i1<static_cast<type>(tile[1]); ++i1 )
+
+#define LOOP_3R(type, tile) \
+  LOOP_2R(type, tile) \
+  for ( type i2=0; i2<static_cast<type>(tile[2]); ++i2 )
+
+#define LOOP_4R(type, tile) \
+  LOOP_3R(type, tile) \
+  for ( type i3=0; i3<static_cast<type>(tile[3]); ++i3 )
+
+#define LOOP_5R(type, tile) \
+  LOOP_4R(type, tile) \
+  for ( type i4=0; i4<static_cast<type>(tile[4]); ++i4 )
+
+#define LOOP_6R(type, tile) \
+  LOOP_5R(type, tile) \
+  for ( type i5=0; i5<static_cast<type>(tile[5]); ++i5 )
+
+#define LOOP_7R(type, tile) \
+  LOOP_6R(type, tile) \
+  for ( type i6=0; i6<static_cast<type>(tile[6]); ++i6 )
+
+#define LOOP_8R(type, tile) \
+  LOOP_7R(type, tile) \
+  for ( type i7=0; i7<static_cast<type>(tile[7]); ++i7 )
+
+// LOOP_ARGS_n is the comma-separated index list i0 + m_offset[0], ..., i(n-1) + m_offset[n-1]; the i* variables and m_offset must be in scope where it is expanded.
+#define LOOP_ARGS_1 i0 + m_offset[0]
+#define LOOP_ARGS_2 i0 + m_offset[0], i1 + m_offset[1]
+#define LOOP_ARGS_3 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2]
+#define LOOP_ARGS_4 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2], i3 + m_offset[3]
+#define LOOP_ARGS_5 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2], i3 + m_offset[3], i4 + m_offset[4]
+#define LOOP_ARGS_6 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2], i3 + m_offset[3], i4 + m_offset[4], i5 + m_offset[5]
+#define LOOP_ARGS_7 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2], i3 + m_offset[3], i4 + m_offset[4], i5 + m_offset[5], i6 + m_offset[6]
+#define LOOP_ARGS_8 i0 + m_offset[0], i1 + m_offset[1], i2 + m_offset[2], i3 + m_offset[3], i4 + m_offset[4], i5 + m_offset[5], i6 + m_offset[6], i7 + m_offset[7]
+
+
+
+// New loop macros: parallel_for, non-tagged.
+// APPLY calls func with the accumulated index list; the expansion already ends in ';'.
+#define APPLY( func, ... ) \
+  func( __VA_ARGS__ );
+
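+// LayoutRight: each level appends its index after __VA_ARGS__ and passes d+1 inward, so i0 is the innermost loop and supplies the last argument.
+// d indexes extent/m_offset for the current level; for rank >= 2, LOOP_LAYOUT_n runs the outermost loop (dimension 0) and enters these macros with d = 1.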
+#define LOOP_R_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1( func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2( func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3( func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4( func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5( func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6( func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7( func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
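+// LayoutLeft: each level prepends its index before __VA_ARGS__ and passes d-1 inward, so i0 is the innermost loop and supplies the first argument.
+// d indexes extent/m_offset for the current level; for rank >= 2, LOOP_LAYOUT_n runs the outermost loop (dimension rank-1) and enters these macros with d = rank-2.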
+#define LOOP_L_1( func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY( func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2( func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1( func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3( func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2( func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4( func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3( func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5( func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4( func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6( func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5( func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7( func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6( func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8( func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7( func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
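+// Left vs Right: for rank >= 2, is_left selects the LOOP_L_* nest (outermost loop over dimension rank-1) or the LOOP_R_* nest (outermost loop over dimension 0).
+// TODO: rank does not need to be passed through; the values can be hardcoded.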
+#define LOOP_LAYOUT_1( func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY( func, i0 + m_offset[0] )              \
+  } 
+
+#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1( func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2( func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3( func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4( func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5( func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6( func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7( func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile: when cond is true the loops run over extent_full, otherwise over extent_partial.
+#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7( func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8( func, type, is_left, m_offset, extent_partial, rank ) }
+
+
+// parallel_reduce, non-tagged: reduction variants of the loop macros.
+// APPLY_REDUX calls func with the index list followed by val as the last argument; the expansion already ends in ';'.
+#define APPLY_REDUX( val, func, ... ) \
+  func( __VA_ARGS__, val );
+
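+// LayoutRight (reduction): each level appends its index after __VA_ARGS__ and passes d+1 inward; val is forwarded unchanged down to APPLY_REDUX.
+// For rank >= 2, LOOP_LAYOUT_n_REDUX runs the outermost loop (dimension 0) and enters these macros with d = 1.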
+#define LOOP_R_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define LOOP_R_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_R_1_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define LOOP_R_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_R_2_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define LOOP_R_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_R_3_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define LOOP_R_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_R_4_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define LOOP_R_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_R_5_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define LOOP_R_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_R_6_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define LOOP_R_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_R_7_REDUX( val, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
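+// LayoutLeft (reduction): each level prepends its index before __VA_ARGS__ and passes d-1 inward; val is forwarded unchanged down to APPLY_REDUX.
+// For rank >= 2, LOOP_LAYOUT_n_REDUX runs the outermost loop (dimension rank-1) and enters these macros with d = rank-2.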
+#define LOOP_L_1_REDUX( val, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define LOOP_L_2_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    LOOP_L_1_REDUX( val, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_3_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    LOOP_L_2_REDUX( val, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_4_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    LOOP_L_3_REDUX( val, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_5_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    LOOP_L_4_REDUX( val, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_6_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    LOOP_L_5_REDUX( val, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_7_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    LOOP_L_6_REDUX( val, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define LOOP_L_8_REDUX( val, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    LOOP_L_7_REDUX( val, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right (reduction): for rank >= 2, is_left selects the LOOP_L_*_REDUX nest or the LOOP_R_*_REDUX nest.
+#define LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    APPLY_REDUX( val, func, i0 + m_offset[0] )              \
+  } 
+
+#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      LOOP_L_1_REDUX( val, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      LOOP_L_2_REDUX( val, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      LOOP_L_3_REDUX( val, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      LOOP_L_4_REDUX( val, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      LOOP_L_5_REDUX( val, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      LOOP_L_6_REDUX( val, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      LOOP_L_7_REDUX( val, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile (reduction): when cond is true the loops run over extent_full, otherwise over extent_partial.
+#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_1_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_2_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_3_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_4_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_5_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_6_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_7_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+
+#define TILE_LOOP_8_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
+  if (cond) { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_full, rank ) } \
+  else      { LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent_partial, rank ) }
+// end New Loop Macros
+
+// Tagged variants of the loop macros.
+// TAGGED_APPLY calls func with tag as the first argument, followed by the index list; the expansion already ends in ';'.
+#define TAGGED_APPLY( tag, func, ... ) \
+  func( tag, __VA_ARGS__ );
+
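+// LayoutRight (tagged): each level appends its index after __VA_ARGS__ and passes d+1 inward; tag is forwarded unchanged down to TAGGED_APPLY.
+// For rank >= 2, TAGGED_LOOP_LAYOUT_n runs the outermost loop (dimension 0) and enters these macros with d = 1.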
+#define TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+#define TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+#define TAGGED_LOOP_R_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
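+// LayoutLeft (tagged): each level prepends its index before __VA_ARGS__ and passes d-1 inward; tag is forwarded unchanged down to TAGGED_APPLY.
+// For rank >= 2, TAGGED_LOOP_LAYOUT_n runs the outermost loop (dimension rank-1) and enters these macros with d = rank-2.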
+#define TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+#define TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+#define TAGGED_LOOP_L_8( tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
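+// Left vs Right (tagged): for rank >= 2, is_left selects the TAGGED_LOOP_L_* nest or the TAGGED_LOOP_R_* nest.
+// TODO: rank does not need to be passed through; the values can be hardcoded.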
+#define TAGGED_LOOP_LAYOUT_1( tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY( tag, func, i0 + m_offset[0] )              \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1( tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2( tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3( tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-5 tagged loop nest. When is_left is true the outer loop i4 runs over
+// extent[rank-1] and TAGGED_LOOP_L_4 continues from d = rank-2; otherwise the outer
+// loop runs over extent[0] and TAGGED_LOOP_R_4 continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4( tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-6 tagged loop nest. When is_left is true the outer loop i5 runs over
+// extent[rank-1] and TAGGED_LOOP_L_5 continues from d = rank-2; otherwise the outer
+// loop runs over extent[0] and TAGGED_LOOP_R_5 continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5( tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-7 tagged loop nest. When is_left is true the outer loop i6 runs over
+// extent[rank-1] and TAGGED_LOOP_L_6 continues from d = rank-2; otherwise the outer
+// loop runs over extent[0] and TAGGED_LOOP_R_6 continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6( tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-8 tagged loop nest. When is_left is true the outer loop i7 runs over
+// extent[rank-1] and TAGGED_LOOP_L_7 continues from d = rank-2; otherwise the outer
+// loop runs over extent[0] and TAGGED_LOOP_R_7 continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7( tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+// Rank-1 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_1.
+#define TAGGED_TILE_LOOP_1( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_1( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_1( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-2 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_2.
+#define TAGGED_TILE_LOOP_2( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_2( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_2( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-3 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_3.
+#define TAGGED_TILE_LOOP_3( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_3( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_3( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-4 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_4.
+#define TAGGED_TILE_LOOP_4( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_4( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_4( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-5 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_5.
+#define TAGGED_TILE_LOOP_5( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_5( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_5( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-6 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_6.
+#define TAGGED_TILE_LOOP_6( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_6( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_6( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-7 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_7.
+#define TAGGED_TILE_LOOP_7( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_7( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_7( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-8 tile dispatch: when full_tile holds the nest iterates over dims_full,
+// otherwise over dims_edge; all other arguments go unchanged to TAGGED_LOOP_LAYOUT_8.
+#define TAGGED_TILE_LOOP_8( work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_8( work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_8( work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+
+// parallel_reduce, tagged
+// Reduction version
+// Invokes func with the tag first, then the index arguments supplied through
+// __VA_ARGS__, and the reduction value val as the final argument. The expansion
+// already ends in ';', so call sites inside the loop macros add none.
+#define TAGGED_APPLY_REDUX( val, tag, func, ... ) \
+  func( tag, __VA_ARGS__, val );
+
+// LayoutRight
+// d = 0 to start
+// Innermost LayoutRight reduction loop: iterates i0 over extent[d] and calls
+// TAGGED_APPLY_REDUX with the indices collected so far (__VA_ARGS__) followed by
+// i0 + m_offset[d]. KOKKOS_ENABLE_IVDEP_MDRANGE is expanded directly before the loop.
+#define TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, __VA_ARGS__, i0 + m_offset[d] )              \
+  }
+
+// LayoutRight reduction nest, depth 2: loops i1 over extent[d] and expands
+// TAGGED_LOOP_R_1_REDUX for dimension d+1 with i1 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i1 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 3: loops i2 over extent[d] and expands
+// TAGGED_LOOP_R_2_REDUX for dimension d+1 with i2 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i2 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 4: loops i3 over extent[d] and expands
+// TAGGED_LOOP_R_3_REDUX for dimension d+1 with i3 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i3 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 5: loops i4 over extent[d] and expands
+// TAGGED_LOOP_R_4_REDUX for dimension d+1 with i4 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i4 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 6: loops i5 over extent[d] and expands
+// TAGGED_LOOP_R_5_REDUX for dimension d+1 with i5 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i5 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 7: loops i6 over extent[d] and expands
+// TAGGED_LOOP_R_6_REDUX for dimension d+1 with i6 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i6 + m_offset[d] ) \
+  }
+
+// LayoutRight reduction nest, depth 8: loops i7 over extent[d] and expands
+// TAGGED_LOOP_R_7_REDUX for dimension d+1 with i7 + m_offset[d] appended to the indices.
+#define TAGGED_LOOP_R_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, d+1 , __VA_ARGS__, i7 + m_offset[d] ) \
+  }
+
+//LayoutLeft
+// d = rank-1 to start
+// Innermost LayoutLeft reduction loop: iterates i0 over extent[d] and calls
+// TAGGED_APPLY_REDUX with i0 + m_offset[d] placed ahead of the indices collected so
+// far (__VA_ARGS__). KOKKOS_ENABLE_IVDEP_MDRANGE is expanded directly before the loop.
+#define TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d, ... )    \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[d]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[d] , __VA_ARGS__ )              \
+  }
+
+// LayoutLeft reduction nest, depth 2: loops i1 over extent[d] and expands
+// TAGGED_LOOP_L_1_REDUX for dimension d-1 with i1 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i1 = (type)0; i1 < static_cast<type>(extent[d]); ++i1) {          \
+    TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, d-1, i1 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 3: loops i2 over extent[d] and expands
+// TAGGED_LOOP_L_2_REDUX for dimension d-1 with i2 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i2 = (type)0; i2 < static_cast<type>(extent[d]); ++i2) {          \
+    TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, d-1, i2 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 4: loops i3 over extent[d] and expands
+// TAGGED_LOOP_L_3_REDUX for dimension d-1 with i3 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i3 = (type)0; i3 < static_cast<type>(extent[d]); ++i3) {          \
+    TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, d-1, i3 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 5: loops i4 over extent[d] and expands
+// TAGGED_LOOP_L_4_REDUX for dimension d-1 with i4 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i4 = (type)0; i4 < static_cast<type>(extent[d]); ++i4) {          \
+    TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, d-1, i4 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 6: loops i5 over extent[d] and expands
+// TAGGED_LOOP_L_5_REDUX for dimension d-1 with i5 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i5 = (type)0; i5 < static_cast<type>(extent[d]); ++i5) {          \
+    TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, d-1, i5 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 7: loops i6 over extent[d] and expands
+// TAGGED_LOOP_L_6_REDUX for dimension d-1 with i6 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i6 = (type)0; i6 < static_cast<type>(extent[d]); ++i6) {          \
+    TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, d-1, i6 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// LayoutLeft reduction nest, depth 8: loops i7 over extent[d] and expands
+// TAGGED_LOOP_L_7_REDUX for dimension d-1 with i7 + m_offset[d] placed ahead of the indices.
+#define TAGGED_LOOP_L_8_REDUX( val, tag, func, type, m_offset, extent, d, ... )             \
+  for( type i7 = (type)0; i7 < static_cast<type>(extent[d]); ++i7) {          \
+    TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, d-1, i7 + m_offset[d] , __VA_ARGS__ ) \
+  }
+
+// Left vs Right
+// Rank-1 tagged reduction loop: iterates i0 over extent[0] and passes
+// i0 + m_offset[0] to TAGGED_APPLY_REDUX together with val. is_left and rank are
+// accepted for signature parity with the higher ranks but are not used here.
+#define TAGGED_LOOP_LAYOUT_1_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  KOKKOS_ENABLE_IVDEP_MDRANGE                            \
+  for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
+    TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] )              \
+  } 
+
+// Rank-2 tagged reduction nest. When is_left is true the outer loop i1 runs over
+// extent[rank-1] and TAGGED_LOOP_L_1_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_1_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[rank-1]); ++i1) {   \
+      TAGGED_LOOP_L_1_REDUX( val, tag, func, type, m_offset, extent, rank-2, i1 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
+      TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-3 tagged reduction nest. When is_left is true the outer loop i2 runs over
+// extent[rank-1] and TAGGED_LOOP_L_2_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_2_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[rank-1]); ++i2) {   \
+      TAGGED_LOOP_L_2_REDUX( val, tag, func, type, m_offset, extent, rank-2, i2 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
+      TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-4 tagged reduction nest. When is_left is true the outer loop i3 runs over
+// extent[rank-1] and TAGGED_LOOP_L_3_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_3_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[rank-1]); ++i3) {   \
+      TAGGED_LOOP_L_3_REDUX( val, tag, func, type, m_offset, extent, rank-2, i3 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
+      TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-5 tagged reduction nest. When is_left is true the outer loop i4 runs over
+// extent[rank-1] and TAGGED_LOOP_L_4_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_4_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[rank-1]); ++i4) {   \
+      TAGGED_LOOP_L_4_REDUX( val, tag, func, type, m_offset, extent, rank-2, i4 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
+      TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-6 tagged reduction nest. When is_left is true the outer loop i5 runs over
+// extent[rank-1] and TAGGED_LOOP_L_5_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_5_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[rank-1]); ++i5) {   \
+      TAGGED_LOOP_L_5_REDUX( val, tag, func, type, m_offset, extent, rank-2, i5 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
+      TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-7 tagged reduction nest. When is_left is true the outer loop i6 runs over
+// extent[rank-1] and TAGGED_LOOP_L_6_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_6_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[rank-1]); ++i6) {   \
+      TAGGED_LOOP_L_6_REDUX( val, tag, func, type, m_offset, extent, rank-2, i6 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
+      TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] )   \
+    } \
+  } 
+
+// Rank-8 tagged reduction nest. When is_left is true the outer loop i7 runs over
+// extent[rank-1] and TAGGED_LOOP_L_7_REDUX continues from d = rank-2; otherwise the
+// outer loop runs over extent[0] and TAGGED_LOOP_R_7_REDUX continues from d = 1.
+#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank )  \
+  if (is_left) { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[rank-1]); ++i7) {   \
+      TAGGED_LOOP_L_7_REDUX( val, tag, func, type, m_offset, extent, rank-2, i7 + m_offset[rank-1] ) \
+    } \
+  } \
+  else         { \
+    for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
+      TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] )   \
+    } \
+  } 
+
+// Partial vs Full Tile
+// Rank-1 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_1_REDUX.
+#define TAGGED_TILE_LOOP_1_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_1_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_1_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-2 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_2_REDUX.
+#define TAGGED_TILE_LOOP_2_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_2_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_2_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-3 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_3_REDUX.
+#define TAGGED_TILE_LOOP_3_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_3_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_3_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-4 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_4_REDUX.
+#define TAGGED_TILE_LOOP_4_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_4_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_4_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-5 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_5_REDUX.
+#define TAGGED_TILE_LOOP_5_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_5_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_5_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-6 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_6_REDUX.
+#define TAGGED_TILE_LOOP_6_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_6_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_6_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-7 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_7_REDUX.
+#define TAGGED_TILE_LOOP_7_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_7_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_7_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// Rank-8 reduction tile dispatch: when full_tile holds the nest iterates over
+// dims_full, otherwise over dims_edge; the accumulator and all other arguments go
+// unchanged to TAGGED_LOOP_LAYOUT_8_REDUX.
+#define TAGGED_TILE_LOOP_8_REDUX( accum, work_tag, functor, index_t, left_layout, full_tile, origin, dims_full, dims_edge, nd ) \
+  if (full_tile) { \
+    TAGGED_LOOP_LAYOUT_8_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_full, nd ) \
+  } \
+  else { \
+    TAGGED_LOOP_LAYOUT_8_REDUX( accum, work_tag, functor, index_t, left_layout, origin, dims_edge, nd ) \
+  }
+
+// end tagged macros
+
+
+
+
+// Structs for calling loops
+// Primary template, declared only. The specializations that follow are selected by
+// Rank (1..8) and by whether Tagged is void; each exposes static apply() overloads
+// that expand the matching tile-loop macro. Enable is the SFINAE slot used by the
+// tagged specializations.
+template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
+struct Tile_Loop_Type;
+
+// Rank-1 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 1.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<1, IsLeft, IType, void, void >
+{
+  // Plain traversal: expands TILE_LOOP_1 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_1 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_1( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 1 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_1_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 1 );
+  }
+};
+
+// Rank-2 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 2.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<2, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_2 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_2 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_2( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 2 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_2_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 2 );
+  }
+};
+
+// Rank-3 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 3.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<3, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_3 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_3 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_3( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 3 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_3_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 3 );
+  }
+};
+
+// Rank-4 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 4.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<4, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_4 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_4 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_4( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 4 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_4_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 4 );
+  }
+};
+
+// Rank-5 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 5.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<5, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_5 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_5 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_5( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 5 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_5_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 5 );
+  }
+};
+
+// Rank-6 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 6.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<6, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_6 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_6 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_6( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 6 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_6_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 6 );
+  }
+};
+
+// Rank-7 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 7.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<7, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_7 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_7 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_7( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 7 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_7_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 7 );
+  }
+};
+
+// Rank-8 dispatcher for functors invoked without a work tag (Tagged == void).
+// Both overloads expand the untagged tile-loop macros with rank 8.
+template < bool IsLeft, typename IType >
+struct Tile_Loop_Type<8, IsLeft, IType, void, void>
+{
+  // Plain traversal: expands TILE_LOOP_8 (presumably the untagged twin of
+  // TAGGED_TILE_LOOP_8 -- confirm at its definition).
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_8( functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 8 );
+  }
+
+  // Reducing traversal: same arguments with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TILE_LOOP_8_REDUX( accum, functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 8 );
+  }
+};
+
+// tagged versions
+
+// Rank-1 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<1, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type >
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_1( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 1 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_1_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 1 );
+  }
+};
+
+// Rank-2 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<2, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_2( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 2 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_2_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 2 );
+  }
+};
+
+// Rank-3 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<3, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_3( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 3 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_3_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 3 );
+  }
+};
+
+// Rank-4 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<4, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_4( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 4 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_4_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 4 );
+  }
+};
+
+// Rank-5 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<5, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_5( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 5 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_5_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 5 );
+  }
+};
+
+// Rank-6 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<6, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_6( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 6 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_6_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 6 );
+  }
+};
+
+// Rank-7 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<7, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_7( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 7 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_7_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 7 );
+  }
+};
+
+// Rank-8 dispatcher chosen when Tagged is not void. A default-constructed Tagged
+// object is handed to the tagged tile-loop macros as the work tag.
+template < bool IsLeft, typename IType, typename Tagged >
+struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::is_same<Tagged,void>::value>::type>
+{
+  // Plain traversal: full_extent is used when is_full_tile is true, else partial_extent.
+  template < typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_8( Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 8 );
+  }
+
+  // Reducing traversal: same selection, with the accumulator passed first.
+  template < typename ValType, typename Functor, typename Origin, typename FullExtent, typename PartialExtent >
+  static void apply( ValType &accum, Functor const& functor, bool is_full_tile, Origin const& tile_origin,
+                     FullExtent const& full_extent, PartialExtent const& partial_extent )
+  {
+    TAGGED_TILE_LOOP_8_REDUX( accum, Tagged(), functor, IType, IsLeft, is_full_tile, tile_origin, full_extent, partial_extent, 8 );
+  }
+};
+// end Structs for calling loops
+
+
+// Shorthand trait: is_void<T>::value is true exactly when T is void.
+template <typename T>
+using is_void = std::is_same< T , void >;
+
+// Primary template, declared only. Tag and ValueType default to void; the Enable
+// slot lets specializations be selected with std::enable_if (the specialization
+// below is chosen when ValueType is void).
+template < typename RP
+         , typename Functor
+         , typename Tag = void
+         , typename ValueType = void
+         , typename Enable = void
+         >
+struct HostIterateTile;
+
+//For ParallelFor
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
+{
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  // Stores the range policy and the functor in m_rp and m_func for later use by
+  // operator().
+  inline
+  HostIterateTile( RP const& rp, Functor const& func )
+    : m_rp(rp)
+    , m_func(func)
+  {
+  }
+
+  // Computes the per-dimension extent of the tile that starts at offset.
+  // A dimension whose tile fits below m_upper keeps the configured tile size;
+  // otherwise the extent is clipped and the tile is reported as partial.
+  // Returns true only when every dimension holds a full tile.
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool every_dim_full = true;
+
+    for ( int dim = 0; dim < RP::rank; ++dim ) {
+      if ( (offset[dim] + m_rp.m_tile[dim]) <= m_rp.m_upper[dim] ) {
+        partial_tile[dim] = m_rp.m_tile[dim] ;
+        continue;
+      }
+
+      every_dim_full = false ;
+      if ( (m_rp.m_upper[dim] - 1 - offset[dim]) == 0 ) {
+        // exactly one index is left in this dimension
+        partial_tile[dim] = 1 ;
+      }
+      else if ( (m_rp.m_upper[dim] - m_rp.m_tile[dim]) > 0 ) {
+        partial_tile[dim] = (m_rp.m_upper[dim] - offset[dim]) ;
+      }
+      else {
+        // when single tile encloses range
+        partial_tile[dim] = (m_rp.m_upper[dim] - m_rp.m_lower[dim]) ;
+      }
+    }
+
+    return every_dim_full ;
+  } // end check bounds
+
+
+  // Empty tag type carrying the rank as a compile-time constant; used to pick the
+  // operator_impl overload that matches RP::rank.
+  template <int Rank>
+  struct RankTag 
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  // Executes the tile identified by the flat index tile_idx.
+  // The index is unpacked into a per-dimension start offset, the tile extents are
+  // clipped against the upper bounds, and Tile_Loop_Type runs the loop nest.
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  {
+    point_type tile_origin;
+    point_type tile_extent;
+
+    // Outer direction Left consumes the index from dimension 0 upwards, any other
+    // direction from the last dimension downwards.
+    const bool outer_is_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int dim = outer_is_left ? step : (RP::rank - 1 - step);
+      tile_origin[dim] = (tile_idx % m_rp.m_tile_end[dim]) * m_rp.m_tile[dim] + m_rp.m_lower[dim] ;
+      tile_idx /= m_rp.m_tile_end[dim];
+    }
+
+    // Check if offset+tiledim in bounds - if not, use the partial tile dims instead
+    const bool is_full = check_iteration_bounds(tile_extent , tile_origin) ;
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, is_full, tile_origin, m_rp.m_tile, tile_extent );
+
+  }
+
+#else 
+  // Entry point when KOKKOS_ENABLE_NEW_LOOP_MACROS is off: forwards tile_idx to the
+  // operator_impl overload selected by RankTag<RP::rank>.
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+  // added due to compiler error when using sfinae to choose operator based on rank w/ cuda+serial
+
+  // Rank-2 tile execution. Unpacks tile_idx into the tile's start offset, clips the
+  // tile extents against the upper bounds, then runs the rank-2 loop macro for the
+  // inner iteration direction.
+  // NOTE(review): LOOP_2L / LOOP_2R / LOOP_ARGS_2 are defined outside this view and
+  // presumably refer to the locals m_offset and m_tiledims by name -- confirm before
+  // renaming either.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Outer direction Left consumes tile_idx from dimension 0 upwards ...
+    if (RP::outer_direction == RP::Left) {
+      for (int i=0; i<RP::rank; ++i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ; 
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+    // ... any other direction from the last dimension downwards.
+    else {
+      for (int i=RP::rank-1; i>=0; --i) {
+        m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
+        tile_idx /= m_rp.m_tile_end[i];
+      }
+    }
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    // The full_tile and partial branches below are textually identical: both loop
+    // over m_tiledims, which check_iteration_bounds fills for either case.
+    if (RP::inner_direction == RP::Left) {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2L(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Left
+    else {
+     if ( full_tile ) {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      } else {
+//      #pragma simd
+        LOOP_2R(index_type, m_tiledims) {
+          apply( LOOP_ARGS_2 );
+        }
+      }
+    } // end RP::Right
+
+  } //end op() rank == 2
+
+
+  // Rank-3 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_3L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_3 );
+      }
+    }
+    else {
+      LOOP_3R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_3 );
+      }
+    }
+
+  } //end op() rank == 3
+
+
+  // Rank-4 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_4L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_4 );
+      }
+    }
+    else {
+      LOOP_4R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_4 );
+      }
+    }
+
+  } //end op() rank == 4
+
+
+  // Rank-5 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_5L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_5 );
+      }
+    }
+    else {
+      LOOP_5R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_5 );
+      }
+    }
+
+  } //end op() rank == 5
+
+
+  // Rank-6 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_6L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_6 );
+      }
+    }
+    else {
+      LOOP_6R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_6 );
+      }
+    }
+
+  } //end op() rank == 6
+
+
+  // Rank-7 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_7L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_7 );
+      }
+    }
+    else {
+      LOOP_7R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_7 );
+      }
+    }
+
+  } //end op() rank == 7
+
+
+  // Rank-8 tile body: decode the flat tile index into the tile's starting
+  // offset in each dimension, obtain the tile extents from
+  // check_iteration_bounds, then call apply() for every point of the tile.
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    // NOTE(review): the names m_offset / m_tiledims / full_tile are kept
+    // because the LOOP_* macros are defined outside this block and may refer
+    // to locals by name - confirm before renaming.
+    point_type m_offset;
+    point_type m_tiledims;
+
+    // Peel one dimension per step off tile_idx; the outer direction decides
+    // whether dimension 0 or the last dimension is decoded first.
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      m_offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+
+    // Fills m_tiledims (tile dims, or the partial tile dims when the tile
+    // would run out of bounds). Full and partial tiles execute the same loop
+    // nest, so the returned flag selects nothing here.
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_8L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_8 );
+      }
+    }
+    else {
+      LOOP_8R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_8 );
+      }
+    }
+
+  } //end op() rank == 8
+#endif
+
+
+    // Untagged call: enabled only when Tag is void and exactly RP::rank
+    // indices are supplied; passes the indices straight to the functor.
+    template <typename... Args>
+    typename std::enable_if< std::is_same<Tag,void>::value
+                             && ( sizeof...(Args) == RP::rank ) >::type
+    apply(Args &&... args) const
+    {
+      m_func(args...);
+    }
+
+    // Tagged call: enabled only when Tag is not void and exactly RP::rank
+    // indices are supplied; the functor receives m_tag as its first argument.
+    template <typename... Args>
+    typename std::enable_if< !std::is_same<Tag,void>::value
+                             && ( sizeof...(Args) == RP::rank ) >::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args...);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+//  value_type  & m_v;
+
+};
+
+
+// ValueType: For reductions
+template < typename RP
+         , typename Functor
+         , typename Tag
+         , typename ValueType
+         >
+struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
+{
+  // Host-side tile iterator for the non-void ValueType case: every functor
+  // call made through apply() receives the value reference m_v as its last
+  // argument.
+  using index_type = typename RP::index_type;
+  using point_type = typename RP::point_type;
+
+  using value_type = ValueType;
+
+  // Stores references to the range policy, the functor and the value; none
+  // of them is copied, so all three must outlive this object.
+  // Parentheses (not braces) are used in the initializer list because brace
+  // initialization here was rejected when building with cuda 7.0
+  // ("too many braces around initializer").
+  inline
+  HostIterateTile( RP const& rp, Functor const& func, value_type & v )
+    : m_rp(rp)
+    , m_func(func)
+    , m_v(v)
+  {
+  }
+
+  // Computes the extents of the tile that starts at `offset`.
+  //   partial_tile [out] : per-dimension extent; m_rp.m_tile[i] when the
+  //                        tile fits below m_rp.m_upper[i], otherwise the
+  //                        remaining distance m_rp.m_upper[i] - offset[i].
+  //   offset       [in]  : starting index of the tile in each dimension.
+  // Returns true when the tile fits in every dimension.
+  inline
+  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
+    bool is_full_tile = true;
+
+    for ( int i = 0; i < RP::rank; ++i ) {
+      if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
+        partial_tile[i] = m_rp.m_tile[i] ;
+      }
+      else {
+        is_full_tile = false ;
+        // The clipped extent is always the distance from the tile start to
+        // the upper bound. Using (upper - lower) whenever upper <= tile is
+        // only right if the tile starts at lower; with a negative lower
+        // bound the last tile can start elsewhere and would overrun.
+        partial_tile[i] = m_rp.m_upper[i] - offset[i] ;
+      }
+    }
+
+    return is_full_tile ;
+  } // end check bounds
+
+
+  // Tag type used to pick the operator_impl overload for a given rank.
+  template <int Rank>
+  struct RankTag 
+  {
+    typedef RankTag type;
+    enum { value = (int)Rank };
+  };
+
+
+  // Decodes a flat tile index into the starting offset of that tile in each
+  // dimension: offset[d] = (tile_idx % m_tile_end[d]) * m_tile[d] + m_lower[d],
+  // dividing tile_idx by m_tile_end[d] after each dimension. Dimension 0 is
+  // decoded first when the outer direction is Left, the last dimension first
+  // otherwise. tile_idx is taken by value; the caller's copy is untouched.
+  template <typename IType>
+  inline
+  void compute_tile_offset( IType tile_idx , point_type & offset ) const
+  {
+    const bool outer_left = (RP::outer_direction == RP::Left);
+    for (int step = 0; step < RP::rank; ++step) {
+      const int d = outer_left ? step : (RP::rank - 1 - step);
+      offset[d] = (tile_idx % m_rp.m_tile_end[d]) * m_rp.m_tile[d] + m_rp.m_lower[d] ;
+      tile_idx /= m_rp.m_tile_end[d];
+    }
+  }
+
+
+#if KOKKOS_ENABLE_NEW_LOOP_MACROS
+  // Processes the tile with flat index tile_idx by handing the offsets and
+  // extents to Tile_Loop_Type.
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { 
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+
+    //Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+
+    Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
+
+  }
+
+#else 
+  // Processes the tile with flat index tile_idx. The rank-specific body is
+  // chosen through a RankTag argument; selecting the operator by SFINAE on
+  // the rank caused a compiler error.
+  template <typename IType>
+  inline
+  void
+  operator()(IType tile_idx) const
+  { operator_impl( tile_idx , RankTag<RP::rank>() ); }
+
+  // Each operator_impl below follows the same steps: decode the tile offset,
+  // compute the (possibly clipped) tile extents, then run the rank-specific
+  // LOOP_* nest, calling apply() for every point. Full and partial tiles use
+  // the same loop nest, so the full_tile flag selects nothing.
+  // NOTE(review): the locals m_offset / m_tiledims / full_tile keep their
+  // names because the LOOP_* macros are defined elsewhere and may refer to
+  // locals by name - confirm before renaming.
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<2> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ; 
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_2L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_2 );
+      }
+    }
+    else {
+      LOOP_2R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_2 );
+      }
+    }
+
+  } //end op() rank == 2
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<3> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_3L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_3 );
+      }
+    }
+    else {
+      LOOP_3R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_3 );
+      }
+    }
+
+  } //end op() rank == 3
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<4> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_4L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_4 );
+      }
+    }
+    else {
+      LOOP_4R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_4 );
+      }
+    }
+
+  } //end op() rank == 4
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<5> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_5L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_5 );
+      }
+    }
+    else {
+      LOOP_5R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_5 );
+      }
+    }
+
+  } //end op() rank == 5
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<6> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_6L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_6 );
+      }
+    }
+    else {
+      LOOP_6R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_6 );
+      }
+    }
+
+  } //end op() rank == 6
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<7> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_7L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_7 );
+      }
+    }
+    else {
+      LOOP_7R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_7 );
+      }
+    }
+
+  } //end op() rank == 7
+
+
+  template <typename IType>
+  inline
+  void operator_impl( IType tile_idx , const RankTag<8> ) const
+  {
+    point_type m_offset;
+    point_type m_tiledims;
+
+    compute_tile_offset( tile_idx , m_offset );
+    const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
+    (void) full_tile;
+
+    if (RP::inner_direction == RP::Left) {
+      LOOP_8L(index_type, m_tiledims) {
+        apply( LOOP_ARGS_8 );
+      }
+    }
+    else {
+      LOOP_8R(index_type, m_tiledims) {
+        apply( LOOP_ARGS_8 );
+      }
+    }
+
+  } //end op() rank == 8
+#endif
+
+
+    // Untagged call: enabled when Tag is void and exactly RP::rank indices
+    // are supplied; the value reference m_v is appended as the last argument.
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func(args... , m_v);
+    }
+
+    // Tagged call: enabled when Tag is not void; m_tag is passed first, then
+    // the indices, then the value reference m_v.
+    template <typename... Args>
+    typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
+    apply(Args &&... args) const
+    {
+      m_func( m_tag, args... , m_v);
+    }
+
+
+  RP         const& m_rp;
+  Functor    const& m_func;
+  value_type  & m_v;
+  // Placeholder int when Tag is void (never read by the untagged apply()).
+  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
+
+};
+
+
+// ------------------------------------------------------------------ //
+
+// MDFunctor - wraps the range_policy and functor to pass to IterateTile
+// Serial, Threads, OpenMP
+// Cuda uses DeviceIterateTile directly within md_parallel_for
+// ParallelReduce
+template < typename MDRange, typename Functor, typename ValueType = void >
+struct MDFunctor
+{
+  // Wrapper for the non-void ValueType case: holds copies of the range
+  // policy and the functor, and runs one tile per call through
+  // HostIterateTile, forwarding the caller's value reference.
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using value_type   = ValueType;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = Kokkos::Experimental::Impl::HostIterateTile< MDRange , Functor , work_tag , value_type >;
+
+  MDRange   m_range;
+  Functor   m_func;
+
+  // Copies the policy and the functor. The third argument is accepted but
+  // not stored; the value used for each tile is the one handed to
+  // operator().
+  inline
+  MDFunctor( MDRange const& range, Functor const& f, ValueType & )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  // Copy and move operations are the compiler-generated ones.
+  MDFunctor( MDFunctor const& ) = default;
+  MDFunctor( MDFunctor && ) = default;
+  MDFunctor& operator=( MDFunctor const& ) = default;
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  // Processes tile t, passing v on to the tile iterator.
+  // Plain inline on purpose: KOKKOS_FORCEINLINE_FUNCTION caused a cuda
+  // __host__ warning here.
+  inline
+  void operator()(index_type t, value_type & v) const
+  {
+    iterate_type tile_iterator(m_range, m_func, v);
+    tile_iterator(t);
+  }
+};
+
+// ParallelFor
+template < typename MDRange, typename Functor >
+struct MDFunctor< MDRange, Functor, void >
+{
+  // Wrapper for the void ValueType case: holds copies of the range policy
+  // and the functor, and runs one tile per call through HostIterateTile.
+  using range_policy = MDRange;
+  using functor_type = Functor;
+  using work_tag     = typename range_policy::work_tag;
+  using index_type   = typename range_policy::index_type;
+  using iterate_type = Kokkos::Experimental::Impl::HostIterateTile< MDRange , Functor , work_tag , void >;
+
+  MDRange m_range;
+  Functor m_func;
+
+  // Copies the policy and the functor into the wrapper.
+  inline
+  MDFunctor( MDRange const& range, Functor const& f )
+    : m_range( range )
+    , m_func( f )
+  {}
+
+  // Copy and move operations are the compiler-generated ones.
+  MDFunctor( MDFunctor const& ) = default;
+  MDFunctor( MDFunctor && ) = default;
+  MDFunctor& operator=( MDFunctor const& ) = default;
+  MDFunctor& operator=( MDFunctor && ) = default;
+
+  // Processes tile t.
+  inline
+  void operator()(index_type t) const
+  {
+    iterate_type tile_iterator(m_range, m_func);
+    tile_iterator(t);
+  }
+};
+
+#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
+
+} } } //end namespace Kokkos::Experimental::Impl
+
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
index 0ffbc0548ab663c9b6afa8799f162e3c7bbd7510..7d7fd3d1334901f1cc57e554f6c46f7f17ca09c4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
 {
 #if defined( __CUDA_ARCH__ )
   return __ffs(i) - 1;
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_ffs(i) - 1;
-#elif defined( __INTEL_COMPILER )
+#elif defined( KOKKOS_COMPILER_INTEL )
   return _bit_scan_forward(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
 #else
-
   unsigned t = 1u;
   int r = 0;
   while ( i && ( i & t == 0 ) )
@@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
   enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
 #if defined( __CUDA_ARCH__ )
   return shift - __clz(i);
+#elif defined( KOKKOS_COMPILER_INTEL )
+  return _bit_scan_reverse(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return shift - __cntlz4(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
-#elif defined( __INTEL_COMPILER )
-  return _bit_scan_reverse(i);
 #else
   unsigned t = 1u << shift;
   int r = 0;
@@ -101,10 +104,12 @@ int bit_count( unsigned i )
 {
 #if defined( __CUDA_ARCH__ )
   return __popc(i);
-#elif defined( __GNUC__ ) || defined( __GNUG__ )
-  return __builtin_popcount(i);
 #elif defined ( __INTEL_COMPILER )
   return _popcnt32(i);
+#elif defined( KOKKOS_COMPILER_IBM )
+  return __popcnt4(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
 #else
   // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
   i = i - ( ( i >> 1 ) & ~0u / 3u );                             // temp
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
index cd38eaa9da867a31a9274684f235456b30590d92..7c38430c44986d5dcffad9c03c9f587ffdc91863 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
   }
 #endif
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::initialize();
 #endif
 }
@@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
 void finalize_internal( const bool all_spaces = false )
 {
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
     Kokkos::Profiling::finalize();
 #endif
 
@@ -449,5 +449,323 @@ void fence()
   Impl::fence_internal();
 }
 
+void print_configuration( std::ostream & out , const bool detail )
+{
+  std::ostringstream msg;
+  // Build the report (compile-time macros, then per-backend runtime info) in 'msg'; 'out' receives it once at the end.
+  msg << "Compiler:" << std::endl;
+#ifdef KOKKOS_COMPILER_APPLECC
+  msg << "  KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CLANG
+  msg << "  KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_CRAYC
+  msg << "  KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_GNU
+  msg << "  KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_IBM
+  msg << "  KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_INTEL
+  msg << "  KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_NVCC
+  msg << "  KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
+#endif
+#ifdef KOKKOS_COMPILER_PGI
+  msg << "  KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
+#endif
+
+
+  msg << "Architecture:" << std::endl;
+#ifdef KOKKOS_ENABLE_ISA_KNC
+  msg << "  KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
+#endif
+#ifdef KOKKOS_ENABLE_ISA_X86_64
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
+#else
+  msg << "  KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
+#endif
+
+
+  msg << "Devices:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA: ";
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PTHREAD: ";
+#ifdef KOKKOS_ENABLE_PTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_STDTHREAD: ";
+#ifdef KOKKOS_ENABLE_STDTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_WINTHREAD: ";
+#ifdef KOKKOS_ENABLE_WINTHREAD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Default Device:" << std::endl;
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
+#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Atomics:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_GNU_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_GNU_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_OPENMP_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
+#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Vectorization:" << std::endl;
+  msg << "  KOKKOS_ENABLE_PRAGMA_IVDEP: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_SIMD: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_UNROLL: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PRAGMA_VECTOR: ";
+#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+  msg << "Memory:" << std::endl;
+  msg << "  KOKKOS_ENABLE_HBWSPACE: ";
+#ifdef KOKKOS_ENABLE_HBWSPACE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
+#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_POSIX_MEMALIGN: ";
+#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+
+  msg << "Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_ASM: ";
+#ifdef KOKKOS_ENABLE_ASM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX1Z: ";
+#ifdef KOKKOS_ENABLE_CXX1Z
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
+#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_HWLOC: ";
+#ifdef KOKKOS_ENABLE_HWLOC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_LIBRT: ";
+#ifdef KOKKOS_ENABLE_LIBRT
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_MPI: ";
+#ifdef KOKKOS_ENABLE_MPI
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_PROFILING: ";
+#ifdef KOKKOS_ENABLE_PROFILING
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#ifdef KOKKOS_ENABLE_CUDA
+  msg << "Cuda Options:" << std::endl;
+  msg << "  KOKKOS_ENABLE_CUDA_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
+#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
+#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUDA_UVM: ";
+#ifdef KOKKOS_ENABLE_CUDA_UVM
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CUSPARSE: ";
+#ifdef KOKKOS_ENABLE_CUSPARSE
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+  msg << "  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+  msg << "yes" << std::endl;
+#else
+  msg << "no" << std::endl;
+#endif
+
+#endif
+  // Backend guards use the KOKKOS_ENABLE_* macros reported above. NOTE(review): confirm Threads is declared when only KOKKOS_ENABLE_WINTHREAD is set.
+  msg << "\nRuntime Configuration:" << std::endl;
+#ifdef KOKKOS_ENABLE_CUDA
+  Cuda::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+  OpenMP::print_configuration(msg, detail);
+#endif
+#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
+  Threads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_QTHREADS
+  Qthreads::print_configuration(msg, detail);
+#endif
+#ifdef KOKKOS_ENABLE_SERIAL
+  Serial::print_configuration(msg, detail);
+#endif
+
+  out << msg.str() << std::endl;
+}
+
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b425b3f19fa159925364d20ac6d5bc85b45bebae
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
@@ -0,0 +1,653 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORANALYSIS_HPP
+#define KOKKOS_FUNCTORANALYSIS_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_Reducer.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct FunctorPatternInterface { // Empty tag types naming the parallel pattern; one is passed as FunctorAnalysis's PatternInterface argument.
+  struct FOR {};
+  struct REDUCE {}; // the only pattern for which FunctorAnalysis deduces join/init/final (INTERFACE is DISABLE otherwise)
+  struct SCAN {};
+};
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If 'value_type' is not explicitly declared in the functor
+ *  then attempt to deduce the type from FunctorType::operator()
+ *  interface used by the pattern and policy.
+ *
+ *  For the REDUCE pattern generate a Reducer and finalization function
+ *  derived from what is available within the functor.
+ */
+template< typename PatternInterface , class Policy , class Functor >
+struct FunctorAnalysis {
+private:
+
+  using FOR    = FunctorPatternInterface::FOR ;
+  using REDUCE = FunctorPatternInterface::REDUCE ;
+  using SCAN   = FunctorPatternInterface::SCAN ;
+
+  //----------------------------------------
+
+  struct VOID {};
+
+  template< typename P = Policy , typename = std::false_type >
+  struct has_work_tag
+    {
+      using type = void ;
+      using wtag = VOID ;
+    };
+
+  template< typename P >
+  struct has_work_tag // Selected only when P::work_tag exists and is not void:
+    < P , typename std::is_same< typename P::work_tag , void >::type > // ...::type must equal the primary template's default, std::false_type
+    {
+      using type = typename P::work_tag ;
+      using wtag = typename P::work_tag ;
+    };
+
+  using Tag  = typename has_work_tag<>::type ;
+  using WTag = typename has_work_tag<>::wtag ;
+
+  //----------------------------------------
+  // Check for Functor::value_type, which is either a simple type T or T[]
+
+  template< typename F , typename = std::false_type >
+  struct has_value_type { using type = void ; };
+
+  template< typename F >
+  struct has_value_type // Selected only when F::value_type exists and is not void (second argument must equal std::false_type).
+    < F , typename std::is_same< typename F::value_type , void >::type >
+  {
+    using type = typename F::value_type ;
+    // Reject references, arrays of rank greater than one, and arrays whose first extent is known (T[N]).
+    static_assert( ! std::is_reference< type >::value &&
+                   std::rank< type >::value <= 1 &&
+                   std::extent< type >::value == 0
+                 , "Kokkos Functor::value_type is T or T[]" );
+  };
+
+  //----------------------------------------
+  // If Functor::value_type does not exist then evaluate operator(),
+  // depending upon the pattern and whether the policy has a work tag,
+  // to determine the reduction or scan value_type.
+
+  template< typename F
+          , typename P = PatternInterface
+          , typename V = typename has_value_type<F>::type
+          , bool     T = std::is_same< Tag , void >::value
+          >
+  struct deduce_value_type { using type = V ; };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , true > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , REDUCE , void , false > {
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & ) const );
+
+    template< typename M , typename A >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , true > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  template< typename F >
+  struct deduce_value_type< F , SCAN , void , false > {
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag , M , A & , I ) const );
+
+    template< typename M , typename A , typename I >
+    KOKKOS_INLINE_FUNCTION static
+    A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
+
+    using type = decltype( deduce( & F::operator() ) );
+  };
+
+  //----------------------------------------
+
+  using candidate_type = typename deduce_value_type< Functor >::type ;
+
+  enum { candidate_is_void  = std::is_same< candidate_type , void >::value
+       , candidate_is_array = std::rank< candidate_type >::value == 1 };
+
+  //----------------------------------------
+
+public:
+
+  using value_type = typename std::remove_extent< candidate_type >::type ;
+
+  static_assert( ! std::is_const< value_type >::value
+               , "Kokkos functor operator reduce argument cannot be const" );
+
+private:
+
+  // Stub to avoid defining a type 'void &'
+  using ValueType = typename
+    std::conditional< candidate_is_void , VOID , value_type >::type ;
+
+public:
+
+  using pointer_type = typename
+    std::conditional< candidate_is_void , void , ValueType * >::type ;
+
+  using reference_type = typename
+    std::conditional< candidate_is_array  , ValueType * , typename
+    std::conditional< ! candidate_is_void , ValueType & , void >
+    ::type >::type ;
+
+private:
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< IsArray , unsigned >::type
+  get_length( FF const & f ) { return f.value_count ; }
+
+  template< bool IsArray , class FF >
+  KOKKOS_INLINE_FUNCTION static
+  typename std::enable_if< ! IsArray , unsigned >::type
+  get_length( FF const & ) { return 1 ; }
+
+public:
+
+  enum { StaticValueSize = ! candidate_is_void &&
+                           ! candidate_is_array
+                         ? sizeof(ValueType) : 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const Functor & f )
+    { return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
+
+  //----------------------------------------
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const Unknown & )
+    { return 1 ; }
+
+  template< class Unknown >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const Unknown & )
+    { return sizeof(ValueType); }
+
+private:
+
+  enum INTERFACE : int
+    { DISABLE           = 0
+    , NO_TAG_NOT_ARRAY  = 1
+    , NO_TAG_IS_ARRAY   = 2
+    , HAS_TAG_NOT_ARRAY = 3
+    , HAS_TAG_IS_ARRAY  = 4
+    , DEDUCED =
+       ! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
+       std::is_same<Tag,void>::value
+         ? (candidate_is_array ? NO_TAG_IS_ARRAY  : NO_TAG_NOT_ARRAY)
+         : (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
+    };
+
+  //----------------------------------------
+  // parallel_reduce join operator
+
+  template< class F , INTERFACE >
+  struct has_join_function ;
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_NOT_ARRAY > // untagged join on a single (non-array) value
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+      // Declaration-only probe for decltype: the member must be const because join() below calls it on a const F.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );
+      // Probe for a static join.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+      // Forward to the functor's join, passing the pointed-to values by reference.
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , NO_TAG_IS_ARRAY > // untagged join on an array value (passed as pointers)
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+      // Declaration-only probe for decltype: the member must be const because join() below calls it on a const F.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( vref_type , cvref_type ) const );
+      // Probe for a static join.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( vref_type , cvref_type ) );
+      // Forward to the functor's join, passing the pointers through unchanged.
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( dst , src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_NOT_ARRAY > // work-tagged join on a single (non-array) value
+    {
+      typedef volatile       ValueType & vref_type ;
+      typedef volatile const ValueType & cvref_type ;
+      // Declaration-only probes for decltype. Member probes are const because join() below calls f.join on a const F.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+      // Forward to the functor's join with a default-constructed work tag.
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( WTag() , *dst , *src ); }
+    };
+
+  template< class F >
+  struct has_join_function< F , HAS_TAG_IS_ARRAY > // work-tagged join on an array value (passed as pointers)
+    {
+      typedef volatile       ValueType * vref_type ;
+      typedef volatile const ValueType * cvref_type ;
+      // Declaration-only probes for decltype. Member probes are const because join() below calls f.join on a const F.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , vref_type , cvref_type ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
+      // Forward to the functor's join with a default-constructed work tag.
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+        { f.join( WTag() , dst , src ); }
+    };
+
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceJoin
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const & f
+               , ValueType volatile * dst
+               , ValueType volatile const * src )
+       {
+         const int n = FunctorAnalysis::value_count( f );
+         for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
+       }
+    };
+
+  template< class F >
+  struct DeduceJoin< F , DISABLE , void >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void join( F const &
+               , ValueType volatile *
+               , ValueType volatile const * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceJoin< F , I ,
+    decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
+    : public has_join_function<F,I> {};
+
+  //----------------------------------------
+
+  template< class , INTERFACE >
+  struct has_init_function ;
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_NOT_ARRAY > // untagged init of a single (non-array) value
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) const );
+      // Probes are declaration-only (used in decltype): const member above, because init() calls it on a const F; static below.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+      // Forward to the functor's init, passing the pointed-to value by reference.
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , NO_TAG_IS_ARRAY > // untagged init of an array value (passed as a pointer)
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) const );
+      // Probes are declaration-only (used in decltype): const member above, because init() calls it on a const F; static below.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+      // Forward to the functor's init, passing the pointer through unchanged.
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_NOT_ARRAY > // work-tagged init of a single (non-array) value
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) const );
+      // Member probes are const because init() below calls f.init on a const F; all probes are declaration-only.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+      // Forward to the functor's init with a default-constructed work tag.
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( WTag(), *dst ); }
+    };
+
+  template< class F >
+  struct has_init_function< F , HAS_TAG_IS_ARRAY > // work-tagged init of an array value (passed as a pointer)
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) const );
+      // Member probes are const because init() below calls f.init on a const F; all probes are declaration-only.
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+      // Forward to the functor's init with a default-constructed work tag.
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & f , ValueType * dst )
+        { f.init( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceInit
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
+    };
+
+  template< class F >
+  struct DeduceInit< F , DISABLE , void >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void init( F const & , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceInit< F , I ,
+    decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
+    : public has_init_function<F,I> {};
+
+  //----------------------------------------
+
+public:
+
+  struct Reducer
+  {
+  private:
+
+    Functor     const & m_functor ;
+    ValueType * const   m_result ;
+    int         const   m_length ;
+
+  public:
+
+    using reducer        = Reducer ;
+    using value_type     = FunctorAnalysis::value_type ;
+    using memory_space   = void ;
+    using reference_type = FunctorAnalysis::reference_type ;
+
+    KOKKOS_INLINE_FUNCTION
+    void join( ValueType volatile * dst
+             , ValueType volatile const * src ) const noexcept
+      { DeduceJoin<>::join( m_functor , dst , src ); }
+
+    KOKKOS_INLINE_FUNCTION
+    void init( ValueType * dst ) const noexcept
+      { DeduceInit<>::init( m_functor , dst ); }
+
+    KOKKOS_INLINE_FUNCTION explicit
+    constexpr Reducer( Functor const & arg_functor
+                     , ValueType     * arg_value = 0
+                     , int             arg_length = 0 ) noexcept
+      : m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
+
+    KOKKOS_INLINE_FUNCTION
+    constexpr int length() const noexcept { return m_length ; }
+
+    KOKKOS_INLINE_FUNCTION
+    ValueType & operator[]( int i ) const noexcept
+      { return m_result[i]; }
+
+  private:
+
+    template< bool IsArray >
+    constexpr
+    typename std::enable_if< IsArray , ValueType * >::type
+    ref() const noexcept { return m_result ; }
+
+    template< bool IsArray >
+    constexpr
+    typename std::enable_if< ! IsArray , ValueType & >::type
+    ref() const noexcept { return *m_result ; }
+
+  public:
+
+    KOKKOS_INLINE_FUNCTION
+    auto result() const noexcept
+      -> decltype( Reducer::template ref< candidate_is_array >() )
+      { return Reducer::template ref< candidate_is_array >(); }
+ };
+
+  //----------------------------------------
+
+private:
+
+  template< class , INTERFACE >
+  struct has_final_function ;
+
+  // No tag, not array. Probes are declaration-only; the member probe is const because final() calls it on a const F.
+  template< class F >
+  struct has_final_function< F , NO_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( *dst ); }
+    };
+
+  // No tag, is array. Probes are declaration-only; the member probe is const because final() calls it on a const F.
+  template< class F >
+  struct has_final_function< F , NO_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( dst ); }
+    };
+
+  // Has tag, not array. Member probes are const; final() passes a default-constructed work tag.
+  template< class F >
+  struct has_final_function< F , HAS_TAG_NOT_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType & ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType & ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( WTag(), *dst ); }
+    };
+
+  // Has tag, is array. Member probes are const; final() passes a default-constructed work tag.
+  template< class F >
+  struct has_final_function< F , HAS_TAG_IS_ARRAY >
+    {
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (F::*)( WTag const & , ValueType * ) const );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void enable_if( void (*)( WTag const & , ValueType * ) );
+
+      KOKKOS_INLINE_FUNCTION static
+      void final( F const & f , ValueType * dst )
+        { f.final( WTag(), dst ); }
+    };
+
+  template< class F   = Functor
+          , INTERFACE = DEDUCED
+          , typename  = void >
+  struct DeduceFinal // fallback when no probe accepts F::final: finalization does nothing
+    {
+      KOKKOS_INLINE_FUNCTION
+      static void final( F const & , ValueType * ) {}
+    };
+
+  template< class F , INTERFACE I >
+  struct DeduceFinal< F , I ,
+    decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
+    : public has_final_function<F,I> {}; // chosen when a probe accepts F::final; the base supplies final()
+
+public:
+
+  static void final( Functor const & f , ValueType * result )
+    { DeduceFinal<>::final( f , result ); }
+
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index 96d30d0c4acac8af49f6b2c25ef2bb1c04508a28..eb1f5ce96c28fa05d70dd2bf840133688d82b247 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -62,7 +62,7 @@
 #include <memkind.h>
 #endif
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 
@@ -198,7 +198,7 @@ void * HBWSpace::allocate( const size_t arg_alloc_size ) const
     case STD_MALLOC: msg << "STD_MALLOC" ; break ;
     }
     msg << " ]( " << arg_alloc_size << " ) FAILED" ;
-    if ( ptr == NULL ) { msg << " NULL" ; } 
+    if ( ptr == NULL ) { msg << " NULL" ; }
     else { msg << " NOT ALIGNED " << ptr ; }
 
     std::cerr << msg.str() << std::endl ;
@@ -218,7 +218,7 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
     if ( m_alloc_mech == STD_MALLOC ) {
       void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
       memkind_free(MEMKIND_TYPE, alloc_ptr );
-    }    
+    }
 
   }
 }
@@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
       )
   , m_space( arg_space )
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
   }
@@ -297,7 +297,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
 
 void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
 allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
-                , const std::string & arg_alloc_label 
+                , const std::string & arg_alloc_label
                 , const size_t arg_alloc_size )
 {
   if ( ! arg_alloc_size ) return (void *) 0 ;
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
index 3cd603728e52f1b851219a01f91eb0d5358e4c86..67be86c9a3ed8595a35915f06a4b8e4ea5ded0b3 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,14 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <algorithm>
 #include <Kokkos_Macros.hpp>
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_Interface.hpp>
 #endif
 /*--------------------------------------------------------------------------*/
@@ -292,7 +292,7 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
     case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
     }
     msg << " ]( " << arg_alloc_size << " ) FAILED" ;
-    if ( ptr == NULL ) { msg << " NULL" ; } 
+    if ( ptr == NULL ) { msg << " NULL" ; }
     else { msg << " NOT ALIGNED " << ptr ; }
 
     std::cerr << msg.str() << std::endl ;
@@ -312,7 +312,7 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
     if ( m_alloc_mech == STD_MALLOC ) {
       void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
       free( alloc_ptr );
-    }    
+    }
 
 #if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
     else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
@@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
 SharedAllocationRecord< Kokkos::HostSpace , void >::
 ~SharedAllocationRecord()
 {
-  #if (KOKKOS_ENABLE_PROFILING)
+  #if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::deallocateData(
       Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
       )
   , m_space( arg_space )
 {
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
   if(Kokkos::Profiling::profileLibraryLoaded()) {
     Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
    }
@@ -406,7 +406,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
 
 void * SharedAllocationRecord< Kokkos::HostSpace , void >::
 allocate_tracked( const Kokkos::HostSpace & arg_space
-                , const std::string & arg_alloc_label 
+                , const std::string & arg_alloc_label
                 , const size_t arg_alloc_size )
 {
   if ( ! arg_alloc_size ) return (void *) 0 ;
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac200209c72bca381f60b9564944bc444748f0fb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
@@ -0,0 +1,463 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <limits>
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_HostThreadTeam.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void HostThreadTeamData::organize_pool
+  ( HostThreadTeamData * members[] , const int size )
+{
+  bool ok = true ;
+
+  // Verify not already a member of a pool:
+  for ( int rank = 0 ; rank < size && ok ; ++rank ) {
+    ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
+  }
+
+  if ( ok ) {
+
+    int64_t * const root_scratch = members[0]->m_scratch ;
+
+    for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
+      root_scratch[i] = 0 ;
+    }
+
+    {
+      HostThreadTeamData ** const pool =
+        (HostThreadTeamData **) (root_scratch + m_pool_members);
+
+      // team size == 1, league size == pool_size
+
+      for ( int rank = 0 ; rank < size ; ++rank ) {
+        HostThreadTeamData * const mem = members[ rank ] ;
+        mem->m_pool_scratch = root_scratch ;
+        mem->m_team_scratch = mem->m_scratch ;
+        mem->m_pool_rank    = rank ;
+        mem->m_pool_size    = size ;
+        mem->m_team_base    = rank ;
+        mem->m_team_rank    = 0 ;
+        mem->m_team_size    = 1 ;
+        mem->m_team_alloc   = 1 ;
+        mem->m_league_rank  = rank ;
+        mem->m_league_size  = size ;
+        mem->m_pool_rendezvous_step = 0 ;
+        mem->m_team_rendezvous_step = 0 ;
+        pool[ rank ] = mem ;
+      }
+    }
+
+    Kokkos::memory_fence();
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
+  }
+}
+
+void HostThreadTeamData::disband_pool()
+{
+   m_work_range.first  = -1 ;
+   m_work_range.second = -1 ;
+   m_pool_scratch = 0 ;
+   m_team_scratch = 0 ;
+   m_pool_rank    = 0 ;
+   m_pool_size    = 1 ;
+   m_team_base    = 0 ;
+   m_team_rank    = 0 ;
+   m_team_size    = 1 ;
+   m_team_alloc   = 1 ;
+   m_league_rank  = 0 ;
+   m_league_size  = 1 ;
+   m_pool_rendezvous_step = 0 ;
+   m_team_rendezvous_step = 0 ;
+}
+
+int HostThreadTeamData::organize_team( const int team_size )
+{
+  // Pool is initialized
+  const bool ok_pool = 0 != m_pool_scratch ;
+
+  // Team is not set
+  const bool ok_team =
+    m_team_scratch == m_scratch &&
+    m_team_base    == m_pool_rank &&
+    m_team_rank    == 0 &&
+    m_team_size    == 1 &&
+    m_team_alloc   == 1 &&
+    m_league_rank  == m_pool_rank &&
+    m_league_size  == m_pool_size ;
+
+  if ( ok_pool && ok_team ) {
+    // No team can be formed for a non-positive size or a size exceeding the
+    // pool; the latter would make league_size == 0 below and divide by zero.
+    if ( team_size <= 0 || m_pool_size < team_size ) return 0 ;
+    if ( team_size == 1 ) return 1 ; // Already organized in teams of one
+
+    HostThreadTeamData * const * const pool =
+      (HostThreadTeamData **) (m_pool_scratch + m_pool_members);
+
+    // "league_size" in this context is the number of concurrent teams
+    // that the pool can accommodate.  Excess threads are idle.
+    const int league_size     = m_pool_size / team_size ;
+    const int team_alloc_size = m_pool_size / league_size ;
+    const int team_alloc_rank = m_pool_rank % team_alloc_size ;
+    const int league_rank     = m_pool_rank / team_alloc_size ;
+    const int team_base_rank  = league_rank * team_alloc_size ;
+
+    m_team_scratch = pool[ team_base_rank ]->m_scratch ;
+    m_team_base    = team_base_rank ;
+    // This needs to check overflow, if m_pool_size % team_alloc_size !=0
+    // there are two corner cases:
+    // (i) if team_alloc_size == team_size there might be a non-full
+    //     zombie team around (for example m_pool_size = 5 and team_size = 2)
+    // (ii) if team_alloc > team_size then the last team might have less
+    //      threads than the others
+    m_team_rank    = ( team_base_rank + team_size <= m_pool_size ) &&
+                     ( team_alloc_rank < team_size ) ?
+                     team_alloc_rank : -1;
+    m_team_size    = team_size ;
+    m_team_alloc   = team_alloc_size ;
+    m_league_rank  = league_rank ;
+    m_league_size  = league_size ;
+    m_team_rendezvous_step = 0 ;
+
+    if ( team_base_rank == m_pool_rank ) {
+      // Initialize team's rendezvous memory
+      for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
+        m_scratch[i] = 0 ;
+      }
+      // Make sure team's rendezvous memory initialized
+      // is written before proceeding.
+      Kokkos::memory_fence();
+    }
+
+    // Organizing threads into a team performs a barrier across the
+    // entire pool to insure proper initialization of the team
+    // rendezvous mechanism before a team rendezvous can be performed.
+
+    if ( pool_rendezvous() ) {
+      pool_rendezvous_release();
+    }
+  }
+  else {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
+  }
+  // Nonzero only when this thread was assigned a rank ( m_team_rank != -1 ).
+  return 0 <= m_team_rank ;
+}
+
+void HostThreadTeamData::disband_team()
+{
+  m_team_scratch = m_scratch ;
+  m_team_base    = m_pool_rank ;
+  m_team_rank    = 0 ;
+  m_team_size    = 1 ;
+  m_team_alloc   = 1 ;
+  m_league_rank  = m_pool_rank ;
+  m_league_size  = m_pool_size ;
+  m_team_rendezvous_step = 0 ;
+}
+
+//----------------------------------------------------------------------------
+/* pattern for rendezvous
+ *
+ *  if ( rendezvous() ) {
+ *     ... all other threads are still in team_rendezvous() ...
+ *     rendezvous_release();
+ *     ... all other threads are released from team_rendezvous() ...
+ *  }
+ */
+
+int HostThreadTeamData::rendezvous( int64_t * const buffer
+                                  , int & rendezvous_step
+                                  , int const size
+                                  , int const rank ) noexcept
+{
+  enum : int { shift_byte = 3 };
+  enum : int { size_byte  = ( 01 << shift_byte ) }; // == 8
+  enum : int { mask_byte  = size_byte - 1 };
+
+  enum : int { shift_mem_cycle = 2 };
+  enum : int { size_mem_cycle  = ( 01 << shift_mem_cycle ) }; // == 4
+  enum : int { mask_mem_cycle  = size_mem_cycle - 1 };
+
+  // Cycle step values: 1 <= step <= size_val_cycle
+  // An odd multiple of memory cycle so that when a memory location
+  // is reused it has a different value.
+  // Must be representable within a single byte: size_val_cycle < 16
+
+  enum : int { size_val_cycle = 3 * size_mem_cycle };
+
+  // Requires:
+  //   Called by rank = [ 0 .. size )
+  //   buffer aligned to int64_t[4]
+
+  // A sequence of rendezvous uses four cycled locations in memory
+  // and non-equal cycled synchronization values to
+  // 1) prevent rendezvous from overtaking one another and
+  // 2) give each spin wait location an int64_t[4] span
+  //    so that it has its own cache line.
+
+  const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
+
+  rendezvous_step = step ;
+
+  // The leading int64_t[4] span is for thread 0 to write
+  // and all other threads to read spin-wait.
+  // sync_offset is the index into this array for this step.
+
+  const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
+
+  union {
+    int64_t full ;
+    int8_t  byte[8] ;
+  } value ;
+
+  if ( rank ) {
+
+    const int group_begin = rank << shift_byte ; // == rank * size_byte
+
+    if ( group_begin < size ) {
+
+      //  This thread waits for threads
+      //   [ group_begin .. group_begin + 8 )
+      //   [ rank*8      .. rank*8 + 8      )
+      // to write to their designated bytes.
+
+      const int end = group_begin + size_byte < size
+                    ? size_byte : size - group_begin ;
+
+      value.full = 0 ;
+      for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
+
+      store_fence(); // This should not be needed but fixes #742
+
+      spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
+                          , value.full );
+    }
+
+    {
+      // This thread sets its designated byte.
+      //   ( rank % size_byte ) +
+      //   ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
+      //   ( sync_offset * size_byte )
+      const int offset = ( rank & mask_byte )
+                       + ( ( rank & ~mask_byte ) << shift_mem_cycle )
+                       + ( sync_offset << shift_byte );
+
+      // All of this thread's previous memory stores must be complete before
+      // this thread stores the step value at this thread's designated byte
+      // in the shared synchronization array.
+
+      Kokkos::memory_fence();
+
+      ((volatile int8_t*) buffer)[ offset ] = int8_t( step );
+
+      // Memory fence to push the previous store out
+      Kokkos::memory_fence();
+    }
+
+    // Wait for thread 0 to release all other threads
+
+    spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
+
+  }
+  else {
+    // Thread 0 waits for threads [1..7]
+    // to write to their designated bytes.
+
+    const int end = size_byte < size ? 8 : size ;
+
+    value.full = 0 ;
+    for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
+
+    spinwait_until_equal( buffer[ sync_offset ], value.full );
+  }
+
+  return rank ? 0 : 1 ;
+}
+
+void HostThreadTeamData::
+  rendezvous_release( int64_t * const buffer
+                    , int const rendezvous_step ) noexcept
+{
+  enum : int { shift_mem_cycle = 2 };
+  enum : int { size_mem_cycle  = ( 01 << shift_mem_cycle ) }; // == 4
+  enum : int { mask_mem_cycle  = size_mem_cycle - 1 };
+
+  // Requires:
+  //   Called after team_rendezvous
+  //   Called only by true == team_rendezvous(root)
+
+  // Memory fence to be sure all previous writes are complete:
+  Kokkos::memory_fence();
+
+  ((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
+     int64_t( rendezvous_step );
+
+  // Memory fence to push the store out
+  Kokkos::memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+int HostThreadTeamData::get_work_stealing() noexcept
+{
+  pair_int_t w( -1 , -1 );
+
+  if ( 1 == m_team_size || team_rendezvous() ) {
+
+    // Attempt first from beginning of my work range
+    for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
+
+      // Query and attempt to update m_work_range
+      //   from: [ w.first     , w.second )
+      //   to:   [ w.first + 1 , w.second ) = w_new
+      //
+      // If w is invalid then is just a query.
+
+      const pair_int_t w_new( w.first + 1 , w.second );
+
+      w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
+
+      if ( w.first < w.second ) {
+        // m_work_range is viable
+
+        // If steal is successful then don't repeat attempt to steal
+        attempt = ! ( w_new.first  == w.first + 1 &&
+                      w_new.second == w.second );
+      }
+      else {
+        // m_work_range is not viable
+        w.first  = -1 ;
+        w.second = -1 ;
+
+        attempt = 0 ;
+      }
+    }
+
+    if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
+
+      HostThreadTeamData * const * const pool =
+        (HostThreadTeamData**)( m_pool_scratch + m_pool_members );
+
+      // Attempt from beginning failed, try to steal from end of neighbor
+
+      pair_int_t volatile * steal_range =
+        & ( pool[ m_steal_rank ]->m_work_range );
+
+      for ( int attempt = true ; attempt ; ) {
+
+        // Query and attempt to update steal_work_range
+        //   from: [ w.first , w.second )
+        //   to:   [ w.first , w.second - 1 ) = w_new
+        //
+        // If w is invalid then is just a query.
+
+        const pair_int_t w_new( w.first , w.second - 1 );
+
+        w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
+
+        if ( w.first < w.second ) {
+          // steal_work_range is viable
+
+          // If steal is successful then don't repeat attempt to steal
+          attempt = ! ( w_new.first  == w.first &&
+                        w_new.second == w.second - 1 );
+        }
+        else {
+          // steal_work_range is not viable, move to next member
+          w.first  = -1 ;
+          w.second = -1 ;
+
+          // We need to figure out whether the next team is active.
+          // m_steal_rank + m_team_alloc could be the next base_rank to steal from,
+          // but only if another m_team_size threads are available so that this
+          // base rank has a full team; otherwise wrap around to rank 0.
+          m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
+                         m_steal_rank + m_team_alloc : 0;
+
+          steal_range = & ( pool[ m_steal_rank ]->m_work_range );
+
+          // If tried all other members then don't repeat attempt to steal
+          attempt = m_steal_rank != m_pool_rank ;
+        }
+      }
+
+      if ( w.first != -1 ) w.first = w.second - 1 ;
+    }
+
+    if ( 1 < m_team_size ) {
+      // Must share the work index
+      *((int volatile *) team_reduce()) = w.first ;
+
+      team_rendezvous_release();
+    }
+  }
+  else if ( 1 < m_team_size ) {
+    w.first = *((int volatile *) team_reduce());
+  }
+
+  // May exit because successfully stole work and w is good.
+  // May exit because no work left to steal and w = (-1,-1).
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
+       , m_pool_rank , m_pool_size , w.first );
+fflush(stdout);
+#endif
+
+  return w.first ;
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..6b5918eaefc2ee74e951b8caabdeb0d4e8c488c0
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
@@ -0,0 +1,1090 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+#define KOKKOS_IMPL_HOSTTHREADTEAM_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Reducer.hpp>
+#include <impl/Kokkos_FunctorAnalysis.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class HostExecSpace >
+class HostThreadTeamMember ;
+
+class HostThreadTeamData {
+public:
+
+  template< class > friend class HostThreadTeamMember ;
+
+  // Assume upper bounds on number of threads:
+  //   pool size       <= 1024 threads
+  //   pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 516
+  //   team size       <= 64 threads
+  //   team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36
+
+  enum : int { max_pool_members  = 1024 };
+  enum : int { max_team_members  = 64 };
+  enum : int { max_pool_rendezvous  = ( max_pool_members / 8 ) * 4 + 4 };
+  enum : int { max_team_rendezvous  = ( max_team_members / 8 ) * 4 + 4 };
+
+private:
+
+  // per-thread scratch memory buffer chunks:
+  //
+  //   [ pool_members ]     = [ m_pool_members    .. m_pool_rendezvous )
+  //   [ pool_rendezvous ]  = [ m_pool_rendezvous .. m_team_rendezvous )
+  //   [ team_rendezvous ]  = [ m_team_rendezvous .. m_pool_reduce )
+  //   [ pool_reduce ]      = [ m_pool_reduce     .. m_team_reduce )
+  //   [ team_reduce ]      = [ m_team_reduce     .. m_team_shared )
+  //   [ team_shared ]      = [ m_team_shared     .. m_thread_local )
+  //   [ thread_local ]     = [ m_thread_local    .. m_scratch_size )
+
+  enum : int { m_pool_members    = 0 };
+  enum : int { m_pool_rendezvous = m_pool_members    + max_pool_members };
+  enum : int { m_team_rendezvous = m_pool_rendezvous + max_pool_rendezvous };
+  enum : int { m_pool_reduce     = m_team_rendezvous + max_team_rendezvous };
+
+  using pair_int_t = Kokkos::pair<int,int> ;
+
+  pair_int_t  m_work_range ;
+  int64_t     m_work_end ;
+  int64_t   * m_scratch ;       // per-thread buffer
+  int64_t   * m_pool_scratch ;  // == pool[0]->m_scratch
+  int64_t   * m_team_scratch ;  // == pool[ 0 + m_team_base ]->m_scratch
+  int         m_pool_rank ;
+  int         m_pool_size ;
+  int         m_team_reduce ;
+  int         m_team_shared ;
+  int         m_thread_local ;
+  int         m_scratch_size ;
+  int         m_team_base ;
+  int         m_team_rank ;
+  int         m_team_size ;
+  int         m_team_alloc ;
+  int         m_league_rank ;
+  int         m_league_size ;
+  int         m_work_chunk ;
+  int         m_steal_rank ; // work stealing rank
+  int mutable m_pool_rendezvous_step ;
+  int mutable m_team_rendezvous_step ;
+
+  HostThreadTeamData * team_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[m_team_base+r]; }
+
+  // Rendezvous pattern:
+  //   if ( rendezvous(root) ) {
+  //     ... only root thread here while all others wait ...
+  //     rendezvous_release();
+  //   }
+  //   else {
+  //     ... all other threads release here ...
+  //   }
+  //
+  // Requires: buffer[ ( max_threads / 8 ) * 4 + 4 ]; 0 == max_threads % 8
+  //
+  static
+  int rendezvous( int64_t * const buffer
+                , int & rendezvous_step
+                , int const size
+                , int const rank ) noexcept ;
+
+  static
+  void rendezvous_release( int64_t * const buffer
+                         , int const rendezvous_step ) noexcept ;
+
+public:
+
+  inline
+  int team_rendezvous( int const root ) const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , ( m_team_rank + m_team_size - root ) % m_team_size );
+    }
+
+  inline
+  int team_rendezvous() const noexcept
+    {
+      return 1 == m_team_size ? 1 :
+             rendezvous( m_team_scratch + m_team_rendezvous
+                       , m_team_rendezvous_step
+                       , m_team_size
+                       , m_team_rank );
+    }
+
+  inline
+  void team_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_team_size ) {
+        rendezvous_release( m_team_scratch + m_team_rendezvous
+                          , m_team_rendezvous_step );
+      }
+    }
+
+  inline
+  int pool_rendezvous() const noexcept
+    {
+      return 1 == m_pool_size ? 1 :
+             rendezvous( m_pool_scratch + m_pool_rendezvous
+                       , m_pool_rendezvous_step
+                       , m_pool_size
+                       , m_pool_rank );
+    }
+
+  inline
+  void pool_rendezvous_release() const noexcept
+    {
+      if ( 1 < m_pool_size ) {
+        rendezvous_release( m_pool_scratch + m_pool_rendezvous
+                          , m_pool_rendezvous_step );
+      }
+    }
+
+  //----------------------------------------
+
+  constexpr HostThreadTeamData() noexcept
+    : m_work_range(-1,-1)
+    , m_work_end(0)
+    , m_scratch(0)
+    , m_pool_scratch(0)
+    , m_team_scratch(0)
+    , m_pool_rank(0)
+    , m_pool_size(1)
+    , m_team_reduce(0)
+    , m_team_shared(0)
+    , m_thread_local(0)
+    , m_scratch_size(0)
+    , m_team_base(0)
+    , m_team_rank(0)
+    , m_team_size(1)
+    , m_team_alloc(1)
+    , m_league_rank(0)
+    , m_league_size(1)
+    , m_work_chunk(0)
+    , m_steal_rank(0)
+    , m_pool_rendezvous_step(0)
+    , m_team_rendezvous_step(0)
+    {}
+
+  //----------------------------------------
+  // Organize array of members into a pool.
+  // The 0th member is the root of the pool.
+  // Requires: members are not already in a pool.
+  // Requires: called by one thread.
+  // Pool members are ordered as "close" - sorted by NUMA and then CORE
+  // Each thread is its own team with team_size == 1.
+  static void organize_pool( HostThreadTeamData * members[]
+                           , const int size );
+
+  // Called by each thread within the pool
+  void disband_pool();
+
+  //----------------------------------------
+  // Each thread within a pool organizes itself into a team.
+  // Must be called by all threads of the pool.
+  // Organizing threads into a team performs a barrier across the
+  // entire pool to insure proper initialization of the team
+  // rendezvous mechanism before a team rendezvous can be performed.
+  //
+  // Return true  if a valid member of a team.
+  // Return false if not a member and thread should be idled.
+  int organize_team( const int team_size );
+
+  // Each thread within a pool disbands itself from current team.
+  // Each thread becomes its own team with team_size == 1.
+  // Must be called by all threads of the pool.
+  void disband_team();
+
+  //----------------------------------------
+
+  constexpr int pool_rank() const { return m_pool_rank ; }
+  constexpr int pool_size() const { return m_pool_size ; }
+
+  HostThreadTeamData * pool_member( int r ) const noexcept
+    { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[r]; }
+
+  //----------------------------------------
+
+private:
+
+  enum : int { mask_to_16 = 0x0f }; // align to 16 bytes
+  enum : int { shift_to_8 = 3 };    // size to 8 bytes
+
+public:
+
+  static constexpr int align_to_int64( int n )
+    { return ( ( n + mask_to_16 ) & ~mask_to_16 ) >> shift_to_8 ; }
+
+  constexpr int pool_reduce_bytes() const
+    { return m_scratch_size ? sizeof(int64_t) * ( m_team_reduce - m_pool_reduce ) : 0 ; }
+
+  constexpr int team_reduce_bytes() const
+    { return sizeof(int64_t) * ( m_team_shared - m_team_reduce ); }
+
+  constexpr int team_shared_bytes() const
+    { return sizeof(int64_t) * ( m_thread_local - m_team_shared ); }
+
+  constexpr int thread_local_bytes() const
+    { return sizeof(int64_t) * ( m_scratch_size - m_thread_local ); }
+
+  constexpr int scratch_bytes() const
+    { return sizeof(int64_t) * m_scratch_size ; }
+
+  // Memory chunks:
+
+  int64_t * scratch_buffer() const noexcept
+    { return m_scratch ; }
+
+  int64_t * pool_reduce() const noexcept
+    { return m_pool_scratch + m_pool_reduce ; }
+
+  int64_t * pool_reduce_local() const noexcept
+    { return m_scratch + m_pool_reduce ; }
+
+  int64_t * team_reduce() const noexcept
+    { return m_team_scratch + m_team_reduce ; }
+
+  int64_t * team_reduce_local() const noexcept
+    { return m_scratch + m_team_reduce ; }
+
+  int64_t * team_shared() const noexcept
+    { return m_team_scratch + m_team_shared ; }
+
+  int64_t * local_scratch() const noexcept
+    { return m_scratch + m_thread_local ; }
+
+  // Given:
+  //   pool_reduce_size  = number bytes for pool reduce
+  //   team_reduce_size  = number bytes for team reduce
+  //   team_shared_size  = number bytes for team shared memory
+  //   thread_local_size = number bytes for thread local memory
+  // Return:
+  //   total number of bytes that must be allocated
+  static
+  size_t scratch_size( int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int thread_local_size )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      thread_local_size = align_to_int64( thread_local_size );
+
+      const size_t total_bytes = (
+        m_pool_reduce +
+        pool_reduce_size +
+        team_reduce_size +
+        team_shared_size +
+        thread_local_size ) * sizeof(int64_t);
+
+      return total_bytes ;
+    }
+
+  // Given:
+  //   alloc_ptr         = pointer to allocated memory
+  //   alloc_size        = number of bytes of allocated memory
+  //   pool_reduce_size  = number of bytes for pool reduce/scan operations
+  //   team_reduce_size  = number of bytes for team reduce/scan operations
+  //   team_shared_size  = number of bytes for team-shared memory
+  //   thread_local_size = number of bytes for thread-local memory (unused here)
+  // Effect:
+  //   stores alloc_ptr and the start offset of each region; returns nothing
+  void scratch_assign( void * const alloc_ptr
+                     , size_t const alloc_size
+                     , int pool_reduce_size
+                     , int team_reduce_size
+                     , int team_shared_size
+                     , int /* thread_local_size */ )
+    {
+      pool_reduce_size  = align_to_int64( pool_reduce_size );
+      team_reduce_size  = align_to_int64( team_reduce_size );
+      team_shared_size  = align_to_int64( team_shared_size );
+      // thread_local_size is not needed: only the region's start offset is stored.
+
+      m_scratch      = (int64_t *) alloc_ptr ;
+      m_team_reduce  = m_pool_reduce + pool_reduce_size ;  // each region starts where the previous one ends
+      m_team_shared  = m_team_reduce + team_reduce_size ;
+      m_thread_local = m_team_shared + team_shared_size ;
+      m_scratch_size = align_to_int64( alloc_size );
+
+#if 0
+fprintf(stdout,"HostThreadTeamData::scratch_assign { %d %d %d %d %d %d %d }\n"
+       , int(m_pool_members)
+       , int(m_pool_rendezvous)
+       , int(m_pool_reduce)
+       , int(m_team_reduce)
+       , int(m_team_shared)
+       , int(m_thread_local)
+       , int(m_scratch_size)
+       );
+fflush(stdout);
+#endif
+
+    }
+
+  //----------------------------------------
+  // Get a work index within the range.
+  // First try to steal from the beginning of own team's partition.
+  // If that fails then try to steal from the end of another team's partition.
+  int get_work_stealing() noexcept ;  // get_work_stealing_chunk() treats a negative result as "no work"
+
+  //----------------------------------------
+  // Set the initial work partitioning of [ 0 .. length ) among the teams
+  // with granularity of chunk
+
+  void set_work_partition( int64_t const length
+                         , int     const chunk ) noexcept
+    {
+      // Minimum chunk size to ensure that
+      //   m_work_end < std::numeric_limits<int>::max() * m_work_chunk
+
+      int const chunk_min = ( length + std::numeric_limits<int>::max() )
+                            / std::numeric_limits<int>::max();
+
+      m_work_end   = length ;
+      m_work_chunk = std::max( chunk , chunk_min );  // never smaller than chunk_min
+
+      // Number of work chunks and partitioning of that number:
+      int const num  = ( m_work_end + m_work_chunk - 1 ) / m_work_chunk ;  // ceiling division
+      int const part = ( num + m_league_size - 1 ) / m_league_size ;      // ceiling division
+
+      m_work_range.first  = part * m_league_rank ;  // range is in units of chunks, not work items
+      m_work_range.second = m_work_range.first + part ;
+
+      // Steal from next team, round robin
+      // The next team is offset by m_team_alloc if it fits in the pool.
+
+      m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ? // otherwise fall back to 0
+                     m_team_base + m_team_alloc : 0 ;
+    }
+
+  std::pair<int64_t,int64_t> get_work_partition() noexcept  // m_work_range scaled from chunks to work items
+    {
+      return std::pair<int64_t,int64_t>
+        ( m_work_range.first * m_work_chunk
+        , m_work_range.second * m_work_chunk < m_work_end   // upper bound is clamped to m_work_end
+        ? m_work_range.second * m_work_chunk : m_work_end );
+    }
+
+  std::pair<int64_t,int64_t> get_work_stealing_chunk() noexcept  // one stolen chunk as [first,second) work items
+    {
+      std::pair<int64_t,int64_t> x(-1,-1);  // returned unchanged when no chunk index was obtained
+
+      const int i = get_work_stealing();
+
+      if ( 0 <= i ) {
+        x.first  = m_work_chunk * i ;
+        x.second = x.first + m_work_chunk < m_work_end   // last chunk is clamped to m_work_end
+                 ? x.first + m_work_chunk : m_work_end ;
+      }
+
+      return x ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class HostExecSpace >
+class HostThreadTeamMember {  // team-member handle: a HostThreadTeamData reference plus league rank/size
+public:
+
+  using scratch_memory_space = typename HostExecSpace::scratch_memory_space ;
+
+private:
+
+  scratch_memory_space m_scratch ;       // built over arg_data.team_shared()
+  HostThreadTeamData & m_data ;          // reference to the team data passed to the constructor
+  int const            m_league_rank ;
+  int const            m_league_size ;
+
+public:
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data ) noexcept  // league rank 0 in a league of size 1
+    : m_scratch( arg_data.team_shared() , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank(0)
+    , m_league_size(1)
+    {}
+
+  constexpr HostThreadTeamMember( HostThreadTeamData & arg_data
+                                , int const            arg_league_rank
+                                , int const            arg_league_size
+                                ) noexcept
+    : m_scratch( arg_data.team_shared()            // NOTE(review): both (pointer,size) pairs name the
+               , arg_data.team_shared_bytes()      // same team_shared buffer; confirm this is intended
+               , arg_data.team_shared()
+               , arg_data.team_shared_bytes() )
+    , m_data( arg_data )
+    , m_league_rank( arg_league_rank )
+    , m_league_size( arg_league_size )
+    {}
+
+  ~HostThreadTeamMember() = default ;
+  HostThreadTeamMember() = delete ;
+  HostThreadTeamMember( HostThreadTeamMember && ) = default ;
+  HostThreadTeamMember( HostThreadTeamMember const & ) = default ;
+  HostThreadTeamMember & operator = ( HostThreadTeamMember && ) = default ;       // defined as deleted: the class has
+  HostThreadTeamMember & operator = ( HostThreadTeamMember const & ) = default ;  // a reference member and const members
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  int team_rank() const noexcept { return m_data.m_team_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int team_size() const noexcept { return m_data.m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_rank() const noexcept { return m_league_rank ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int league_size() const noexcept { return m_league_size ; }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_scratch(int) const  // the int argument is ignored; same call as team_shmem()
+    { return m_scratch.set_team_thread_mode(0,1,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & thread_scratch(int) const  // the int argument is ignored; mode uses team size and rank
+    { return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }
+
+  //----------------------------------------
+  // Team collectives
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept  // empty body unless KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( m_data.team_rendezvous() ) m_data.team_rendezvous_release();
+    }
+#else
+    {}
+#endif
+
+  template< class Closure >
+  KOKKOS_INLINE_FUNCTION  // barrier that runs f() once, before the other threads are released
+  void team_barrier( Closure const & f ) const noexcept
+    {
+      if ( m_data.team_rendezvous() ) {
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+
+        f();
+
+        m_data.team_rendezvous_release();
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION  // passes 'value' through the team_reduce() buffer; does nothing for a team of one
+  void team_broadcast( T & value , const int source_team_rank ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+        T volatile * const shared_value = (T*) m_data.team_reduce();
+
+        // Don't overwrite shared memory until all threads arrive
+
+        if ( m_data.team_rendezvous( source_team_rank ) ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+
+          *shared_value = value ;
+
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          value = *shared_value ;
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_broadcast\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< class Closure , typename T >
+  KOKKOS_INLINE_FUNCTION  // as above, but f( value ) is applied once before the value is published
+  void team_broadcast( Closure const & f , T & value , const int source_team_rank) const noexcept
+    {
+      T volatile * const shared_value = (T*) m_data.team_reduce();
+
+      // Don't overwrite shared memory until all threads arrive
+
+      if ( m_data.team_rendezvous(source_team_rank) ) {
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+
+        f( value );
+
+        if ( 1 < m_data.m_team_size ) { *shared_value = value ; }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+      else {
+        value = *shared_value ;
+      }
+    }
+
+  //--------------------------------------------------------------------------
+  // team_reduce( Sum(result) );
+  // team_reduce( Min(result) );
+  // team_reduce( Max(result) );
+
+  template< typename ReducerType >
+  KOKKOS_INLINE_FUNCTION  // leaves the joined result in every member's reducer.data(); no-op for a team of one
+  typename std::enable_if< is_reducer< ReducerType >::value >::type
+  team_reduce( ReducerType const & reducer ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 1 < m_data.m_team_size ) {
+
+        using value_type = typename ReducerType::value_type ;
+
+        if ( 0 != m_data.m_team_rank ) {
+          // Non-root copies to their local buffer:
+          reducer.copy( (value_type*) m_data.team_reduce_local()
+                      , reducer.data() );
+        }
+
+        // Root does not overwrite shared memory until all threads arrive
+        // and copy to their local buffer.
+
+        if ( m_data.team_rendezvous() ) {
+          // All threads have entered 'team_rendezvous'
+          // only this thread returned from 'team_rendezvous'
+          // with a return value of 'true'
+          //
+          // This thread joins the contributed values
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            value_type * const src =
+              (value_type*) m_data.team_member(i)->team_reduce_local();
+
+            reducer.join( reducer.data() , src );
+          }
+
+          // Copy result to root member's buffer:
+          reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
+
+          m_data.team_rendezvous_release();
+          // This thread released all other threads from 'team_rendezvous'
+          // with a return value of 'false'
+        }
+        else {
+          // Copy from root member's buffer:
+          reducer.copy( reducer.data() , (value_type*) m_data.team_reduce() );
+        }
+      }
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); }
+#endif
+
+  //--------------------------------------------------------------------------
+
+  template< typename ValueType , class JoinOp >
+  KOKKOS_INLINE_FUNCTION  // joins every member's 'value' with 'join'; returns what is read from team_reduce()
+  ValueType
+  team_reduce( ValueType const & value
+             , JoinOp    const & join ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        *((ValueType*) m_data.team_reduce_local()) = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        const Impl::Reducer< ValueType , JoinOp > reducer( join );
+
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread joins the contributed values
+
+        ValueType * const dst = (ValueType*) m_data.team_reduce_local();  // NOTE(review): result is later read via team_reduce(); assumes both name the same storage here
+
+        *dst = value ;
+
+        for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+          ValueType * const src =
+            (ValueType*) m_data.team_member(i)->team_reduce_local();
+
+          reducer.join( dst , src );
+        }
+
+        m_data.team_rendezvous_release();
+        // This thread released all other threads from 'team_rendezvous'
+        // with a return value of 'false'
+      }
+
+      return *((ValueType*) m_data.team_reduce());
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
+#endif
+
+
+  template< typename T >
+  KOKKOS_INLINE_FUNCTION  // per-member buffer holds two T slots: [0] = exclusive prefix, [1] = contributed value
+  T team_scan( T const & value , T * const global = 0 ) const noexcept
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {
+      if ( 0 != m_data.m_team_rank ) {
+        // Non-root copies to their local buffer:
+        ((T*) m_data.team_reduce_local())[1] = value ;
+      }
+
+      // Root does not overwrite shared memory until all threads arrive
+      // and copy to their local buffer.
+
+      if ( m_data.team_rendezvous() ) {
+        // All threads have entered 'team_rendezvous'
+        // only this thread returned from 'team_rendezvous'
+        // with a return value of 'true'
+        //
+        // This thread scans contributed values
+
+        {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          prev[0] = 0 ;
+          prev[1] = value ;
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * const ptr = (T*) m_data.team_member(i)->team_reduce_local();
+
+            ptr[0] = prev[0] + prev[1] ;  // prefix of member i = prefix + value of the previous member
+
+            prev = ptr ;
+          }
+        }
+
+        // If adding to global value then atomic_fetch_add to that value
+        // and sum previous value to every entry of the scan.
+        if ( global ) {
+          T * prev = (T*) m_data.team_reduce_local();
+
+          {
+            T * ptr  = (T*) m_data.team_member( m_data.m_team_size - 1 )->team_reduce_local();
+            prev[0] = Kokkos::atomic_fetch_add( global , ptr[0] + ptr[1] );  // last prefix + last value = team total
+          }
+
+          for ( int i = 1 ; i < m_data.m_team_size ; ++i ) {
+            T * ptr = (T*) m_data.team_member(i)->team_reduce_local();
+            ptr[0] += prev[0] ;
+          }
+        }
+
+        m_data.team_rendezvous_release();
+      }
+
+      return ((T*) m_data.team_reduce_local())[0];  // slot [0] of this member's own buffer
+    }
+#else
+    { Kokkos::abort("HostThreadTeamMember team_scan\n"); return T(); }
+#endif
+
+};
+
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<class Space,typename iType>
+KOKKOS_INLINE_FUNCTION  // count-only form: boundaries struct is constructed from (member, 0, count)
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType const & count )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      <iType,Impl::HostThreadTeamMember<Space> >(member,0,count);
+}
+
+template<class Space, typename iType1, typename iType2>
+KOKKOS_INLINE_FUNCTION  // begin/end form: index type is std::common_type of the two bound types
+Impl::TeamThreadRangeBoundariesStruct
+  < typename std::common_type< iType1, iType2 >::type
+  , Impl::HostThreadTeamMember<Space> >
+TeamThreadRange( Impl::HostThreadTeamMember<Space> const & member
+               , iType1 const & begin , iType2 const & end )
+{
+  return
+    Impl::TeamThreadRangeBoundariesStruct
+      < typename std::common_type< iType1, iType2 >::type
+      , Impl::HostThreadTeamMember<Space> >( member , begin , end );
+}
+
+template<class Space, typename iType>
+KOKKOS_INLINE_FUNCTION  // boundaries struct is constructed from (member, count)
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+ThreadVectorRange
+  ( Impl::HostThreadTeamMember<Space> const & member
+  , const iType & count )
+{
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >(member,count);
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Inter-thread parallel_for.
+ *
+ * Executes lambda(iType i) for each i=[0..N)
+ *
+ * The range [0..N) is mapped to all threads of the calling thread team.
+*/
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION
+void parallel_for
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  for( iType i = loop_boundaries.start        // caller walks start..end in steps of increment
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
+
+template<typename iType, class Space, class Closure>
+KOKKOS_INLINE_FUNCTION  // vector-range parallel_for: a plain loop, with '#pragma ivdep' when enabled
+void parallel_for
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure (i);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure, class Reducer >
+KOKKOS_INLINE_FUNCTION  // reducer form: selected only when Kokkos::is_reducer< Reducer > holds
+typename std::enable_if< Kokkos::is_reducer< Reducer >::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Reducer  const & reducer
+  )
+{
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start        // accumulate this caller's iterations into the reducer
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );  // then combine across the team
+}
+
+template< typename iType, class Space, typename Closure, typename ValueType >
+KOKKOS_INLINE_FUNCTION  // plain-value form: selected when ValueType is not a reducer
+typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
+parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , ValueType      & result
+  )
+{
+  Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > > reducer( & result );  // 'result' is wrapped in a ReduceSum reducer
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );  // combine across the team
+}
+
+template< typename iType, class Space
+         , class Closure, class Joiner , typename ValueType >
+KOKKOS_INLINE_FUNCTION  // custom-join form: 'result' is wrapped in a reducer built from 'joiner'
+void parallel_reduce
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >
+             const & loop_boundaries
+  , Closure  const & closure
+  , Joiner   const & joiner
+  , ValueType      & result
+  )
+{
+  Impl::Reducer< ValueType , Joiner > reducer( joiner , & result );
+
+  reducer.init( reducer.data() );
+
+  for( iType i = loop_boundaries.start
+     ; i <  loop_boundaries.end
+     ; i += loop_boundaries.increment ) {
+    closure( i , reducer.reference() );
+  }
+
+  loop_boundaries.thread.team_reduce( reducer );  // combine across the team
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The range [0..N) is iterated by the calling thread alone:
+ *  result is reset to ValueType() and the lambda accumulates
+ *  into it directly; no team-level reduction is performed.
+ */
+template< typename iType, class Space , class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   ValueType& result)
+{
+  result = ValueType();
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment) {
+    lambda(i,result);
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce.
+ *
+ *  Executes lambda(iType i, ValueType & val) for each i=[0..N)
+ *
+ *  The interface takes a join operation of the form
+ *  JoinType(ValueType& val, const ValueType& update),
+ *  but this implementation runs the loop on the calling
+ *  thread and lets the lambda accumulate directly into result;
+ *  'join' itself is never invoked.
+ *  The input value of result is not reset. Therefore the input
+ *  value should be the neutral element with respect to the
+ *  join operation (e.g. '0 for +' or '1 for *').
+ */
+template< typename iType, class Space
+        , class Lambda, class JoinType , typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce
+  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
+   const Lambda & lambda,
+   const JoinType & join,
+   ValueType& result)
+{
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i =  loop_boundaries.start ;
+             i <  loop_boundaries.end ;
+             i += loop_boundaries.increment ) {
+    lambda(i,result);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename iType, class Space, class Closure >
+KOKKOS_INLINE_FUNCTION  // team-range scan: closure runs twice per index, first with final=false, then final=true
+void parallel_scan
+  ( Impl::TeamThreadRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , Closure const & closure
+  )
+{
+  // Extract ValueType from the closure
+
+  using value_type =
+    typename Kokkos::Impl::FunctorAnalysis
+      < Kokkos::Impl::FunctorPatternInterface::SCAN
+      , void
+      , Closure >::value_type ;
+
+  value_type accum = 0 ;
+
+  // Intra-member scan: accumulate this member's contribution (final == false)
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,false);
+  }
+
+  // 'accum' output is the exclusive prefix sum
+  accum = loop_boundaries.thread.team_scan(accum);
+
+  for ( iType i = loop_boundaries.start       // second pass starts from that prefix (final == true)
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,accum,true);
+  }
+}
+
+
+template< typename iType, class Space, class ClosureType >
+KOKKOS_INLINE_FUNCTION  // vector-range scan: a single pass with final=true, starting from value_type()
+void parallel_scan
+  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> > const & loop_boundaries
+  , ClosureType const & closure
+  )
+{
+  using value_type = typename
+    Kokkos::Impl::FunctorAnalysis
+      < Impl::FunctorPatternInterface::SCAN
+      , void
+      , ClosureType >::value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for ( iType i = loop_boundaries.start
+      ; i <  loop_boundaries.end
+      ; i += loop_boundaries.increment ) {
+    closure(i,scan_val,true);
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION  // tag object selecting the ThreadSingleStruct overloads of single()
+Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerTeam(const Impl::HostThreadTeamMember<Space> & member )
+{
+  return Impl::ThreadSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space >
+KOKKOS_INLINE_FUNCTION  // tag object selecting the VectorSingleStruct overloads of single()
+Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >
+PerThread(const Impl::HostThreadTeamMember<Space> & member)
+{
+  return Impl::VectorSingleStruct<Impl::HostThreadTeamMember<Space> >(member);
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION  // PerTeam single: only the member with team_rank() == 0 calls functor()
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
+{
+  if ( single.team_member.team_rank() == 0 ) functor();
+  // 'single' does not perform a barrier.
+  // single.team_member.team_barrier( functor );
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION  // PerTeam single with value: delegates to team_broadcast( functor , val , 0 )
+void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor , ValueType & val )
+{
+  single.team_member.team_broadcast( functor , val , 0 );
+}
+
+template< class Space , class FunctorType >
+KOKKOS_INLINE_FUNCTION  // PerThread single: the caller invokes functor() unconditionally
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor )
+{
+  functor();
+}
+
+template< class Space , class FunctorType , typename ValueType >
+KOKKOS_INLINE_FUNCTION  // PerThread single with value: the caller invokes functor(val) unconditionally
+void single( const Impl::VectorSingleStruct< Impl::HostThreadTeamMember<Space> > & , const FunctorType & functor , ValueType & val )
+{
+  functor(val);
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_HOSTTHREADTEAM_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
index 84cf536bb7adf86be20459f36f64f4ced027188e..7489018ac641b70e97b6eba879d4c08aa0776fb9 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -52,6 +52,10 @@ void memory_fence()
 {
 #if defined( __CUDA_ARCH__ )
   __threadfence();
+#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
+  asm volatile (
+	  "mfence" ::: "memory"
+  );
 #elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
       ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
   __sync_synchronize();
@@ -76,8 +80,8 @@ void store_fence()
 {
 #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
   asm volatile (
-	"sfence" ::: "memory"
-  	);
+	  "sfence" ::: "memory"
+  );
 #else
   memory_fence();
 #endif
@@ -93,8 +97,8 @@ void load_fence()
 {
 #if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
   asm volatile (
-	"lfence" ::: "memory"
-  	);
+	  "lfence" ::: "memory"
+  );
 #else
   memory_fence();
 #endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp b/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
index da95c943fe96acbeda0a8d44525f9f9fd2d65076..5852efb011f357ace9df66c5d330f9e2a3f39dd1 100644
--- a/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_OldMacros.hpp
@@ -129,8 +129,8 @@
 #endif
 
 #ifdef KOKKOS_HAVE_CUDA_RDC
-#ifndef KOKKOS_ENABLE_CUDA_RDC
-#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
+#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
+#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
 #endif
 #endif
 
@@ -242,9 +242,9 @@
 #endif
 #endif
 
-#ifdef KOKKOS_HAVE_QTHREAD
-#ifndef KOKKOS_ENABLE_QTHREAD
-#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
+#ifdef KOKKOS_HAVE_QTHREADS
+#ifndef KOKKOS_ENABLE_QTHREADS
+#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
 #endif
 #endif
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
index 99c5df4db31001b42f56337938f5a7ea73941157..0c006a8c008390e330f35d849f9b93facfeb1879 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -43,7 +43,7 @@
 
 #include <impl/Kokkos_Profiling_Interface.hpp>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <string.h>
 
 namespace Kokkos {
@@ -84,21 +84,21 @@ namespace Kokkos {
             (*endScanCallee)(kernelID);
         }
     }
-    
+
     void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
         if(NULL != beginReduceCallee) {
             Kokkos::fence();
             (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
         }
     }
-    
+
     void endParallelReduce(const uint64_t kernelID) {
         if(NULL != endReduceCallee) {
             Kokkos::fence();
             (*endReduceCallee)(kernelID);
         }
     }
-    
+
 
     void pushRegion(const std::string& kName) {
       if( NULL != pushRegionCallee ) {
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
index 3d6a3892524ee3234a33f14cf7727cac5512e455..139a20d8f9ea99b88d21436726fa9c55fe063622 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -50,7 +50,7 @@
 #include <string>
 #include <cinttypes>
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 #include <impl/Kokkos_Profiling_DeviceInfo.hpp>
 #include <dlfcn.h>
 #include <iostream>
@@ -59,7 +59,7 @@
 
 #define KOKKOSP_INTERFACE_VERSION 20150628
 
-#if (KOKKOS_ENABLE_PROFILING)
+#if defined(KOKKOS_ENABLE_PROFILING)
 namespace Kokkos {
   namespace Profiling {
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b3ed5f151439c659305773f1cd997376300ccf3e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
@@ -0,0 +1,317 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_REDUCER_HPP
+#define KOKKOS_IMPL_REDUCER_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+/*  Reducer abstraction:
+ *  1) Provides 'join' operation
+ *  2) Provides 'init' operation
+ *  3) Provides 'copy' operation
+ *  4) Optionally provides result value in a memory space
+ *
+ *  Created from:
+ *  1) Functor::operator()( destination , source )
+ *  2) Functor::{ join , init }
+ */
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename value_type >
+struct ReduceSum
+{
+  KOKKOS_INLINE_FUNCTION static
+  void copy( value_type & dest
+           , value_type const & src ) noexcept
+    { dest = src ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  void init( value_type & dest ) noexcept
+    { new( &dest ) value_type(); }
+
+  KOKKOS_INLINE_FUNCTION static
+  void join( value_type volatile & dest
+           , value_type const volatile & src ) noexcept
+    { dest += src ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  void join( value_type & dest
+           , value_type const & src ) noexcept
+    { dest += src ; }
+};
+
+template< typename T
+        , class ReduceOp = ReduceSum< T >
+        , typename MemorySpace = void >
+struct Reducer
+  : private ReduceOp
+  , private integral_nonzero_constant
+    < int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
+{
+private:
+
+  // Determine whether T is a simple array
+
+  enum : int { rank = std::rank<T>::value };
+
+  static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
+
+  using length_t =
+    integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
+
+public:
+
+  using reducer        = Reducer ;
+  using memory_space   = MemorySpace ;
+  using value_type     = typename std::remove_extent<T>::type ;
+  using reference_type =
+    typename std::conditional< ( rank != 0 )
+                             , value_type *
+                             , value_type &
+                             >::type ;
+private:
+
+  //--------------------------------------------------------------------------
+  // Determine what functions 'ReduceOp' provides:
+  //   copy( destination , source )
+  //   init( destination )
+  //
+  //   operator()( destination , source )
+  //   join( destination , source )
+  //
+  // Provide defaults for missing optional operations
+
+  template< class R , typename = void>
+  struct COPY {
+    KOKKOS_INLINE_FUNCTION static
+    void copy( R const &
+             , value_type * dst
+             , value_type const * src ) { *dst = *src ; }
+  };
+
+  template< class R >
+  struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
+                                          , *((value_type const *)0) ) ) >
+  {
+    KOKKOS_INLINE_FUNCTION static
+    void copy( R const & r
+             , value_type * dst
+             , value_type const * src ) { r.copy( *dst , *src ); }
+  };
+
+  template< class R , typename = void >
+  struct INIT {
+    KOKKOS_INLINE_FUNCTION static
+    void init( R const & , value_type * dst ) { new(dst) value_type(); }
+  };
+
+  template< class R >
+  struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
+  {
+    KOKKOS_INLINE_FUNCTION static
+    void init( R const & r , value_type * dst ) { r.init( *dst ); }
+  };
+
+  template< class R , typename V , typename = void > struct JOIN
+    {
+      // If no join function then try operator()
+      KOKKOS_INLINE_FUNCTION static
+      void join( R const & r , V * dst , V const * src )
+        { r.operator()(*dst,*src); }
+    };
+
+  template< class R , typename V >
+  struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
+    {
+      // If has join function use it
+      KOKKOS_INLINE_FUNCTION static
+      void join( R const & r , V * dst , V const * src )
+        { r.join(*dst,*src); }
+    };
+
+  //--------------------------------------------------------------------------
+
+  value_type * const m_result ;
+
+  template< int Rank >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr
+  typename std::enable_if< ( 0 != Rank ) , reference_type >::type
+  ref( value_type * p ) noexcept { return p ; }
+
+  template< int Rank >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr
+  typename std::enable_if< ( 0 == Rank ) , reference_type >::type
+  ref( value_type * p ) noexcept { return *p ; }
+
+public:
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr int length() const noexcept
+     { return length_t::value ; }
+
+  KOKKOS_INLINE_FUNCTION
+  value_type * data() const noexcept
+    { return m_result ; }
+
+  KOKKOS_INLINE_FUNCTION
+  reference_type reference() const noexcept
+    { return Reducer::template ref< rank >( m_result ); }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  void copy( value_type * const dest
+           , value_type const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type * dest ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type * const dest
+           , value_type const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile * const dest
+           , value_type volatile const * const src ) const noexcept
+    {
+      for ( int i = 0 ; i < length() ; ++i ) {
+        Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
+      }
+    }
+
+  //--------------------------------------------------------------------------
+
+  template< typename ArgT >
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer
+    ( ArgT * arg_value
+    , typename std::enable_if
+        < std::is_same<ArgT,value_type>::value &&
+          std::is_default_constructible< ReduceOp >::value
+        , int >::type arg_length = 1
+    ) noexcept
+    : ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
+
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer( ReduceOp const & arg_op
+                   , value_type     * arg_value = 0
+                   , int arg_length = 1 ) noexcept
+    : ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
+
+  KOKKOS_INLINE_FUNCTION explicit
+  constexpr Reducer( ReduceOp      && arg_op
+                   , value_type     * arg_value = 0
+                   , int arg_length = 1 ) noexcept
+    : ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
+
+  Reducer( Reducer const & ) = default ;
+  Reducer( Reducer && ) = default ;
+  Reducer & operator = ( Reducer const & ) = default ;
+  Reducer & operator = ( Reducer && ) = default ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< typename ValueType >
+constexpr
+Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
+Sum( ValueType & arg_value )
+{
+  static_assert( std::is_trivial<ValueType>::value
+               , "Kokkos reducer requires trivial value type" );
+  return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
+}
+
+template< typename ValueType >
+constexpr
+Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
+Sum( ValueType * arg_value , int arg_length )
+{
+  static_assert( std::is_trivial<ValueType>::value
+               , "Kokkos reducer requires trivial value type" );
+  return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
+}
+
+//----------------------------------------------------------------------------
+
+template< typename ValueType , class JoinType >
+Impl::Reducer< ValueType , JoinType >
+reducer( ValueType & value , JoinType const & lambda )
+{
+  return Impl::Reducer< ValueType , JoinType >( lambda , & value );
+}
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
index 76161c10f1a8b4ed493772a59e086362b9e2723c..79496133061145aee8786aecb21aa86117b1dbc4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -53,63 +53,126 @@
 
 namespace Kokkos {
 namespace Impl {
-namespace SerialImpl {
+namespace {
 
-Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
+HostThreadTeamData g_serial_thread_team_data ;
 
-Sentinel::~Sentinel()
-{
-  if ( m_scratch ) { free( m_scratch ); }
-  m_scratch = 0 ;
-  m_reduce_end = 0 ;
-  m_shared_end = 0 ;
 }
 
-Sentinel & Sentinel::singleton()
+// Resize thread team data scratch memory
+void serial_resize_thread_team_data( size_t pool_reduce_bytes
+                                   , size_t team_reduce_bytes
+                                   , size_t team_shared_bytes
+                                   , size_t thread_local_bytes )
 {
-  static Sentinel s ; return s ;
+  if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
+  if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
+
+  const size_t old_pool_reduce  = g_serial_thread_team_data.pool_reduce_bytes();
+  const size_t old_team_reduce  = g_serial_thread_team_data.team_reduce_bytes();
+  const size_t old_team_shared  = g_serial_thread_team_data.team_shared_bytes();
+  const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
+  const size_t old_alloc_bytes  = g_serial_thread_team_data.scratch_bytes();
+
+  // Allocate if any part of the old allocation is too small:
+
+  const bool allocate = ( old_pool_reduce  < pool_reduce_bytes ) ||
+                        ( old_team_reduce  < team_reduce_bytes ) ||
+                        ( old_team_shared  < team_shared_bytes ) ||
+                        ( old_thread_local < thread_local_bytes );
+
+  if ( allocate ) {
+
+    Kokkos::HostSpace space ;
+
+    if ( old_alloc_bytes ) {
+      g_serial_thread_team_data.disband_team();
+      g_serial_thread_team_data.disband_pool();
+
+      space.deallocate( g_serial_thread_team_data.scratch_buffer()
+                      , g_serial_thread_team_data.scratch_bytes() );
+    }
+
+    if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
+    if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
+    if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
+    if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
+
+    const size_t alloc_bytes =
+      HostThreadTeamData::scratch_size( pool_reduce_bytes
+                                      , team_reduce_bytes
+                                      , team_shared_bytes
+                                      , thread_local_bytes );
+
+    void * const ptr = space.allocate( alloc_bytes );
+
+    g_serial_thread_team_data.
+      scratch_assign( ((char *)ptr)
+                    , alloc_bytes
+                    , pool_reduce_bytes
+                    , team_reduce_bytes
+                    , team_shared_bytes
+                    , thread_local_bytes );
+
+    HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
+
+    g_serial_thread_team_data.organize_pool( pool , 1 );
+    g_serial_thread_team_data.organize_team(1);
+  }
 }
 
-inline
-unsigned align( unsigned n )
+// Get the (single, global) thread team data structure of the Serial backend
+HostThreadTeamData * serial_get_thread_team_data()
 {
-  enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
-  return ( n + MASK ) & ~MASK ;
+  return & g_serial_thread_team_data ;
 }
 
-} // namespace
+} // namespace Impl
+} // namespace Kokkos
 
-SerialTeamMember::SerialTeamMember( int arg_league_rank
-                                  , int arg_league_size
-                                  , int arg_shared_size
-                                  )
-  : m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
-           , arg_shared_size )
-  , m_league_rank( arg_league_rank )
-  , m_league_size( arg_league_size )
-{}
+/*--------------------------------------------------------------------------*/
 
-} // namespace Impl
+namespace Kokkos {
 
-void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
+int Serial::is_initialized()
 {
-  static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
+  return 1 ;
+}
 
-  reduce_size = Impl::SerialImpl::align( reduce_size );
-  shared_size = Impl::SerialImpl::align( shared_size );
+void Serial::initialize( unsigned threads_count
+                       , unsigned use_numa_count
+                       , unsigned use_cores_per_numa
+                       , bool allow_asynchronous_threadpool )
+{
+  (void) threads_count;
+  (void) use_numa_count;
+  (void) use_cores_per_numa;
+  (void) allow_asynchronous_threadpool;
+
+  // Init the array of locks used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
+}
 
-  if ( ( s.m_reduce_end < reduce_size ) ||
-       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
+void Serial::finalize()
+{
+  if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
+    Impl::g_serial_thread_team_data.disband_team();
+    Impl::g_serial_thread_team_data.disband_pool();
 
-    if ( s.m_scratch ) { free( s.m_scratch ); }
+    Kokkos::HostSpace space ;
 
-    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
-    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
+    space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
+                    , Impl::g_serial_thread_team_data.scratch_bytes() );
 
-    s.m_scratch = malloc( s.m_shared_end );
+    Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
   }
 
-  return s.m_scratch ;
+  #if defined(KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 
 } // namespace Kokkos
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
index 19f3abe71ae7049ce0c2674ee2638c07679aa5b0..d22d604fbc2f02e2f18c6c24d69840e7f33e7e98 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.cpp
@@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
   using execution_space = Kokkos::Serial ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member exec ;
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
 
   // Loop until all queues are empty
   while ( 0 < queue->m_ready_count ) {
@@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
 
     for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
       for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-        task = queue_type::pop_task( & queue->m_ready[i][j] );
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
       }
     }
 
     if ( end != task ) {
 
-      // pop_task resulted in lock == task->m_next
+      // pop_ready_task resulted in lock == task->m_next
       // In the executing state
 
       (*task->m_apply)( task , & exec );
@@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
   using execution_space = Kokkos::Serial ;
   using queue_type      = TaskQueue< execution_space > ;
   using task_root_type  = TaskBase< execution_space , void , void > ;
-  using Member          = TaskExec< execution_space > ;
+  using Member          = Impl::HostThreadTeamMember< execution_space > ;
 
   task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
 
-  Member exec ;
+  Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
+
+  Member exec( *data );
 
   // Loop until no runnable task
 
@@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
 
     for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
       for ( int j = 0 ; j < 2 && end == task ; ++j ) {
-        task = queue_type::pop_task( & queue->m_ready[i][j] );
+        task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
       }
     }
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
index 178305c5d3c97da52535324a14333e1878cea730..ac7f17c0ea9e314137560626e0b0467faf5ff90d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_Task.hpp
@@ -65,6 +65,7 @@ public:
   using memory_space    = Kokkos::HostSpace ;
   using queue_type      = Kokkos::Impl::TaskQueue< execution_space > ;
   using task_base_type  = Kokkos::Impl::TaskBase< execution_space , void , void > ;
+  using member_type     = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
 
   static
   void iff_single_thread_recursive_execute( queue_type * const );
@@ -72,237 +73,19 @@ public:
   static
   void execute( queue_type * const );
 
-  template< typename FunctorType >
+  template< typename TaskType >
   static
-  void proc_set_apply( task_base_type::function_type * ptr )
-    {
-      using TaskType = TaskBase< Kokkos::Serial
-                               , typename FunctorType::value_type
-                               , FunctorType
-                               > ;
-       *ptr = TaskType::apply ;
-    }
+  typename TaskType::function_type
+  get_function_pointer() { return TaskType::apply ; }
 };
 
 extern template class TaskQueue< Kokkos::Serial > ;
 
-//----------------------------------------------------------------------------
-
-template<>
-class TaskExec< Kokkos::Serial >
-{
-public:
-
-  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
-  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
-  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
-};
-
-template<typename iType>
-struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
-{
-  typedef iType index_type;
-  const iType start ;
-  const iType end ;
-  enum {increment = 1};
-  //const  TaskExec< Kokkos::Serial > & thread;
-  TaskExec< Kokkos::Serial > & thread;
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct
-    //( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    : start(0)
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-
-  KOKKOS_INLINE_FUNCTION
-  TeamThreadRangeBoundariesStruct
-    //( const TaskExec< Kokkos::Serial > & arg_thread
-    ( TaskExec< Kokkos::Serial > & arg_thread
-    , const iType& arg_start
-    , const iType & arg_end
-    )
-    : start( arg_start )
-    , end(   arg_end)
-    , thread( arg_thread )
-    {}
-};
-
-//----------------------------------------------------------------------------
-
-template<typename iType>
-struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
-{
-  typedef iType index_type;
-  const iType start ;
-  const iType end ;
-  enum {increment = 1};
-  TaskExec< Kokkos::Serial > & thread;
-
-  KOKKOS_INLINE_FUNCTION
-  ThreadVectorRangeBoundariesStruct
-    ( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
-    : start( 0 )
-    , end(arg_count)
-    , thread(arg_thread)
-    {}
-};
-
 }} /* namespace Kokkos::Impl */
 
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
-namespace Kokkos {
-
-// OMP version needs non-const TaskExec
-template< typename iType >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
-TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
-{
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
-}
-
-// OMP version needs non-const TaskExec
-template< typename iType1, typename iType2 >
-KOKKOS_INLINE_FUNCTION
-Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
-                                       Impl::TaskExec< Kokkos::Serial > >
-TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
-{
-  typedef typename std::common_type< iType1, iType2 >::type iType;
-  return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
-           thread, iType(start), iType(end) );
-}
-
-// OMP version needs non-const TaskExec
-template<typename iType>
-KOKKOS_INLINE_FUNCTION
-Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
-ThreadVectorRange
-  ( Impl::TaskExec< Kokkos::Serial > & thread
-  , const iType & count )
-{
-  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
-}
-
-  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
-   *
-   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
-   * This functionality requires C++11 support.*/
-template<typename iType, class Lambda>
-KOKKOS_INLINE_FUNCTION
-void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i);
-}
-
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i, result);
-
-  initialized_result = result;
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  ValueType result = initialized_result;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
-    lambda(i, result);
-
-  initialized_result = result;
-}
-
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   ValueType& initialized_result)
-{
-  initialized_result = ValueType();
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    initialized_result+=tmp;
-  }
-}
-
-template< typename iType, class Lambda, typename ValueType, class JoinType >
-KOKKOS_INLINE_FUNCTION
-void parallel_reduce
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda,
-   const JoinType & join,
-   ValueType& initialized_result)
-{
-  ValueType result = initialized_result;
-#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
-#pragma ivdep
-#endif
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    ValueType tmp = ValueType();
-    lambda(i,tmp);
-    join(result,tmp);
-  }
-  initialized_result = result;
-}
-
-template< typename ValueType, typename iType, class Lambda >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda)
-{
-  ValueType accum = 0 ;
-  ValueType val, local_total;
-
-  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
-    local_total = 0;
-    lambda(i,local_total,false);
-    val = accum;
-    lambda(i,val,true);
-    accum += local_total;
-  }
-
-}
-
-// placeholder for future function
-template< typename iType, class Lambda, typename ValueType >
-KOKKOS_INLINE_FUNCTION
-void parallel_scan
-  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
-   const Lambda & lambda)
-{
-}
-
-} /* namespace Kokkos */
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
 #endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
deleted file mode 100644
index b2aea14df44ea55b8c86a70c9907792b51525918..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic.hpp
+++ /dev/null
@@ -1,693 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_HPP
-#define KOKKOS_SYNCHRONIC_HPP
-
-#include <impl/Kokkos_Synchronic_Config.hpp>
-
-#include <atomic>
-#include <chrono>
-#include <thread>
-#include <functional>
-#include <algorithm>
-
-namespace Kokkos {
-namespace Impl {
-
-enum notify_hint {
-  notify_all,
-  notify_one,
-  notify_none
-};
-enum expect_hint {
-  expect_urgent,
-  expect_delay
-};
-
-namespace Details {
-
-template <class S, class T>
-bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
-  int i = 0;
-  for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
-    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
-      return true;
-    else
-      __synchronic_relax();
-  for(;i < attempts; ++i)
-    if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
-      return true;
-    else
-      __synchronic_yield();
-  return false;
-}
-
-struct __exponential_backoff {
-  __exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
-  }
-  static inline void sleep_for(std::chrono::microseconds const& time) {
-    auto t = time.count();
-    if(__builtin_expect(t > 75,0)) {
-      portable_sleep(time);
-    }
-    else if(__builtin_expect(t > 25,0))
-      __synchronic_yield();
-    else
-      __synchronic_relax();
-  }
-  void sleep_for_step() {
-    sleep_for(step());
-  }
-  std::chrono::microseconds step() {
-    float const f = ranfu();
-    int const t = int(microseconds * f);
-    if(__builtin_expect(f >= 0.95f,0))
-      microseconds = 8;
-    else
-      microseconds = (std::min)(microseconds>>1,maximum);
-    return std::chrono::microseconds(t);
-  }
-private :
-  int maximum, microseconds, x, y, z;
-  int xorshf96() {
-    int t;
-    x ^= x << 16; x ^= x >> 5; x ^= x << 1;
-    t = x; x = y; y = z; z = t ^ x ^ y;
-    return z;
-  }
-  float ranfu() {
-    return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
-  }
-};
-
-template <class T, class Enable = void>
-struct __synchronic_base {
-
-protected:
-  std::atomic<T> atom;
-
-  void notify(notify_hint = notify_all) noexcept {
-  }
-  void notify(notify_hint = notify_all) volatile noexcept {
-  }
-
-public :
-  __synchronic_base() noexcept = default;
-  constexpr __synchronic_base(T v) noexcept : atom(v) { }
-  __synchronic_base(const __synchronic_base&) = delete;
-  ~__synchronic_base() { }
-  __synchronic_base& operator=(const __synchronic_base&) = delete;
-  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
-
-  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    while(atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-    }
-  }
-  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    while(atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-    }
-  }
-
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
-    if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
-      return;
-    __exponential_backoff b;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
-      __do_backoff(b);
-      if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
-        return;
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-};
-
-#ifdef __SYNCHRONIC_COMPATIBLE
-template <class T>
-struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
-
-public:
-  std::atomic<T> atom;
-
-  void notify(notify_hint hint = notify_all) noexcept {
-    if(__builtin_expect(hint == notify_none,1))
-      return;
-    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
-    if(__builtin_expect(x,0)) {
-      if(__builtin_expect(hint == notify_all,1))
-        __synchronic_wake_all(&atom);
-      else
-        __synchronic_wake_one(&atom);
-    }
-  }
-  void notify(notify_hint hint = notify_all) volatile noexcept {
-    if(__builtin_expect(hint == notify_none,1))
-      return;
-    auto const x = count.fetch_add(0,std::memory_order_acq_rel);
-    if(__builtin_expect(x,0)) {
-      if(__builtin_expect(hint == notify_all,1))
-        __synchronic_wake_all_volatile(&atom);
-      else
-        __synchronic_wake_one_volatile(&atom);
-    }
-  }
-
-public :
-  __synchronic_base() noexcept : count(0) { }
-  constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
-  __synchronic_base(const __synchronic_base&) = delete;
-  ~__synchronic_base() { }
-  __synchronic_base& operator=(const __synchronic_base&) = delete;
-  __synchronic_base& operator=(const __synchronic_base&) volatile = delete;
-
-  void expect_update(T val, expect_hint = expect_urgent) const noexcept {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait(&atom,val);
-      count.fetch_add(-1,std::memory_order_acquire);
-    }
-  }
-  void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_volatile(&atom,val);
-      count.fetch_add(-1,std::memory_order_acquire);
-    }
-  }
-
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_timed(&atom,val,remains);
-      count.fetch_add(-1,std::memory_order_acquire);
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-  template <class Clock, class Duration>
-  void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
-    if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
-      return;
-    std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
-    while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
-      count.fetch_add(1,std::memory_order_release);
-      __synchronic_wait_timed_volatile(&atom,val,remains);
-      count.fetch_add(-1,std::memory_order_acquire);
-      remains = then - std::chrono::high_resolution_clock::now();
-    }
-  }
-private:
-  mutable std::atomic<int> count;
-};
-#endif
-
-template <class T, class Enable = void>
-struct __synchronic : public __synchronic_base<T> {
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-};
-
-template <class T>
-struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
-
-  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_and(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_and(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_or(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_or(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_xor(v,m);
-    this->notify(n);
-    return t;
-  }
-  T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_xor(v,m);
-    this->notify(n);
-    return t;
-  }
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-
-  T operator=(T v) volatile noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T operator=(T v) noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T operator++(int) volatile noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T operator++(int) noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T operator--(int) volatile noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T operator--(int) noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T operator++() volatile noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T operator++() noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T operator--() volatile noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T operator--() noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T operator+=(T v) volatile noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T operator+=(T v) noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T operator-=(T v) volatile noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T operator-=(T v) noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T operator&=(T v) volatile noexcept {
-    auto const t = this->atom &= v;
-    this->notify();
-    return t;
-  }
-  T operator&=(T v) noexcept {
-    auto const t = this->atom &= v;
-    this->notify();
-    return t;
-  }
-  T operator|=(T v) volatile noexcept {
-    auto const t = this->atom |= v;
-    this->notify();
-    return t;
-  }
-  T operator|=(T v) noexcept {
-    auto const t = this->atom |= v;
-    this->notify();
-    return t;
-  }
-  T operator^=(T v) volatile noexcept {
-    auto const t = this->atom ^= v;
-    this->notify();
-    return t;
-  }
-  T operator^=(T v) noexcept {
-    auto const t = this->atom ^= v;
-    this->notify();
-    return t;
-  }
-};
-
-template <class T>
-struct __synchronic<T*> : public __synchronic_base<T*> {
-
-  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_add(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-  T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.fetch_sub(v,m);
-    this->notify(n);
-    return t;
-  }
-
-  __synchronic() noexcept = default;
-  constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
-  __synchronic(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) = delete;
-  __synchronic& operator=(const __synchronic&) volatile = delete;
-
-  T* operator=(T* v) volatile noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T* operator=(T* v) noexcept {
-    auto const t = this->atom = v;
-    this->notify();
-    return t;
-  }
-  T* operator++(int) volatile noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator++(int) noexcept {
-    auto const t = ++this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator--(int) volatile noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator--(int) noexcept {
-    auto const t = --this->atom;
-    this->notify();
-    return t;
-  }
-  T* operator++() volatile noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T* operator++() noexcept {
-    auto const t = this->atom++;
-    this->notify();
-    return t;
-  }
-  T* operator--() volatile noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T* operator--() noexcept {
-    auto const t = this->atom--;
-    this->notify();
-    return t;
-  }
-  T* operator+=(ptrdiff_t v) volatile noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T* operator+=(ptrdiff_t v) noexcept {
-    auto const t = this->atom += v;
-    this->notify();
-    return t;
-  }
-  T* operator-=(ptrdiff_t v) volatile noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-  T* operator-=(ptrdiff_t v) noexcept {
-    auto const t = this->atom -= v;
-    this->notify();
-    return t;
-  }
-};
-
-} //namespace Details
-
-template <class T>
-struct synchronic : public Details::__synchronic<T> {
-
-  bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
-  bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
-  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    this->atom.store(v,m);
-    this->notify(n);
-  }
-  void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    this->atom.store(v,m);
-    this->notify(n);
-  }
-  T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
-  T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
-
-  operator T() const volatile noexcept { return (T)this->atom; }
-  operator T() const noexcept { return (T)this->atom; }
-
-  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.exchange(v,m);
-    this->notify(n);
-    return t;
-  }
-  T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.exchange(v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_weak(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m);
-    this->notify(n);
-    return t;
-  }
-  bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
-    auto const t = this->atom.compare_exchange_strong(r,v,m);
-    this->notify(n);
-    return t;
-  }
-
-  synchronic() noexcept = default;
-  constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
-  synchronic(const synchronic&) = delete;
-  ~synchronic() { }
-  synchronic& operator=(const synchronic&) = delete;
-  synchronic& operator=(const synchronic&) volatile = delete;
-  T operator=(T val) noexcept {
-    return Details::__synchronic<T>::operator=(val);
-  }
-  T operator=(T val) volatile noexcept {
-    return Details::__synchronic<T>::operator=(val);
-  }
-
-  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
-    Details::__synchronic<T>::expect_update(val,h);
-    return load(order);
-  }
-  T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
-    Details::__synchronic<T>::expect_update(val,h);
-    return load(order);
-  }
-  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
-    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
-      Details::__synchronic<T>::expect_update(nval,h);
-    return load(order);
-  }
-  T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
-    for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
-      expect_update(nval,h);
-    return load(order);
-  }
-  template <class Rep, class Period>
-  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
-    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
-  }
-  template < class Rep, class Period>
-  void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
-    Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
-  }
-};
-
-#include <inttypes.h>
-
-typedef synchronic<char> synchronic_char;
-typedef synchronic<char> synchronic_schar;
-typedef synchronic<unsigned char> synchronic_uchar;
-typedef synchronic<short> synchronic_short;
-typedef synchronic<unsigned short> synchronic_ushort;
-typedef synchronic<int> synchronic_int;
-typedef synchronic<unsigned int> synchronic_uint;
-typedef synchronic<long> synchronic_long;
-typedef synchronic<unsigned long> synchronic_ulong;
-typedef synchronic<long long> synchronic_llong;
-typedef synchronic<unsigned long long> synchronic_ullong;
-//typedef synchronic<char16_t> synchronic_char16_t;
-//typedef synchronic<char32_t> synchronic_char32_t;
-typedef synchronic<wchar_t> synchronic_wchar_t;
-
-typedef synchronic<int_least8_t> synchronic_int_least8_t;
-typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
-typedef synchronic<int_least16_t> synchronic_int_least16_t;
-typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
-typedef synchronic<int_least32_t> synchronic_int_least32_t;
-typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
-//typedef synchronic<int_least_64_t> synchronic_int_least_64_t;
-typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
-typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
-typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
-typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
-typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
-typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
-typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
-typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
-typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
-typedef synchronic<intptr_t> synchronic_intptr_t;
-typedef synchronic<uintptr_t> synchronic_uintptr_t;
-typedef synchronic<size_t> synchronic_size_t;
-typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
-typedef synchronic<intmax_t> synchronic_intmax_t;
-typedef synchronic<uintmax_t> synchronic_uintmax_t;
-
-}
-}
-
-#endif //__SYNCHRONIC_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
deleted file mode 100644
index 0a6dd6e715edad752f56756ccdc6fba3d43e30fb..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic_Config.hpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
-#define KOKKOS_SYNCHRONIC_CONFIG_H
-
-#include <thread>
-#include <chrono>
-
-namespace Kokkos {
-namespace Impl {
-
-//the default yield function used inside the implementation is the Standard one
-#define __synchronic_yield std::this_thread::yield
-#define __synchronic_relax __synchronic_yield
-
-#if defined(_MSC_VER)
-    //this is a handy GCC optimization that I use inside the implementation
-    #define __builtin_expect(condition,common) condition
-    #if _MSC_VER <= 1800
-        //using certain keywords that VC++ temporarily doesn't support
-        #define _ALLOW_KEYWORD_MACROS
-        #define noexcept
-        #define constexpr
-    #endif
-    //yes, I define multiple assignment operators
-    #pragma warning(disable:4522)
-    //I don't understand how Windows is so bad at timing functions, but is OK
-    //with straight-up yield loops
-    #define __do_backoff(b) __synchronic_yield()
-#else
-#define __do_backoff(b) b.sleep_for_step()
-#endif
-
-//certain platforms have efficient support for spin-waiting built into the operating system
-#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
-#if defined(_WIN32_WINNT)
-#include <winsock2.h>
-#include <Windows.h>
-    //the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
-    #define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
-    #define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
-    #define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
-    #define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
-    #define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
-    #define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
-    #define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
-    #define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
-    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
-
-    inline void native_sleep(unsigned long microseconds)
-    {
-      // What to do if microseconds is < 1000?
-      Sleep(microseconds / 1000);
-    }
-
-    inline void native_yield()
-    {
-      SwitchToThread();
-    }
-#elif defined(__linux__)
-    #include <chrono>
-    #include <time.h>
-    #include <unistd.h>
-    #include <pthread.h>
-    #include <linux/futex.h>
-    #include <sys/syscall.h>
-    #include <climits>
-    #include <cassert>
-    template < class Rep, class Period>
-    inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
-      struct timespec ts;
-      ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
-      assert(!ts.tv_sec);
-      ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
-      return ts;
-    }
-    inline long futex(void const* addr1, int op, int val1) {
-        return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
-    }
-    inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
-        return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
-    }
-    inline void native_sleep(unsigned long microseconds)
-    {
-      usleep(microseconds);
-    }
-    inline void native_yield()
-    {
-      pthread_yield();
-    }
-
-    //the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
-    #define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
-    #define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
-    #define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
-    #define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
-    #define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
-    #define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
-    #define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
-    #define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
-    #define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
-
-    //the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
-    #undef __synchronic_yield
-    #define __synchronic_yield sched_yield
-
-    //for extremely short wait times, just let another hyper-thread run
-    #undef __synchronic_relax
-    #define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
-
-#endif
-#endif
-
-#ifdef _GLIBCXX_USE_NANOSLEEP
-inline void portable_sleep(std::chrono::microseconds const& time)
-{ std::this_thread::sleep_for(time); }
-#else
-inline void portable_sleep(std::chrono::microseconds const& time)
-{ native_sleep(time.count()); }
-#endif
-
-#ifdef _GLIBCXX_USE_SCHED_YIELD
-inline void portable_yield()
-{ std::this_thread::yield(); }
-#else
-inline void portable_yield()
-{ native_yield(); }
-#endif
-
-//this is the number of times we initially spin, on the first wait attempt
-#define __SYNCHRONIC_SPIN_COUNT_A 16
-
-//this is how decide to yield instead of just spinning, 'c' is the current trip count
-//#define __SYNCHRONIC_SPIN_YIELD(c) true
-#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
-
-//this is the number of times we normally spin, on every subsequent wait attempt
-#define __SYNCHRONIC_SPIN_COUNT_B 8
-
-}
-}
-
-#endif //__SYNCHRONIC_CONFIG_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp b/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
deleted file mode 100644
index facc8d6d8e67a4828aa94bd75fb7590f454b41f6..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/src/impl/Kokkos_Synchronic_n3998.hpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
-#define KOKKOS_SYNCHRONIC_N3998_HPP
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <functional>
-
-/*
-In the section below, a synchronization point represents a point at which a
-thread may block until a given synchronization condition has been reached or
-at which it may notify other threads that a synchronization condition has
-been achieved.
-*/
-namespace Kokkos { namespace Impl {
-
-    /*
-    A latch maintains an internal counter that is initialized when the latch
-    is created. The synchronization condition is reached when the counter is
-    decremented to 0. Threads may block at a synchronization point waiting
-    for the condition to be reached. When the condition is reached, any such
-    blocked threads will be released.
-    */
-    struct latch {
-        latch(int val) : count(val), released(false) { }
-        latch(const latch&) = delete;
-        latch& operator=(const latch&) = delete;
-        ~latch( ) { }
-        void arrive( ) {
-            __arrive( );
-        }
-        void arrive_and_wait( ) {
-            if(!__arrive( ))
-                wait( );
-        }
-        void wait( ) {
-            while(!released.load_when_not_equal(false,std::memory_order_acquire))
-                ;
-        }
-        bool try_wait( ) {
-            return released.load(std::memory_order_acquire);
-        }
-    private:
-        bool __arrive( ) {
-            if(count.fetch_add(-1,std::memory_order_release)!=1)
-                return false;
-            released.store(true,std::memory_order_release);
-            return true;
-        }
-        std::atomic<int> count;
-        synchronic<bool> released;
-    };
-
-    /*
-    A barrier is created with an initial value representing the number of threads
-    that can arrive at the synchronization point. When that many threads have
-    arrived, the  synchronization condition is reached and the threads are
-    released. The barrier will then reset, and may be reused for a new cycle, in
-    which the same set of threads may arrive again at the synchronization point.
-    The same set of threads shall arrive at the barrier in each cycle, otherwise
-    the behaviour is undefined.
-    */
-    struct barrier {
-        barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
-        barrier(const barrier&) = delete;
-        barrier& operator=(const barrier&) = delete;
-        ~barrier() { }
-        void arrive_and_wait() {
-            int const myepoch = epoch.load(std::memory_order_relaxed);
-            if(!__arrive(myepoch))
-                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
-                    ;
-        }
-        void arrive_and_drop() {
-            nexpected.fetch_add(-1,std::memory_order_relaxed);
-            __arrive(epoch.load(std::memory_order_relaxed));
-        }
-    private:
-        bool __arrive(int const myepoch) {
-            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
-            if(__builtin_expect(myresult == expected,0)) {
-                expected = nexpected.load(std::memory_order_relaxed);
-                arrived.store(0,std::memory_order_relaxed);
-                epoch.store(myepoch+1,std::memory_order_release);
-                return true;
-            }
-            return false;
-        }
-        int expected;
-        std::atomic<int> arrived, nexpected;
-        synchronic<int> epoch;
-    };
-
-    /*
-    A notifying barrier behaves as a barrier, but is constructed with a callable
-    completion function that is invoked after all threads have arrived at the
-    synchronization point, and before the synchronization condition is reached.
-    The completion may modify the set of threads that arrives at the barrier in
-    each cycle.
-    */
-    struct notifying_barrier {
-        template <typename T>
-        notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
-        notifying_barrier(const notifying_barrier&) = delete;
-        notifying_barrier& operator=(const notifying_barrier&) = delete;
-        ~notifying_barrier( ) { }
-        void arrive_and_wait() {
-            int const myepoch = epoch.load(std::memory_order_relaxed);
-            if(!__arrive(myepoch))
-                while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
-                    ;
-        }
-        void arrive_and_drop() {
-            nexpected.fetch_add(-1,std::memory_order_relaxed);
-            __arrive(epoch.load(std::memory_order_relaxed));
-        }
-    private:
-        bool __arrive(int const myepoch) {
-            int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
-            if(__builtin_expect(myresult == expected,0)) {
-                int const newexpected = completion();
-                expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
-                arrived.store(0,std::memory_order_relaxed);
-                epoch.store(myepoch+1,std::memory_order_release);
-                return true;
-            }
-            return false;
-        }
-        int expected;
-        std::atomic<int> arrived, nexpected;
-        synchronic<int> epoch;
-        std::function<int()> completion;
-    };
-}}
-
-#endif //__N3998_H
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
index afa01d0cde1f1253f216c415b81bf5c8fee1de2b..b514df351725ac55e88ea1c2e92eec4b1711e6b4 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue.hpp
@@ -76,9 +76,6 @@ namespace Impl {
 template< typename Space , typename ResultType , typename FunctorType >
 class TaskBase ;
 
-template< typename Space >
-class TaskExec ;
-
 } /* namespace Impl */
 } /* namespace Kokkos */
 
@@ -149,8 +146,8 @@ private:
   //     task->m_next is the dependence or zero
   //   Postcondition:
   //     task->m_next is linked list membership
-  KOKKOS_FUNCTION
-  void schedule( task_root_type * const );
+  KOKKOS_FUNCTION void schedule_runnable(  task_root_type * const );
+  KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
 
   // Reschedule a task
   //   Precondition:
@@ -178,7 +175,7 @@ private:
                        , task_root_type * const );
 
   KOKKOS_FUNCTION
-  static task_root_type * pop_task( task_root_type * volatile * const );
+  static task_root_type * pop_ready_task( task_root_type * volatile * const );
 
   KOKKOS_FUNCTION static
   void decrement( task_root_type * task );
@@ -368,6 +365,7 @@ public:
   int16_t        m_task_type ;   ///< Type of task
   int16_t        m_priority ;    ///< Priority of runnable task
 
+  TaskBase() = delete ;
   TaskBase( TaskBase && ) = delete ;
   TaskBase( const TaskBase & ) = delete ;
   TaskBase & operator = ( TaskBase && ) = delete ;
@@ -375,17 +373,43 @@ public:
 
   KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
 
+  // Constructor for a runnable task
   KOKKOS_INLINE_FUNCTION
-  constexpr TaskBase() noexcept
-    : m_apply(0)
-    , m_queue(0)
-    , m_wait(0)
-    , m_next(0)
-    , m_ref_count(0)
-    , m_alloc_size(0)
-    , m_dep_count(0)
-    , m_task_type( TaskSingle )
-    , m_priority( 1 /* TaskRegularPriority */ )
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , TaskBase    * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    ) noexcept
+    : m_apply(      arg_apply )
+    , m_queue(      arg_queue )
+    , m_wait( 0 )
+    , m_next(       arg_dependence )
+    , m_ref_count(  arg_ref_count )
+    , m_alloc_size( arg_alloc_size )
+    , m_dep_count( 0 )
+    , m_task_type(  arg_task_type )
+    , m_priority(   arg_priority )
+    {}
+
+  // Constructor for an aggregate task
+  KOKKOS_INLINE_FUNCTION
+  constexpr TaskBase( queue_type  * arg_queue
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_dep_count
+                    ) noexcept
+    : m_apply( 0 )
+    , m_queue( arg_queue )
+    , m_wait( 0 )
+    , m_next( 0 )
+    , m_ref_count(  arg_ref_count )
+    , m_alloc_size( arg_alloc_size )
+    , m_dep_count(  arg_dep_count )
+    , m_task_type(  Aggregate )
+    , m_priority( 0 )
     {}
 
   //----------------------------------------
@@ -406,9 +430,13 @@ public:
   KOKKOS_INLINE_FUNCTION
   void add_dependence( TaskBase* dep )
     {
+      // Precondition: lock == m_next
+
+      TaskBase * const lock = (TaskBase *) LockTag ;
+
       // Assign dependence to m_next.  It will be processed in the subsequent
       // call to schedule.  Error if the dependence is reset.
-      if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
+      if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
         Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
       }
 
@@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
 {
 private:
 
-  static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
+  using root_type     = TaskBase<ExecSpace,void,void> ;
+  using function_type = typename root_type::function_type ;
+  using queue_type    = typename root_type::queue_type ;
 
+  static_assert( sizeof(root_type) == 48 , "" );
+
+  TaskBase() = delete ;
   TaskBase( TaskBase && ) = delete ;
   TaskBase( const TaskBase & ) = delete ;
   TaskBase & operator = ( TaskBase && ) = delete ;
@@ -444,9 +477,24 @@ public:
 
   KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
 
+  // Constructor for runnable task
   KOKKOS_INLINE_FUNCTION
-  TaskBase()
-    : TaskBase< ExecSpace , void , void >()
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , root_type   * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    )
+    : root_type( arg_apply 
+               , arg_queue
+               , arg_dependence
+               , arg_ref_count
+               , arg_alloc_size
+               , arg_task_type
+               , arg_priority
+               )
     , m_result()
     {}
 
@@ -471,11 +519,14 @@ private:
 
 public:
 
-  using root_type    = TaskBase< ExecSpace , void , void > ;
-  using base_type    = TaskBase< ExecSpace , ResultType , void > ;
-  using member_type  = TaskExec< ExecSpace > ;
-  using functor_type = FunctorType ;
-  using result_type  = ResultType ;
+  using root_type       = TaskBase< ExecSpace , void , void > ;
+  using base_type       = TaskBase< ExecSpace , ResultType , void > ;
+  using specialization  = TaskQueueSpecialization< ExecSpace > ;
+  using function_type   = typename root_type::function_type ;
+  using queue_type      = typename root_type::queue_type ;
+  using member_type     = typename specialization::member_type ;
+  using functor_type    = FunctorType ;
+  using result_type     = ResultType ;
 
   template< typename Type >
   KOKKOS_INLINE_FUNCTION static
@@ -522,13 +573,30 @@ public:
       if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
         // Did not respawn, destroy the functor to free memory.
         static_cast<functor_type*>(task)->~functor_type();
-        // Cannot destroy the task until its dependences have been processed.
+        // Cannot destroy and deallocate the task until its dependences
+        // have been processed.
       }
     }
 
+  // Constructor for runnable task
   KOKKOS_INLINE_FUNCTION
-  TaskBase( functor_type const & arg_functor )
-    : base_type()
+  constexpr TaskBase( function_type arg_apply
+                    , queue_type  * arg_queue
+                    , root_type   * arg_dependence
+                    , int           arg_ref_count
+                    , int           arg_alloc_size
+                    , int           arg_task_type
+                    , int           arg_priority
+                    , FunctorType && arg_functor
+                    )
+    : base_type( arg_apply 
+               , arg_queue
+               , arg_dependence
+               , arg_ref_count
+               , arg_alloc_size
+               , arg_task_type
+               , arg_priority
+               )
     , functor_type( arg_functor )
     {}
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
index fefbbad8bde297ce94ad99058e6f25eca6046b7e..23f5d3cd30dbbf87c024af935356961c1642a022 100644
--- a/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_TaskQueue_impl.hpp
@@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
   )
 {
   // Push task into a concurrently pushed and popped queue.
+  // The queue can be either a ready task queue or a waiting task queue.
   // The queue is a linked list where 'task->m_next' form the links.
   // Fail the push attempt if the queue is locked;
   // otherwise retry until the push succeeds.
@@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
 template< typename ExecSpace >
 KOKKOS_FUNCTION
 typename TaskQueue< ExecSpace >::task_root_type *
-TaskQueue< ExecSpace >::pop_task
+TaskQueue< ExecSpace >::pop_ready_task
   ( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
 {
-  // Pop task from a concurrently pushed and popped queue.
+  // Pop task from a concurrently pushed and popped ready task queue.
   // The queue is a linked list where 'task->m_next' form the links.
 
-  task_root_type * const zero = (task_root_type *) 0 ;
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
@@ -252,85 +252,83 @@ TaskQueue< ExecSpace >::pop_task
     // (1) lock, (2) end, or (3) a valid task.
     // Thus zero will never appear in the queue.
     //
-    // If queue is locked then just read by guaranteeing
-    // the CAS will fail.
+    // If queue is locked then just read by guaranteeing the CAS will fail.
 
     if ( lock == task ) task = 0 ;
 
     task_root_type * const x = task ;
 
-    task = Kokkos::atomic_compare_exchange(queue,task,lock);
-
-    if ( x == task ) break ; // CAS succeeded and queue is locked
-  }
+    task = Kokkos::atomic_compare_exchange(queue,x,lock);
 
-  if ( end != task ) {
+    if ( x == task ) {
+      // CAS succeeded and queue is locked
+      //
+      // This thread has locked the queue and removed 'task' from the queue.
+      // Extract the next entry of the queue from 'task->m_next'
+      // and mark 'task' as popped from a queue by setting
+      // 'task->m_next = lock'.
+      //
+      // Place the next entry in the head of the queue,
+      // which also unlocks the queue.
+      //
+      // This thread has exclusive access to
+      // the queue and the popped task's m_next.
 
-    // This thread has locked the queue and removed 'task' from the queue.
-    // Extract the next entry of the queue from 'task->m_next'
-    // and mark 'task' as popped from a queue by setting
-    // 'task->m_next = lock'.
+      *queue = task->m_next ; task->m_next = lock ;
 
-    task_root_type * const next =
-      Kokkos::atomic_exchange( & task->m_next , lock );
+      Kokkos::memory_fence();
 
-    // Place the next entry in the head of the queue,
-    // which also unlocks the queue.
-
-    task_root_type * const unlock =
-      Kokkos::atomic_exchange( queue , next );
+#if 0
+      printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
+            , uintptr_t(queue)
+            , uintptr_t(task)
+            , uintptr_t(task->m_wait)
+            , uintptr_t(task->m_next)
+            , int(task->m_task_type)
+            , int(task->m_priority)
+            , int(task->m_ref_count) );
+#endif
 
-    if ( next == zero || next == lock || lock != unlock ) {
-      Kokkos::abort("TaskQueue::pop_task ERROR");
+      return task ;
     }
   }
 
-#if 0
-  if ( end != task ) {
-    printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
-          , uintptr_t(queue)
-          , uintptr_t(task)
-          , uintptr_t(task->m_wait)
-          , uintptr_t(task->m_next)
-          , int(task->m_task_type)
-          , int(task->m_priority)
-          , int(task->m_ref_count) );
-  }
-#endif
-
-  return task ;
+  return end ;
 }
 
 //----------------------------------------------------------------------------
 
 template< typename ExecSpace >
 KOKKOS_FUNCTION
-void TaskQueue< ExecSpace >::schedule
+void TaskQueue< ExecSpace >::schedule_runnable
   ( TaskQueue< ExecSpace >::task_root_type * const task )
 {
-  // Schedule a runnable or when_all task upon construction / spawn
+  // Schedule a runnable task upon construction / spawn
   // and upon completion of other tasks that 'task' is waiting on.
-
-  // Precondition on runnable task state:
-  //   task is either constructing or executing
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  // - if runnable then task is either constructing or respawning
   //
   //   Constructing state:
   //     task->m_wait == 0
-  //     task->m_next == dependence
-  //   Executing-respawn state:
-  //     task->m_wait == head of linked list
-  //     task->m_next == dependence
+  //     task->m_next == dependence or 0
+  //   Respawn state:
+  //     task->m_wait == head of linked list: 'end' or valid task
+  //     task->m_next == dependence or 0
   //
   //  Task state transition:
-  //     Constructing      ->  Waiting
-  //     Executing-respawn ->  Waiting
+  //     Constructing ->  Waiting
+  //     Respawn      ->  Waiting
   //
   //  Postcondition on task state:
-  //     task->m_wait == head of linked list
-  //     task->m_next == member of linked list
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
 
 #if 0
-  printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+  printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
         , uintptr_t(task)
         , uintptr_t(task->m_wait)
         , uintptr_t(task->m_next)
@@ -343,135 +341,204 @@ void TaskQueue< ExecSpace >::schedule
   task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
   task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-  //----------------------------------------
-  {
-    // If Constructing then task->m_wait == 0
-    // Change to waiting by task->m_wait = EndTag
-
-    task_root_type * const init =
-      Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
+  bool respawn = false ;
 
-    // Precondition
+  //----------------------------------------
 
-    if ( lock == init ) {
-      Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
-    }
+  if ( zero == task->m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
 
-    // if ( init == 0 ) Constructing       ->  Waiting
-    // else             Executing-Respawn  ->  Waiting
+    task->m_wait = end ;
+    // Task in Waiting state
   }
+  else if ( lock != task->m_wait ) {
+    // Task in Executing state with Respawn request
+    // - Update dependence
+    // - Transition to Waiting state
+    respawn = true ;
+  }
+  else {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
+  }
+
   //----------------------------------------
+  // Scheduling a runnable task which may have a dependency 'dep'.
+  // Extract dependence, if any, from task->m_next.
+  // If 'dep' is not null then attempt to push 'task'
+  // into the wait queue of 'dep'.
+  // If the push succeeds then 'task' may be
+  // processed or executed by another thread at any time.
+  // If the push fails then 'dep' is complete and 'task'
+  // is ready to execute.
+
+  // Exclusive access so don't need an atomic exchange
+  // task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+  task_root_type * dep = task->m_next ; task->m_next = zero ;
+
+  const bool is_ready = 
+    ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+
+  if ( ( 0 != dep ) && respawn ) {
+    // Reference count for dep was incremented when
+    // respawn assigned dependency to task->m_next
+    // so that if dep completed prior to the
+    // above push_task dep would not be destroyed.
+    // dep reference count can now be decremented,
+    // which may deallocate the task.
+    TaskQueue::assign( & dep , (task_root_type *)0 );
+  }
 
-  if ( task_root_type::Aggregate != task->m_task_type ) {
+  if ( is_ready ) {
 
-    // Scheduling a runnable task which may have a depencency 'dep'.
-    // Extract dependence, if any, from task->m_next.
-    // If 'dep' is not null then attempt to push 'task'
-    // into the wait queue of 'dep'.
-    // If the push succeeds then 'task' may be
-    // processed or executed by another thread at any time.
-    // If the push fails then 'dep' is complete and 'task'
-    // is ready to execute.
+    // No dependence or 'dep' is complete so push task into ready queue.
+    // Increment the ready count before pushing into ready queue
+    // to track number of ready + executing tasks.
+    // The ready count will be decremented when the task is complete.
 
-    task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
+    Kokkos::atomic_increment( & m_ready_count );
 
-    const bool is_ready =
-      ( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
+    task_root_type * volatile * const ready_queue =
+      & m_ready[ task->m_priority ][ task->m_task_type ];
 
-    // Reference count for dep was incremented when assigned
-    // to task->m_next so that if it completed prior to the
-    // above push_task dep would not be destroyed.
-    // dep reference count can now be decremented,
-    // which may deallocate the task.
-    TaskQueue::assign( & dep , (task_root_type *)0 );
+    // A push_task fails if the ready queue is locked.
+    // A ready queue is only locked during a push or pop;
+    // i.e., it is never permanently locked.
+    // Retry push to ready queue until it succeeds.
+    // When the push succeeds then 'task' may be
+    // processed or executed by another thread at any time.
 
-    if ( is_ready ) {
+    while ( ! push_task( ready_queue , task ) );
+  }
 
-      // No dependence or 'dep' is complete so push task into ready queue.
-      // Increment the ready count before pushing into ready queue
-      // to track number of ready + executing tasks.
-      // The ready count will be decremented when the task is complete.
+  //----------------------------------------
+  // Postcondition:
+  // - A runnable 'task' was pushed into a wait or ready queue.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
+}
 
-      Kokkos::atomic_increment( & m_ready_count );
+template< typename ExecSpace >
+KOKKOS_FUNCTION
+void TaskQueue< ExecSpace >::schedule_aggregate
+  ( TaskQueue< ExecSpace >::task_root_type * const task )
+{
+  // Schedule an aggregate task upon construction
+  // and upon completion of other tasks that 'task' is waiting on.
+  //
+  // Precondition:
+  // - called by a single thread for the input task
+  // - calling thread has exclusive access to the task
+  // - task is not a member of a queue
+  //
+  //   Constructing state:
+  //     task->m_wait == 0
+  //     task->m_next == dependence or 0
+  //
+  //  Task state transition:
+  //     Constructing ->  Waiting
+  //
+  //  Postcondition on task state:
+  //     task->m_wait == head of linked list (queue)
+  //     task->m_next == member of linked list (queue)
+
+#if 0
+  printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
+        , uintptr_t(task)
+        , uintptr_t(task->m_wait)
+        , uintptr_t(task->m_next)
+        , task->m_task_type
+        , task->m_priority
+        , task->m_ref_count );
+#endif
 
-      task_root_type * volatile * const queue =
-        & m_ready[ task->m_priority ][ task->m_task_type ];
+  task_root_type * const zero = (task_root_type *) 0 ;
+  task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
+  task_root_type * const end  = (task_root_type *) task_root_type::EndTag ;
 
-      // A push_task fails if the ready queue is locked.
-      // A ready queue is only locked during a push or pop;
-      // i.e., it is never permanently locked.
-      // Retry push to ready queue until it succeeds.
-      // When the push succeeds then 'task' may be
-      // processed or executed by another thread at any time.
+  //----------------------------------------
 
-      while ( ! push_task( queue , task ) );
-    }
+  if ( zero == task->m_wait ) {
+    // Task in Constructing state
+    // - Transition to Waiting state
+    // Preconditions:
+    // - call occurs exclusively within a single thread
+
+    task->m_wait = end ;
+    // Task in Waiting state
+  }
+  else if ( lock == task->m_wait ) {
+    // Task in Complete state
+    Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
   }
+
   //----------------------------------------
-  else {
-    // Scheduling a 'when_all' task with multiple dependences.
-    // This scheduling may be called when the 'when_all' is
-    // (1) created or
-    // (2) being removed from a completed task's wait list.
+  // Scheduling a 'when_all' task with multiple dependences.
+  // This scheduling may be called when the 'when_all' is
+  // (1) created or
+  // (2) being removed from a completed task's wait list.
 
-    task_root_type ** const aggr = task->aggregate_dependences();
+  task_root_type ** const aggr = task->aggregate_dependences();
 
-    // Assume the 'when_all' is complete until a dependence is
-    // found that is not complete.
+  // Assume the 'when_all' is complete until a dependence is
+  // found that is not complete.
 
-    bool is_complete = true ;
+  bool is_complete = true ;
 
-    for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
+  for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
 
-      --i ;
+    --i ;
 
-      // Loop dependences looking for an incomplete task.
-      // Add this task to the incomplete task's wait queue.
+    // Loop dependences looking for an incomplete task.
+    // Add this task to the incomplete task's wait queue.
 
-      // Remove a task 'x' from the dependence list.
-      // The reference count of 'x' was incremented when
-      // it was assigned into the dependence list.
+    // Remove a task 'x' from the dependence list.
+    // The reference count of 'x' was incremented when
+    // it was assigned into the dependence list.
 
-      task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+    // Exclusive access so don't need an atomic exchange
+    // task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
+    task_root_type * x = aggr[i] ; aggr[i] = zero ;
 
-      if ( x ) {
+    if ( x ) {
 
-        // If x->m_wait is not locked then push succeeds
-        // and the aggregate is not complete.
-        // If the push succeeds then this when_all 'task' may be
-        // processed by another thread at any time.
-        // For example, 'x' may be completeed by another
-        // thread and then re-schedule this when_all 'task'.
+      // If x->m_wait is not locked then push succeeds
+      // and the aggregate is not complete.
+      // If the push succeeds then this when_all 'task' may be
+      // processed by another thread at any time.
+      // For example, 'x' may be completed by another
+      // thread and then re-schedule this when_all 'task'.
 
-        is_complete = ! push_task( & x->m_wait , task );
+      is_complete = ! push_task( & x->m_wait , task );
 
-        // Decrement reference count which had been incremented
-        // when 'x' was added to the dependence list.
+      // Decrement reference count which had been incremented
+      // when 'x' was added to the dependence list.
 
-        TaskQueue::assign( & x , zero );
-      }
+      TaskQueue::assign( & x , zero );
     }
+  }
 
-    if ( is_complete ) {
-      // The when_all 'task' was not added to a wait queue because
-      // all dependences were complete so this aggregate is complete.
-      // Complete the when_all 'task' to schedule other tasks
-      // that are waiting for the when_all 'task' to complete.
+  if ( is_complete ) {
+    // The when_all 'task' was not added to a wait queue because
+    // all dependences were complete so this aggregate is complete.
+    // Complete the when_all 'task' to schedule other tasks
+    // that are waiting for the when_all 'task' to complete.
 
-      task->m_next = lock ;
+    task->m_next = lock ;
 
-      complete( task );
+    complete( task );
 
-      // '*task' may have been deleted upon completion
-    }
+    // '*task' may have been deleted upon completion
   }
+
   //----------------------------------------
   // Postcondition:
-  //   A runnable 'task' was pushed into a wait or ready queue.
-  //   An aggregate 'task' was either pushed to a wait queue
-  //   or completed.
-  // Concurrent execution may have already popped 'task'
-  // from a queue and processed it as appropriate.
+  // - An aggregate 'task' was either pushed to a wait queue or completed.
+  // - Concurrent execution may have already popped 'task'
+  //   from a queue and processed it as appropriate.
 }
 
 //----------------------------------------------------------------------------
@@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
     // Is a runnable task has finished executing and requested respawn.
     // Schedule the task for subsequent execution.
 
-    schedule( task );
+    schedule_runnable( task );
   }
   //----------------------------------------
   else {
@@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
       TaskQueue::assign( & task , zero );
 
       // This thread has exclusive access to the wait list so
-      // the concurrency-safe pop_task function is not needed.
+      // the concurrency-safe pop_ready_task function is not needed.
       // Schedule the tasks that have been waiting on the input 'task',
       // which may have been deleted.
 
       while ( x != end ) {
+        // Have exclusive access to 'x' until it is scheduled
+        // Set x->m_next = zero  <=  no dependence, not a respawn
 
-        // Set x->m_next = zero  <=  no dependence
-
-        task_root_type * const next =
-          (task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
+        task_root_type * const next = x->m_next ; x->m_next = 0 ;
 
-        schedule( x );
+        if ( task_root_type::Aggregate != x->m_task_type ) {
+          schedule_runnable( x );
+        }
+        else {
+          schedule_aggregate( x );
+        }
 
         x = next ;
       }
diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
index ff503cb27329c006aeb0b476c2dd54e09d43baa4..d72cde03fd2bb1ae40559c80d007f7a8836636c0 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp
@@ -45,6 +45,7 @@
 #define KOKKOS_CORE_IMPL_UTILITIES_HPP
 
 #include <Kokkos_Macros.hpp>
+#include <stdint.h>
 #include <type_traits>
 
 //----------------------------------------------------------------------------
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
index ad1b6dce39d03182d1187105d79a9cb8e239ac8e..93ff6c48a77d00e45e3028413d5c02f4020d65bc 100644
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,52 +36,144 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <Kokkos_Macros.hpp>
+
 #include <impl/Kokkos_spinwait.hpp>
 
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_BitOps.hpp>
+
 /*--------------------------------------------------------------------------*/
 
-#if ( KOKKOS_ENABLE_ASM )
-  #if defined( __arm__ ) || defined( __aarch64__ )
-    /* No-operation instruction to idle the thread. */
-    #define YIELD   asm volatile("nop")
+#if !defined( _WIN32 )
+  #if defined( KOKKOS_ENABLE_ASM )
+    #if defined( __arm__ ) || defined( __aarch64__ )
+      /* No pause instruction is issued on ARM: this macro expands to nothing. */
+      #define KOKKOS_INTERNAL_PAUSE
+    #else
+      /* Pause instruction to prevent excess processor bus usage */
+      #define KOKKOS_INTERNAL_PAUSE   asm volatile("pause\n":::"memory")
+    #endif
+    #define KOKKOS_INTERNAL_NOP2    asm volatile("nop\n" "nop\n")
+    #define KOKKOS_INTERNAL_NOP4    KOKKOS_INTERNAL_NOP2;  KOKKOS_INTERNAL_NOP2
+    #define KOKKOS_INTERNAL_NOP8    KOKKOS_INTERNAL_NOP4;  KOKKOS_INTERNAL_NOP4;
+    #define KOKKOS_INTERNAL_NOP16   KOKKOS_INTERNAL_NOP8;  KOKKOS_INTERNAL_NOP8;
+    #define KOKKOS_INTERNAL_NOP32   KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
+    namespace {
+    inline void kokkos_internal_yield( const unsigned i ) noexcept {
+      switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
+      case 0u:  KOKKOS_INTERNAL_NOP2;  break;
+      case 1u:  KOKKOS_INTERNAL_NOP4;  break;
+      case 2u:  KOKKOS_INTERNAL_NOP8;  break;
+      case 3u:  KOKKOS_INTERNAL_NOP16; break;
+      default: KOKKOS_INTERNAL_NOP32;
+      }
+      KOKKOS_INTERNAL_PAUSE;
+    }
+    }
   #else
-    /* Pause instruction to prevent excess processor bus usage */
-    #define YIELD   asm volatile("pause\n":::"memory")
+    #include <sched.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      sched_yield();
+    }
+    }
+  #endif
+#else // defined( _WIN32 )
+  #if defined ( KOKKOS_ENABLE_WINTHREAD )
+    #include <process.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      Sleep(0);
+    }
+    }
+  #elif defined( _MSC_VER )
+    #define NOMINMAX
+    #include <winsock2.h>
+    #include <windows.h>
+    namespace {
+    inline void kokkos_internal_yield( const unsigned ) noexcept {
+      YieldProcessor();
+    }
+    }
+  #else
+    #define KOKKOS_INTERNAL_PAUSE   __asm__ __volatile__("pause\n":::"memory")
+    #define KOKKOS_INTERNAL_NOP2    __asm__ __volatile__("nop\n" "nop")
+    #define KOKKOS_INTERNAL_NOP4    KOKKOS_INTERNAL_NOP2;  KOKKOS_INTERNAL_NOP2
+    #define KOKKOS_INTERNAL_NOP8    KOKKOS_INTERNAL_NOP4;  KOKKOS_INTERNAL_NOP4;
+    #define KOKKOS_INTERNAL_NOP16   KOKKOS_INTERNAL_NOP8;  KOKKOS_INTERNAL_NOP8;
+    #define KOKKOS_INTERNAL_NOP32   KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
+    namespace {
+    inline void kokkos_internal_yield( const unsigned i ) noexcept {
+      switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
+      case 0:  KOKKOS_INTERNAL_NOP2;  break;
+      case 1:  KOKKOS_INTERNAL_NOP4;  break;
+      case 2:  KOKKOS_INTERNAL_NOP8;  break;
+      case 3:  KOKKOS_INTERNAL_NOP16; break;
+      default: KOKKOS_INTERNAL_NOP32;
+      }
+      KOKKOS_INTERNAL_PAUSE;
+    }
+    }
   #endif
-#elif defined ( KOKKOS_ENABLE_WINTHREAD )
-  #include <process.h>
-  #define YIELD  Sleep(0)
-#elif defined ( _WIN32)  && defined (_MSC_VER)
-  /* Windows w/ Visual Studio */
-  #define NOMINMAX
-  #include <winsock2.h>
-  #include <windows.h>
-#define YIELD YieldProcessor();
-#elif defined ( _WIN32 )
-  /* Windows w/ Intel*/
-  #define YIELD __asm__ __volatile__("pause\n":::"memory")
-#else
-  #include <sched.h>
-  #define YIELD  sched_yield()
 #endif
 
+
 /*--------------------------------------------------------------------------*/
 
 namespace Kokkos {
 namespace Impl {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value )
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value == flag ) {
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value != flag ) {
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
 {
+  Kokkos::store_fence();
+  unsigned i = 0;
   while ( value == flag ) {
-    YIELD ;
+    kokkos_internal_yield(i);
+    ++i;
+  }
+  Kokkos::load_fence();
+}
+
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
+{
+  Kokkos::store_fence();
+  unsigned i = 0;
+  while ( value != flag ) {
+    kokkos_internal_yield(i);
+    ++i;
   }
+  Kokkos::load_fence();
 }
+
 #endif
 
 } /* namespace Impl */
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
index cc87771faefcb8ad7716842890dbec4a9c1219a1..6e34b8a943d164eea1af317be66928a26a9e4ab2 100644
--- a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,14 +47,30 @@
 
 #include <Kokkos_Macros.hpp>
 
+#include <cstdint>
+
 namespace Kokkos {
 namespace Impl {
 
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-void spinwait( volatile int & flag , const int value );
+
+void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
+void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
+
+void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
+void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
 #else
+
+KOKKOS_INLINE_FUNCTION
+void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
+KOKKOS_INLINE_FUNCTION
+void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
+
+KOKKOS_INLINE_FUNCTION
+void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
 KOKKOS_INLINE_FUNCTION
-void spinwait( volatile int & , const int ) {}
+void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
+
 #endif
 
 } /* namespace Impl */
diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt
index 795657fe876233c8ef7f962bdce12be4d0452e2f..caf6c50129f090cd13cd92e67a79880949e821a1 100644
--- a/lib/kokkos/core/unit_test/CMakeLists.txt
+++ b/lib/kokkos/core/unit_test/CMakeLists.txt
@@ -115,10 +115,31 @@ IF(Kokkos_ENABLE_OpenMP)
   )
 ENDIF()
 
-IF(Kokkos_ENABLE_QTHREAD)
+IF(Kokkos_ENABLE_Qthreads)
   TRIBITS_ADD_EXECUTABLE_AND_TEST(
-    UnitTest_Qthread
-    SOURCES UnitTestMain.cpp TestQthread.cpp
+    UnitTest_Qthreads
+    SOURCES
+      UnitTestMain.cpp
+      qthreads/TestQthreads_Atomics.cpp
+      qthreads/TestQthreads_Other.cpp
+      qthreads/TestQthreads_Reductions.cpp
+      qthreads/TestQthreads_SubView_a.cpp
+      qthreads/TestQthreads_SubView_b.cpp
+      qthreads/TestQthreads_SubView_c01.cpp
+      qthreads/TestQthreads_SubView_c02.cpp
+      qthreads/TestQthreads_SubView_c03.cpp
+      qthreads/TestQthreads_SubView_c04.cpp
+      qthreads/TestQthreads_SubView_c05.cpp
+      qthreads/TestQthreads_SubView_c06.cpp
+      qthreads/TestQthreads_SubView_c07.cpp
+      qthreads/TestQthreads_SubView_c08.cpp
+      qthreads/TestQthreads_SubView_c09.cpp
+      qthreads/TestQthreads_SubView_c10.cpp
+      qthreads/TestQthreads_SubView_c11.cpp
+      qthreads/TestQthreads_SubView_c12.cpp
+      qthreads/TestQthreads_Team.cpp
+      qthreads/TestQthreads_ViewAPI_a.cpp
+      qthreads/TestQthreads_ViewAPI_b.cpp
     COMM serial mpi
     NUM_MPI_PROCS 1
     FAIL_REGULAR_EXPRESSION "  FAILED  "
@@ -194,4 +215,3 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
   FAIL_REGULAR_EXPRESSION "  FAILED  "
     TESTONLYLIBS kokkos_gtest
 )
-
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
index cc59825fba85d17b67c0694de1198acd240587d9..d93830a28d9db5ae50306c70ae5187062a07c594 100644
--- a/lib/kokkos/core/unit_test/Makefile
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -6,6 +6,7 @@ vpath %.cpp ${KOKKOS_PATH}/core/unit_test
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
 vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
 
 TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
@@ -35,15 +36,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	OBJ_CUDA = TestCuda_Other.o TestCuda_Reductions_a.o TestCuda_Reductions_b.o TestCuda_Atomics.o TestCuda_Team.o TestCuda_Spaces.o
 	OBJ_CUDA += TestCuda_SubView_a.o TestCuda_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestCuda_SubView_c_all.o
+	OBJ_CUDA += TestCuda_SubView_c_all.o
 else
 	OBJ_CUDA += TestCuda_SubView_c01.o TestCuda_SubView_c02.o TestCuda_SubView_c03.o
-	OBJ_CUDA += TestCuda_SubView_c04.o  TestCuda_SubView_c05.o  TestCuda_SubView_c06.o
-	OBJ_CUDA += TestCuda_SubView_c07.o  TestCuda_SubView_c08.o  TestCuda_SubView_c09.o
+	OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o
+	OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
 	OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
 endif
-	OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o  TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o
-	OBJ_CUDA += TestCuda_ViewAPI_e.o TestCuda_ViewAPI_f.o  TestCuda_ViewAPI_g.o TestCuda_ViewAPI_h.o
+	OBJ_CUDA += TestCuda_ViewAPI_a.o TestCuda_ViewAPI_b.o TestCuda_ViewAPI_c.o TestCuda_ViewAPI_d.o
+	OBJ_CUDA += TestCuda_ViewAPI_e.o TestCuda_ViewAPI_f.o TestCuda_ViewAPI_g.o TestCuda_ViewAPI_h.o
 	OBJ_CUDA += TestCuda_ViewAPI_s.o
 	OBJ_CUDA += UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Cuda
@@ -51,13 +52,13 @@ endif
 endif
 
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-	OBJ_THREADS = TestThreads_Other.o TestThreads_Reductions.o TestThreads_Atomics.o TestThreads_Team.o 
-	OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o 
+	OBJ_THREADS = TestThreads_Other.o TestThreads_Reductions.o TestThreads_Atomics.o TestThreads_Team.o
+	OBJ_THREADS += TestThreads_SubView_a.o TestThreads_SubView_b.o
 	OBJ_THREADS += TestThreads_SubView_c01.o TestThreads_SubView_c02.o TestThreads_SubView_c03.o
-	OBJ_THREADS += TestThreads_SubView_c04.o  TestThreads_SubView_c05.o  TestThreads_SubView_c06.o  
-	OBJ_THREADS += TestThreads_SubView_c07.o  TestThreads_SubView_c08.o  TestThreads_SubView_c09.o
+	OBJ_THREADS += TestThreads_SubView_c04.o TestThreads_SubView_c05.o TestThreads_SubView_c06.o
+	OBJ_THREADS += TestThreads_SubView_c07.o TestThreads_SubView_c08.o TestThreads_SubView_c09.o
 	OBJ_THREADS += TestThreads_SubView_c10.o TestThreads_SubView_c11.o TestThreads_SubView_c12.o
-	OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o UnitTestMain.o gtest-all.o 
+	OBJ_THREADS += TestThreads_ViewAPI_a.o TestThreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Threads
 	TEST_TARGETS += test-threads
 endif
@@ -66,11 +67,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 	OBJ_OPENMP = TestOpenMP_Other.o TestOpenMP_Reductions.o TestOpenMP_Atomics.o TestOpenMP_Team.o
 	OBJ_OPENMP += TestOpenMP_SubView_a.o TestOpenMP_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestOpenMP_SubView_c_all.o
+	OBJ_OPENMP += TestOpenMP_SubView_c_all.o
 else
 	OBJ_OPENMP += TestOpenMP_SubView_c01.o TestOpenMP_SubView_c02.o TestOpenMP_SubView_c03.o
-	OBJ_OPENMP += TestOpenMP_SubView_c04.o  TestOpenMP_SubView_c05.o  TestOpenMP_SubView_c06.o
-	OBJ_OPENMP += TestOpenMP_SubView_c07.o  TestOpenMP_SubView_c08.o  TestOpenMP_SubView_c09.o
+	OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o
+	OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
 	OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
 endif
 	OBJ_OPENMP += TestOpenMP_ViewAPI_a.o TestOpenMP_ViewAPI_b.o UnitTestMain.o gtest-all.o
@@ -78,28 +79,38 @@ endif
 	TEST_TARGETS += test-openmp
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
+	OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
+	OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
+ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
+	OBJ_QTHREADS += TestQthreads_SubView_c_all.o
+else
+	OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
+	OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
+	OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
+	OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
+endif
+	OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Qthreads
+	TEST_TARGETS += test-qthreads
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-	OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o 
-	OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o 
+	OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o
+	OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
 ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
-        OBJ_OPENMP += TestSerial_SubView_c_all.o
+	OBJ_SERIAL += TestSerial_SubView_c_all.o
 else
 	OBJ_SERIAL += TestSerial_SubView_c01.o TestSerial_SubView_c02.o TestSerial_SubView_c03.o
-	OBJ_SERIAL += TestSerial_SubView_c04.o  TestSerial_SubView_c05.o  TestSerial_SubView_c06.o  
-	OBJ_SERIAL += TestSerial_SubView_c07.o  TestSerial_SubView_c08.o  TestSerial_SubView_c09.o
+	OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o
+	OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
 	OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
 endif
-	OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o UnitTestMain.o gtest-all.o 
+	OBJ_SERIAL += TestSerial_ViewAPI_a.o TestSerial_ViewAPI_b.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosCore_UnitTest_Serial
 	TEST_TARGETS += test-serial
 endif
 
-ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
-	OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
-	TARGETS += KokkosCore_UnitTest_Qthread
-	TEST_TARGETS += test-qthread
-endif
-
 OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
 TARGETS += KokkosCore_UnitTest_HWLOC
 TEST_TARGETS += test-hwloc
@@ -115,10 +126,6 @@ TARGETS += ${INITTESTS_TARGETS}
 INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
 TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
 
-OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
-TARGETS += KokkosCore_UnitTest_Synchronic
-TEST_TARGETS += test-synchronic
-
 KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
 
@@ -131,8 +138,8 @@ KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
 KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
 
-KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
+KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthreads
 
 KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
@@ -146,9 +153,6 @@ KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
 ${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
-KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
-	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
-
 test-cuda: KokkosCore_UnitTest_Cuda
 	./KokkosCore_UnitTest_Cuda
 
@@ -161,8 +165,8 @@ test-openmp: KokkosCore_UnitTest_OpenMP
 test-serial: KokkosCore_UnitTest_Serial
 	./KokkosCore_UnitTest_Serial
 
-test-qthread: KokkosCore_UnitTest_Qthread
-	./KokkosCore_UnitTest_Qthread
+test-qthreads: KokkosCore_UnitTest_Qthreads
+	./KokkosCore_UnitTest_Qthreads
 
 test-hwloc: KokkosCore_UnitTest_HWLOC
 	./KokkosCore_UnitTest_HWLOC
@@ -176,9 +180,6 @@ test-default: KokkosCore_UnitTest_Default
 ${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
 	./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
 
-test-synchronic: KokkosCore_UnitTest_Synchronic
-	./KokkosCore_UnitTest_Synchronic
-
 build_all: $(TARGETS)
 
 test: $(TEST_TARGETS)
@@ -193,4 +194,3 @@ clean: kokkos-clean
 
 gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
-
diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp
index d22837f3ed7b67bccecfbe11ba4d71266a094616..f09cc5018cb698ec033639a326a29d8fffacec3f 100644
--- a/lib/kokkos/core/unit_test/TestAggregate.hpp
+++ b/lib/kokkos/core/unit_test/TestAggregate.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,8 +50,6 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 #include <impl/Kokkos_ViewArray.hpp>
 
 namespace Test {
@@ -59,51 +57,68 @@ namespace Test {
 template< class DeviceType >
 void TestViewAggregate()
 {
-  typedef Kokkos::Array<double,32>  value_type ;
-
-  typedef Kokkos::Experimental::Impl::
-    ViewDataAnalysis< value_type * , Kokkos::LayoutLeft , value_type >
-      analysis_1d ;
+  typedef Kokkos::Array< double, 32 >  value_type;
+  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< value_type *, Kokkos::LayoutLeft, value_type > analysis_1d;
 
-  static_assert( std::is_same< typename analysis_1d::specialize , Kokkos::Array<> >::value , "" );
+  static_assert( std::is_same< typename analysis_1d::specialize, Kokkos::Array<> >::value, "" );
 
+  typedef Kokkos::ViewTraits< value_type **, DeviceType > a32_traits;
+  typedef Kokkos::ViewTraits< typename a32_traits::scalar_array_type, DeviceType > flat_traits;
 
-  typedef Kokkos::ViewTraits< value_type ** , DeviceType > a32_traits ;
-  typedef Kokkos::ViewTraits< typename a32_traits::scalar_array_type , DeviceType > flat_traits ;
+  static_assert( std::is_same< typename a32_traits::specialize, Kokkos::Array<> >::value, "" );
+  static_assert( std::is_same< typename a32_traits::value_type, value_type >::value, "" );
+  static_assert( a32_traits::rank == 2, "" );
+  static_assert( a32_traits::rank_dynamic == 2, "" );
 
-  static_assert( std::is_same< typename a32_traits::specialize , Kokkos::Array<> >::value , "" );
-  static_assert( std::is_same< typename a32_traits::value_type , value_type >::value , "" );
-  static_assert( a32_traits::rank == 2 , "" );
-  static_assert( a32_traits::rank_dynamic == 2 , "" );
+  static_assert( std::is_same< typename flat_traits::specialize, void >::value, "" );
+  static_assert( flat_traits::rank == 3, "" );
+  static_assert( flat_traits::rank_dynamic == 2, "" );
+  static_assert( flat_traits::dimension::N2 == 32, "" );
 
-  static_assert( std::is_same< typename flat_traits::specialize , void >::value , "" );
-  static_assert( flat_traits::rank == 3 , "" );
-  static_assert( flat_traits::rank_dynamic == 2 , "" );
-  static_assert( flat_traits::dimension::N2 == 32 , "" );
+  typedef Kokkos::View< Kokkos::Array< double, 32 > **, DeviceType > a32_type;
+  typedef typename a32_type::array_type  a32_flat_type;
 
+  static_assert( std::is_same< typename a32_type::value_type, value_type >::value, "" );
+  static_assert( std::is_same< typename a32_type::pointer_type, double * >::value, "" );
+  static_assert( a32_type::Rank == 2, "" );
+  static_assert( a32_flat_type::Rank == 3, "" );
 
-  typedef Kokkos::View< Kokkos::Array<double,32> ** , DeviceType > a32_type ;
-
-  typedef typename a32_type::array_type  a32_flat_type ;
-
-  static_assert( std::is_same< typename a32_type::value_type , value_type >::value , "" );
-  static_assert( std::is_same< typename a32_type::pointer_type , double * >::value , "" );
-  static_assert( a32_type::Rank == 2 , "" );
-  static_assert( a32_flat_type::Rank == 3 , "" );
-
-  a32_type x("test",4,5);
+  a32_type x( "test", 4, 5 );
   a32_flat_type y( x );
 
-  ASSERT_EQ( x.extent(0) , 4 );
-  ASSERT_EQ( x.extent(1) , 5 );
-  ASSERT_EQ( y.extent(0) , 4 );
-  ASSERT_EQ( y.extent(1) , 5 );
-  ASSERT_EQ( y.extent(2) , 32 );
-}
-
+  ASSERT_EQ( x.extent( 0 ), 4 );
+  ASSERT_EQ( x.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 0 ), 4 );
+  ASSERT_EQ( y.extent( 1 ), 5 );
+  ASSERT_EQ( y.extent( 2 ), 32 );
+
+  // Initialize arrays from brace-init-list as for std::array.
+  //
+  // Comment: Clang will issue the following warning if we don't use double
+  //          braces here (one for initializing the Kokkos::Array and one for
+  //          initializing the sub-aggregate C-array data member),
+  //
+  //            warning: suggest braces around initialization of subobject
+  //
+  //          but single brace syntax would be valid as well.
+  Kokkos::Array< float, 2 > aggregate_initialization_syntax_1 = { { 1.41, 3.14 } };
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[0], 1.41 );
+  ASSERT_FLOAT_EQ( aggregate_initialization_syntax_1[1], 3.14 );
+
+  Kokkos::Array< int, 3 > aggregate_initialization_syntax_2{ { 0, 1, 2 } }; // since C++11
+  for ( int i = 0; i < 3; ++i ) {
+    ASSERT_EQ( aggregate_initialization_syntax_2[i], i );
+  }
+
+  // Note that this is a valid initialization.
+  Kokkos::Array< double, 3 > initialized_with_one_argument_missing = { { 255, 255 } };
+  for (int i = 0; i < 2; ++i) {
+    ASSERT_DOUBLE_EQ( initialized_with_one_argument_missing[i], 255 );
+  }
+  // But the following line would not compile
+//  Kokkos::Array< double, 3 > initialized_with_too_many{ { 1, 2, 3, 4 } };
 }
 
-/*--------------------------------------------------------------------------*/
-/*--------------------------------------------------------------------------*/
+} // namespace Test
 
 #endif /* #ifndef TEST_AGGREGATE_HPP */
diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp
index e948723574b48b2a64ee66c487062e34c0ccf29b..ff77b8dca6f0437393bacca9d42ed73d359e44d5 100644
--- a/lib/kokkos/core/unit_test/TestAtomic.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomic.hpp
@@ -45,116 +45,129 @@
 
 namespace TestAtomic {
 
-// Struct for testing arbitrary size atomics
+// Struct for testing arbitrary size atomics.
 
-template<int N>
+template< int N >
 struct SuperScalar {
   double val[N];
 
   KOKKOS_INLINE_FUNCTION
   SuperScalar() {
-    for(int i=0; i<N; i++)
+    for ( int i = 0; i < N; i++ ) {
       val[i] = 0.0;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const volatile SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator = (const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator = (const volatile SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator=( const volatile SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator = (const SuperScalar& src) volatile  {
-    for(int i=0; i<N; i++)
+  void operator=( const SuperScalar & src ) volatile  {
+    for ( int i = 0; i < N; i++ ) {
       val[i] = src.val[i];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar operator + (const SuperScalar& src) {
+  SuperScalar operator+( const SuperScalar & src ) {
     SuperScalar tmp = *this;
-    for(int i=0; i<N; i++)
+    for ( int i = 0; i < N; i++ ) {
       tmp.val[i] += src.val[i];
+    }
     return tmp;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator += (const double& src) {
-    for(int i=0; i<N; i++)
-      val[i] += 1.0*(i+1)*src;
+  SuperScalar& operator+=( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] += 1.0 * ( i + 1 ) * src;
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  SuperScalar& operator += (const SuperScalar& src) {
-    for(int i=0; i<N; i++)
+  SuperScalar& operator+=( const SuperScalar & src ) {
+    for ( int i = 0; i < N; i++ ) {
       val[i] += src.val[i];
+    }
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const SuperScalar& src) {
+  bool operator==( const SuperScalar & src ) {
     bool compare = true;
-    for(int i=0; i<N; i++)
-      compare = compare && ( val[i] == src.val[i]);
+    for( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
     return compare;
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const SuperScalar& src) {
+  bool operator!=( const SuperScalar & src ) {
     bool compare = true;
-    for(int i=0; i<N; i++)
-      compare = compare && ( val[i] == src.val[i]);
+    for ( int i = 0; i < N; i++ ) {
+      compare = compare && ( val[i] == src.val[i] );
+    }
     return !compare;
   }
 
-
-
   KOKKOS_INLINE_FUNCTION
-  SuperScalar(const double& src) {
-    for(int i=0; i<N; i++)
-      val[i] = 1.0 * (i+1) * src;
+  SuperScalar( const double & src ) {
+    for ( int i = 0; i < N; i++ ) {
+      val[i] = 1.0 * ( i + 1 ) * src;
+    }
   }
-
 };
 
-template<int N>
-std::ostream& operator<<(std::ostream& os, const SuperScalar<N>& dt)
+template< int N >
+std::ostream & operator<<( std::ostream & os, const SuperScalar< N > & dt )
 {
-    os << "{ ";
-    for(int i=0;i<N-1;i++)
-       os << dt.val[i] << ", ";
-    os << dt.val[N-1] << "}";
-    return os;
+  os << "{ ";
+  for ( int  i = 0; i < N - 1; i++ ) {
+     os << dt.val[i] << ", ";
+  }
+  os << dt.val[N-1] << "}";
+
+  return os;
 }
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct ZeroFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = 0;
   }
 };
@@ -163,47 +176,53 @@ struct ZeroFunctor {
 //--------------atomic_fetch_add---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct AddFunctor{
+template< class T, class DEVICE_TYPE >
+struct AddFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_add(&data(),(T)1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_add( &data(), (T) 1 );
   }
 };
 
-template<class T, class execution_space >
-T AddLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T AddLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct AddFunctor<T,execution_space> f_add;
+  struct AddFunctor< T, execution_space > f_add;
+
   f_add.data = data;
-  Kokkos::parallel_for(loop,f_add);
+  Kokkos::parallel_for( loop, f_add );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T AddLoopSerial(int loop) {
+template< class T >
+T AddLoopSerial( int loop ) {
   T* data = new T[1];
   data[0] = 0;
 
-  for(int i=0;i<loop;i++)
-  *data+=(T)1;
+  for ( int i = 0; i < loop; i++ ) {
+    *data += (T) 1;
+  }
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
@@ -211,65 +230,69 @@ T AddLoopSerial(int loop) {
 //--------------atomic_compare_exchange-----------------
 //------------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct CASFunctor{
+template< class T, class DEVICE_TYPE >
+struct CASFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-	  T old = data();
-	  T newval, assumed;
-	  do {
-	    assumed = old;
-	    newval = assumed + (T)1;
-	    old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
-	  }
-	  while( old != assumed );
+  void operator()( int ) const {
+    T old = data();
+    T newval, assumed;
+
+    do {
+      assumed = old;
+      newval = assumed + (T) 1;
+      old = Kokkos::atomic_compare_exchange( &data(), assumed, newval );
+    } while( old != assumed );
   }
 };
 
-template<class T, class execution_space >
-T CASLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T CASLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct CASFunctor<T,execution_space> f_cas;
+  struct CASFunctor< T, execution_space > f_cas;
+
   f_cas.data = data;
-  Kokkos::parallel_for(loop,f_cas);
+  Kokkos::parallel_for( loop, f_cas );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
 
   return val;
 }
 
-template<class T>
-T CASLoopSerial(int loop) {
+template< class T >
+T CASLoopSerial( int loop ) {
   T* data = new T[1];
   data[0] = 0;
 
-  for(int i=0;i<loop;i++) {
-	  T assumed;
-	  T newval;
-	  T old;
-	  do {
-	    assumed = *data;
-	    newval = assumed + (T)1;
-	    old = *data;
-	    *data = newval;
-	  }
-	  while(!(assumed==old));
+  for ( int i = 0; i < loop; i++ ) {
+    T assumed;
+    T newval;
+    T old;
+
+    do {
+      assumed = *data;
+      newval = assumed + (T) 1;
+      old = *data;
+      *data = newval;
+    } while( !( assumed == old ) );
   }
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
@@ -277,109 +300,119 @@ T CASLoopSerial(int loop) {
 //--------------atomic_exchange-----------------
 //----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct ExchFunctor{
+template< class T, class DEVICE_TYPE >
+struct ExchFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data, data2;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int i) const {
-    T old = Kokkos::atomic_exchange(&data(),(T)i);
-    Kokkos::atomic_fetch_add(&data2(),old);
+  void operator()( int i ) const {
+    T old = Kokkos::atomic_exchange( &data(), (T) i );
+    Kokkos::atomic_fetch_add( &data2(), old );
   }
 };
 
-template<class T, class execution_space >
-T ExchLoop(int loop) {
-  struct ZeroFunctor<T,execution_space> f_zero;
-  typename ZeroFunctor<T,execution_space>::type data("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T ExchLoop( int loop ) {
+  struct ZeroFunctor< T, execution_space > f_zero;
+  typename ZeroFunctor< T, execution_space >::type data( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_zero.data = data;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  typename ZeroFunctor<T,execution_space>::type data2("Data");
-  typename ZeroFunctor<T,execution_space>::h_type h_data2("HData");
+  typename ZeroFunctor< T, execution_space >::type data2( "Data" );
+  typename ZeroFunctor< T, execution_space >::h_type h_data2( "HData" );
+
   f_zero.data = data2;
-  Kokkos::parallel_for(1,f_zero);
+  Kokkos::parallel_for( 1, f_zero );
   execution_space::fence();
 
-  struct ExchFunctor<T,execution_space> f_exch;
+  struct ExchFunctor< T, execution_space > f_exch;
+
   f_exch.data = data;
   f_exch.data2 = data2;
-  Kokkos::parallel_for(loop,f_exch);
+  Kokkos::parallel_for( loop, f_exch );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
-  Kokkos::deep_copy(h_data2,data2);
+  Kokkos::deep_copy( h_data, data );
+  Kokkos::deep_copy( h_data2, data2 );
   T val = h_data() + h_data2();
 
   return val;
 }
 
-template<class T>
-T ExchLoopSerial(typename std::conditional<!std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+template< class T >
+T ExchLoopSerial( typename std::conditional< !std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
   T* data = new T[1];
   T* data2 = new T[1];
   data[0] = 0;
   data2[0] = 0;
-  for(int i=0;i<loop;i++) {
-	T old = *data;
-	*data=(T) i;
-	*data2+=old;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    *data = (T) i;
+    *data2 += old;
   }
 
   T val = *data2 + *data;
   delete [] data;
   delete [] data2;
+
   return val;
 }
 
-template<class T>
-T ExchLoopSerial(typename std::conditional<std::is_same<T,Kokkos::complex<double> >::value,int,void>::type loop) {
+template< class T >
+T ExchLoopSerial( typename std::conditional< std::is_same< T, Kokkos::complex<double> >::value, int, void >::type loop ) {
   T* data = new T[1];
   T* data2 = new T[1];
   data[0] = 0;
   data2[0] = 0;
-  for(int i=0;i<loop;i++) {
-  T old = *data;
-  data->real() = (static_cast<double>(i));
-  data->imag() = 0;
-  *data2+=old;
+
+  for ( int i = 0; i < loop; i++ ) {
+    T old = *data;
+    data->real() = ( static_cast<double>( i ) );
+    data->imag() = 0;
+    *data2 += old;
   }
 
   T val = *data2 + *data;
   delete [] data;
   delete [] data2;
+
   return val;
 }
 
-template<class T, class DeviceType >
-T LoopVariant(int loop, int test) {
-  switch (test) {
-    case 1: return AddLoop<T,DeviceType>(loop);
-    case 2: return CASLoop<T,DeviceType>(loop);
-    case 3: return ExchLoop<T,DeviceType>(loop);
+template< class T, class DeviceType >
+T LoopVariant( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoop< T, DeviceType >( loop );
+    case 2: return CASLoop< T, DeviceType >( loop );
+    case 3: return ExchLoop< T, DeviceType >( loop );
   }
+
   return 0;
 }
 
-template<class T>
-T LoopVariantSerial(int loop, int test) {
-  switch (test) {
-    case 1: return AddLoopSerial<T>(loop);
-    case 2: return CASLoopSerial<T>(loop);
-    case 3: return ExchLoopSerial<T>(loop);
+template< class T >
+T LoopVariantSerial( int loop, int test ) {
+  switch ( test ) {
+    case 1: return AddLoopSerial< T >( loop );
+    case 2: return CASLoopSerial< T >( loop );
+    case 3: return ExchLoopSerial< T >( loop );
   }
+
   return 0;
 }
 
-template<class T,class DeviceType>
-bool Loop(int loop, int test)
+template< class T, class DeviceType >
+bool Loop( int loop, int test )
 {
-  T res       = LoopVariant<T,DeviceType>(loop,test);
-  T resSerial = LoopVariantSerial<T>(loop,test);
+  T res       = LoopVariant< T, DeviceType >( loop, test );
+  T resSerial = LoopVariantSerial< T >( loop, test );
 
   bool passed = true;
 
@@ -387,16 +420,14 @@ bool Loop(int loop, int test)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = "
               << test << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-
-  return passed ;
-}
-
+  return passed;
 }
 
+} // namespace TestAtomic
diff --git a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
index 7f1519045187c535c586659e757eeb24609ccb50..e3ceca404ff12c1c9e5da04bf70d183fee87dfdd 100644
--- a/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicOperations.hpp
@@ -49,14 +49,16 @@ namespace TestAtomicOperations {
 //--------------zero_functor---------------------
 //-----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct ZeroFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = 0;
   }
 };
@@ -65,78 +67,84 @@ struct ZeroFunctor {
 //--------------init_functor---------------------
 //-----------------------------------------------
 
-template<class T,class DEVICE_TYPE>
+template< class T, class DEVICE_TYPE >
 struct InitFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<T,execution_space> type;
-  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  typedef typename Kokkos::View< T, execution_space > type;
+  typedef typename Kokkos::View< T, execution_space >::HostMirror h_type;
+
   type data;
-  T init_value ;
+  T init_value;
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
+  void operator()( int ) const {
     data() = init_value;
   }
 
-  InitFunctor(T _init_value) : init_value(_init_value) {}
+  InitFunctor( T _init_value ) : init_value( _init_value ) {}
 };
 
-
 //---------------------------------------------------
 //--------------atomic_fetch_max---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MaxFunctor{
+template< class T, class DEVICE_TYPE >
+struct MaxFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    //Kokkos::atomic_fetch_max(&data(),(T)1);
-    Kokkos::atomic_fetch_max(&data(),(T)i1);
+  void operator()( int ) const {
+    //Kokkos::atomic_fetch_max( &data(), (T) 1 );
+    Kokkos::atomic_fetch_max( &data(), (T) i1 );
   }
-  MaxFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+  MaxFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MaxAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MaxAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MaxFunctor<T,execution_space> f(i0,i1);
+  struct MaxFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MaxAtomicCheck(T i0 , T i1) {
+template< class T >
+T MaxAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = (i0 > i1 ? i0 : i1) ;
+  *data = ( i0 > i1 ? i0 : i1 );
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MaxAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MaxAtomicTest( T i0, T i1 )
 {
-  T res       = MaxAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MaxAtomicCheck<T>(i0,i1);
+  T res       = MaxAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MaxAtomicCheck<T>( i0, i1 );
 
   bool passed = true;
 
@@ -144,71 +152,77 @@ bool MaxAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MaxAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_min---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MinFunctor{
+template< class T, class DEVICE_TYPE >
+struct MinFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_min(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_min( &data(), (T) i1 );
   }
-  MinFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  MinFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MinAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MinAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MinFunctor<T,execution_space> f(i0,i1);
+  struct MinFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MinAtomicCheck(T i0 , T i1) {
+template< class T >
+T MinAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = (i0 < i1 ? i0 : i1) ;
+  *data = ( i0 < i1 ? i0 : i1 );
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MinAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MinAtomicTest( T i0, T i1 )
 {
-  T res       = MinAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MinAtomicCheck<T>(i0,i1);
+  T res       = MinAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MinAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -216,55 +230,60 @@ bool MinAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MinAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_increment---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct IncFunctor{
+template< class T, class DEVICE_TYPE >
+struct IncFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_increment(&data());
+  void operator()( int ) const {
+    Kokkos::atomic_increment( &data() );
   }
-  IncFunctor( T _i0 ) : i0(_i0) {}
+
+  IncFunctor( T _i0 ) : i0( _i0 ) {}
 };
 
-template<class T, class execution_space >
-T IncAtomic(T i0) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T IncAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct IncFunctor<T,execution_space> f(i0);
+  struct IncFunctor< T, execution_space > f( i0 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T IncAtomicCheck(T i0) {
+template< class T >
+T IncAtomicCheck( T i0 ) {
   T* data = new T[1];
   data[0] = 0;
 
@@ -272,14 +291,15 @@ T IncAtomicCheck(T i0) {
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool IncAtomicTest(T i0)
+template< class T, class DeviceType >
+bool IncAtomicTest( T i0 )
 {
-  T res       = IncAtomic<T,DeviceType>(i0);
-  T resSerial = IncAtomicCheck<T>(i0);
+  T res       = IncAtomic< T, DeviceType >( i0 );
+  T resSerial = IncAtomicCheck< T >( i0 );
 
   bool passed = true;
 
@@ -287,55 +307,60 @@ bool IncAtomicTest(T i0)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = IncAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_decrement---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct DecFunctor{
+template< class T, class DEVICE_TYPE >
+struct DecFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_decrement(&data());
+  void operator()( int ) const {
+    Kokkos::atomic_decrement( &data() );
   }
-  DecFunctor( T _i0 ) : i0(_i0) {}
+
+  DecFunctor( T _i0 ) : i0( _i0 ) {}
 };
 
-template<class T, class execution_space >
-T DecAtomic(T i0) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T DecAtomic( T i0 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct DecFunctor<T,execution_space> f(i0);
+  struct DecFunctor< T, execution_space > f( i0 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T DecAtomicCheck(T i0) {
+template< class T >
+T DecAtomicCheck( T i0 ) {
   T* data = new T[1];
   data[0] = 0;
 
@@ -343,14 +368,15 @@ T DecAtomicCheck(T i0) {
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool DecAtomicTest(T i0)
+template< class T, class DeviceType >
+bool DecAtomicTest( T i0 )
 {
-  T res       = DecAtomic<T,DeviceType>(i0);
-  T resSerial = DecAtomicCheck<T>(i0);
+  T res       = DecAtomic< T, DeviceType >( i0 );
+  T resSerial = DecAtomicCheck< T >( i0 );
 
   bool passed = true;
 
@@ -358,71 +384,77 @@ bool DecAtomicTest(T i0)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DecAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_mul---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct MulFunctor{
+template< class T, class DEVICE_TYPE >
+struct MulFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_mul(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mul( &data(), (T) i1 );
   }
-  MulFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  MulFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T MulAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T MulAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct MulFunctor<T,execution_space> f(i0,i1);
+  struct MulFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T MulAtomicCheck(T i0 , T i1) {
+template< class T >
+T MulAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0*i1 ;
+  *data = i0*i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool MulAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool MulAtomicTest( T i0, T i1 )
 {
-  T res       = MulAtomic<T,DeviceType>(i0,i1);
-  T resSerial = MulAtomicCheck<T>(i0,i1);
+  T res       = MulAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = MulAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -430,71 +462,77 @@ bool MulAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MulAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_div---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct DivFunctor{
+template< class T, class DEVICE_TYPE >
+struct DivFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_div(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_div( &data(), (T) i1 );
   }
-  DivFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  DivFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T DivAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T DivAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct DivFunctor<T,execution_space> f(i0,i1);
+  struct DivFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T DivAtomicCheck(T i0 , T i1) {
+template< class T >
+T DivAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0/i1 ;
+  *data = i0 / i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool DivAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool DivAtomicTest( T i0, T i1 )
 {
-  T res       = DivAtomic<T,DeviceType>(i0,i1);
-  T resSerial = DivAtomicCheck<T>(i0,i1);
+  T res       = DivAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = DivAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -502,71 +540,77 @@ bool DivAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DivAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_mod---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct ModFunctor{
+template< class T, class DEVICE_TYPE >
+struct ModFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_mod(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_mod( &data(), (T) i1 );
   }
-  ModFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  ModFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T ModAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T ModAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct ModFunctor<T,execution_space> f(i0,i1);
+  struct ModFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T ModAtomicCheck(T i0 , T i1) {
+template< class T >
+T ModAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0%i1 ;
+  *data = i0 % i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool ModAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool ModAtomicTest( T i0, T i1 )
 {
-  T res       = ModAtomic<T,DeviceType>(i0,i1);
-  T resSerial = ModAtomicCheck<T>(i0,i1);
+  T res       = ModAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = ModAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -574,71 +618,77 @@ bool ModAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = ModAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_and---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct AndFunctor{
+template< class T, class DEVICE_TYPE >
+struct AndFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_and(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_and( &data(), (T) i1 );
   }
-  AndFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  AndFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T AndAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T AndAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct AndFunctor<T,execution_space> f(i0,i1);
+  struct AndFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T AndAtomicCheck(T i0 , T i1) {
+template< class T >
+T AndAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0&i1 ;
+  *data = i0 & i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool AndAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool AndAtomicTest( T i0, T i1 )
 {
-  T res       = AndAtomic<T,DeviceType>(i0,i1);
-  T resSerial = AndAtomicCheck<T>(i0,i1);
+  T res       = AndAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = AndAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -646,71 +696,77 @@ bool AndAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = AndAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_or----------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct OrFunctor{
+template< class T, class DEVICE_TYPE >
+struct OrFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_or(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_or( &data(), (T) i1 );
   }
-  OrFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  OrFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T OrAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T OrAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct OrFunctor<T,execution_space> f(i0,i1);
+  struct OrFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T OrAtomicCheck(T i0 , T i1) {
+template< class T >
+T OrAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0|i1 ;
+  *data = i0 | i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool OrAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool OrAtomicTest( T i0, T i1 )
 {
-  T res       = OrAtomic<T,DeviceType>(i0,i1);
-  T resSerial = OrAtomicCheck<T>(i0,i1);
+  T res       = OrAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = OrAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -718,71 +774,77 @@ bool OrAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = OrAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_xor---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct XorFunctor{
+template< class T, class DEVICE_TYPE >
+struct XorFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_xor(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_xor( &data(), (T) i1 );
   }
-  XorFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  XorFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T XorAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T XorAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct XorFunctor<T,execution_space> f(i0,i1);
+  struct XorFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T XorAtomicCheck(T i0 , T i1) {
+template< class T >
+T XorAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0^i1 ;
+  *data = i0 ^ i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool XorAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool XorAtomicTest( T i0, T i1 )
 {
-  T res       = XorAtomic<T,DeviceType>(i0,i1);
-  T resSerial = XorAtomicCheck<T>(i0,i1);
+  T res       = XorAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = XorAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -790,71 +852,77 @@ bool XorAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = XorAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_lshift---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct LShiftFunctor{
+template< class T, class DEVICE_TYPE >
+struct LShiftFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_lshift(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_lshift( &data(), (T) i1 );
   }
-  LShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  LShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T LShiftAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T LShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct LShiftFunctor<T,execution_space> f(i0,i1);
+  struct LShiftFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T LShiftAtomicCheck(T i0 , T i1) {
+template< class T >
+T LShiftAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0<<i1 ;
+  *data = i0 << i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool LShiftAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool LShiftAtomicTest( T i0, T i1 )
 {
-  T res       = LShiftAtomic<T,DeviceType>(i0,i1);
-  T resSerial = LShiftAtomicCheck<T>(i0,i1);
+  T res       = LShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = LShiftAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -862,71 +930,77 @@ bool LShiftAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = LShiftAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
 //---------------------------------------------------
 //--------------atomic_fetch_rshift---------------------
 //---------------------------------------------------
 
-template<class T,class DEVICE_TYPE>
-struct RShiftFunctor{
+template< class T, class DEVICE_TYPE >
+struct RShiftFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef Kokkos::View<T,execution_space> type;
+  typedef Kokkos::View< T, execution_space > type;
+
   type data;
   T i0;
   T i1;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int) const {
-    Kokkos::atomic_fetch_rshift(&data(),(T)i1);
+  void operator()( int ) const {
+    Kokkos::atomic_fetch_rshift( &data(), (T) i1 );
   }
-  RShiftFunctor( T _i0 , T _i1 ) : i0(_i0) , i1(_i1) {}
+
+  RShiftFunctor( T _i0, T _i1 ) : i0( _i0 ), i1( _i1 ) {}
 };
 
-template<class T, class execution_space >
-T RShiftAtomic(T i0 , T i1) {
-  struct InitFunctor<T,execution_space> f_init(i0);
-  typename InitFunctor<T,execution_space>::type data("Data");
-  typename InitFunctor<T,execution_space>::h_type h_data("HData");
+template< class T, class execution_space >
+T RShiftAtomic( T i0, T i1 ) {
+  struct InitFunctor< T, execution_space > f_init( i0 );
+  typename InitFunctor< T, execution_space >::type data( "Data" );
+  typename InitFunctor< T, execution_space >::h_type h_data( "HData" );
+
   f_init.data = data;
-  Kokkos::parallel_for(1,f_init);
+  Kokkos::parallel_for( 1, f_init );
   execution_space::fence();
 
-  struct RShiftFunctor<T,execution_space> f(i0,i1);
+  struct RShiftFunctor< T, execution_space > f( i0, i1 );
+
   f.data = data;
-  Kokkos::parallel_for(1,f);
+  Kokkos::parallel_for( 1, f );
   execution_space::fence();
 
-  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy( h_data, data );
   T val = h_data();
+
   return val;
 }
 
-template<class T>
-T RShiftAtomicCheck(T i0 , T i1) {
+template< class T >
+T RShiftAtomicCheck( T i0, T i1 ) {
   T* data = new T[1];
   data[0] = 0;
 
-  *data = i0>>i1 ;
+  *data = i0 >> i1;
 
   T val = *data;
   delete [] data;
+
   return val;
 }
 
-template<class T,class DeviceType>
-bool RShiftAtomicTest(T i0, T i1)
+template< class T, class DeviceType >
+bool RShiftAtomicTest( T i0, T i1 )
 {
-  T res       = RShiftAtomic<T,DeviceType>(i0,i1);
-  T resSerial = RShiftAtomicCheck<T>(i0,i1);
+  T res       = RShiftAtomic< T, DeviceType >( i0, i1 );
+  T resSerial = RShiftAtomicCheck< T >( i0, i1 );
 
   bool passed = true;
 
@@ -934,52 +1008,52 @@ bool RShiftAtomicTest(T i0, T i1)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RShiftAtomicTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //--------------atomic_test_control------------------
 //---------------------------------------------------
 
-template<class T,class DeviceType>
-bool AtomicOperationsTestIntegralType( int i0 , int i1 , int test )
+template< class T, class DeviceType >
+bool AtomicOperationsTestIntegralType( int i0, int i1, int test )
 {
-  switch (test) {
-    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 5: return ModAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 6: return AndAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 7: return OrAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 8: return XorAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 9: return LShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 10: return RShiftAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 11: return IncAtomicTest<T,DeviceType>( (T)i0 );
-    case 12: return DecAtomicTest<T,DeviceType>( (T)i0 );
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 5: return ModAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 6: return AndAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 7: return OrAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 8: return XorAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 9: return LShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 10: return RShiftAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 11: return IncAtomicTest< T, DeviceType >( (T) i0 );
+    case 12: return DecAtomicTest< T, DeviceType >( (T) i0 );
   }
+
   return 0;
 }
 
-template<class T,class DeviceType>
-bool AtomicOperationsTestNonIntegralType( int i0 , int i1 , int test )
+template< class T, class DeviceType >
+bool AtomicOperationsTestNonIntegralType( int i0, int i1, int test )
 {
-  switch (test) {
-    case 1: return MaxAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 2: return MinAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 3: return MulAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
-    case 4: return DivAtomicTest<T,DeviceType>( (T)i0 , (T)i1 );
+  switch ( test ) {
+    case 1: return MaxAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 2: return MinAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 3: return MulAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
+    case 4: return DivAtomicTest< T, DeviceType >( (T) i0, (T) i1 );
   }
+
   return 0;
 }
 
-} // namespace
-
+} // namespace TestAtomicOperations
diff --git a/lib/kokkos/core/unit_test/TestAtomicViews.hpp b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
index 739492d32f806a80d1b64f10e3d0ba887f627acd..71080e5c8216aecd01985139c37bb68931139929 100644
--- a/lib/kokkos/core/unit_test/TestAtomicViews.hpp
+++ b/lib/kokkos/core/unit_test/TestAtomicViews.hpp
@@ -49,56 +49,52 @@ namespace TestAtomicViews {
 //-----------atomic view api tests-----------------
 //-------------------------------------------------
 
-template< class T , class ... P >
-size_t allocation_count( const Kokkos::View<T,P...> & view )
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
 {
   const size_t card  = view.size();
   const size_t alloc = view.span();
 
-  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
 
-  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
 }
 
-template< class DataType ,
-          class DeviceType ,
+template< class DataType,
+          class DeviceType,
           unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
-struct TestViewOperator_LeftAndRight ;
+struct TestViewOperator_LeftAndRight;
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+    { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
+    { update = 0; }
 
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > left_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic > > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > right_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic > > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space, Kokkos::MemoryTraits< Kokkos::Atomic >> stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -111,357 +107,338 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
-      // below checks that values match, but unable to check the references 
-      // - should this be able to be checked?
-      if ( left(i0)  != left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( right(i0) != right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( left(i0)  != left_stride(i0) ) { update |= 4 ; }
-      if ( right(i0) != right_stride(i0) ) { update |= 8 ; }
-      /*
-      if ( & left(i0)  != & left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0) != & right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
-      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
-      */
+      // Below checks that values match, but unable to check the references.
+      // Should this be able to be checked?
+      if ( left( i0 )  != left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( right( i0 ) != right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( left( i0 )  != left_stride( i0 ) ) { update |= 4; }
+      if ( right( i0 ) != right_stride( i0 ) ) { update |= 8; }
+/*
+      if ( &left( i0 )  != &left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( &right( i0 ) != &right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( &left( i0 )  != &left_stride( i0 ) ) { update |= 4; }
+      if ( &right( i0 ) != &right_stride( i0 ) ) { update |= 8; }
+*/
     }
   }
 };
 
-
 template< typename T, class DeviceType >
 class TestAtomicViewAPI
 {
 public:
-  typedef DeviceType        device ;
+  typedef DeviceType device;
 
-  enum { N0 = 1000 ,
-         N1 = 3 ,
-         N2 = 5 ,
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
          N3 = 7 };
 
-  typedef Kokkos::View< T , device > dView0 ;
-  typedef Kokkos::View< T* , device > dView1 ;
-  typedef Kokkos::View< T*[N1] , device > dView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
-  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
-  typedef typename dView0::host_mirror_space host ;
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
 
-  typedef Kokkos::View< T , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView0 ;
-  typedef Kokkos::View< T* , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView1 ;
-  typedef Kokkos::View< T*[N1] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > aView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device , Kokkos::MemoryTraits< Kokkos::Atomic > > const_aView4 ;
+  typedef Kokkos::View< T, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView0;
+  typedef Kokkos::View< T*, device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView1;
+  typedef Kokkos::View< T*[N1], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView2;
+  typedef Kokkos::View< T*[N1][N2], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > aView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device, Kokkos::MemoryTraits< Kokkos::Atomic > > const_aView4;
 
-  typedef Kokkos::View< T****, device, Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic > > aView4_unmanaged ;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic > > aView4_unmanaged;
 
-  typedef typename aView0::host_mirror_space host_atomic ;
+  typedef typename aView0::host_mirror_space host_atomic;
 
   TestAtomicViewAPI()
   {
-    TestViewOperator_LeftAndRight< int[2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2], device >::testit();
     run_test_rank0();
     run_test_rank4();
     run_test_const();
   }
 
-
   static void run_test_rank0()
   {
-    dView0 dx , dy ;
-    aView0 ax , ay , az ;
+    dView0 dx, dy;
+    aView0 ax, ay, az;
 
     dx = dView0( "dx" );
     dy = dView0( "dy" );
-    ASSERT_EQ( dx.use_count() , size_t(1) );
-    ASSERT_EQ( dy.use_count() , size_t(1) );
-
-    ax = dx ;
-    ay = dy ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
-    ASSERT_EQ( dy.use_count() , size_t(2) );
-    ASSERT_EQ( dx.use_count() , ax.use_count() );
-
-    az = ax ;
-    ASSERT_EQ( dx.use_count() , size_t(3) );
-    ASSERT_EQ( ax.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , ax.use_count() );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
+
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
+
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
   }
 
   static void run_test_rank4()
   {
-    dView4 dx , dy ;
-    aView4 ax , ay , az ;
+    dView4 dx, dy;
+    aView4 ax, ay, az;
 
-    dx = dView4( "dx" , N0 );
-    dy = dView4( "dy" , N0 );
-    ASSERT_EQ( dx.use_count() , size_t(1) );
-    ASSERT_EQ( dy.use_count() , size_t(1) );
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 1 ) );
 
-    ax = dx ;
-    ay = dy ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
-    ASSERT_EQ( dy.use_count() , size_t(2) );
-    ASSERT_EQ( dx.use_count() , ax.use_count() );
+    ax = dx;
+    ay = dy;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dy.use_count(), size_t( 2 ) );
+    ASSERT_EQ( dx.use_count(), ax.use_count() );
 
     dView4_unmanaged unmanaged_dx = dx;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
-    az = ax ;
-    ASSERT_EQ( dx.use_count() , size_t(3) );
-    ASSERT_EQ( ax.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , size_t(3) );
-    ASSERT_EQ( az.use_count() , ax.use_count() );
+    az = ax;
+    ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), size_t( 3 ) );
+    ASSERT_EQ( az.use_count(), ax.use_count() );
 
     aView4_unmanaged unmanaged_ax = ax;
-    ASSERT_EQ( ax.use_count() , size_t(3) );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
 
-    aView4_unmanaged unmanaged_ax_from_ptr_dx = aView4_unmanaged(dx.data(),
-                                                              dx.dimension_0(),
-                                                              dx.dimension_1(),
-                                                              dx.dimension_2(),
-                                                              dx.dimension_3());
-    ASSERT_EQ( ax.use_count() , size_t(3) );
+    aView4_unmanaged unmanaged_ax_from_ptr_dx =
+      aView4_unmanaged( dx.data(), dx.dimension_0(), dx.dimension_1(), dx.dimension_2(), dx.dimension_3() );
+    ASSERT_EQ( ax.use_count(), size_t( 3 ) );
 
-    const_aView4 const_ax = ax ;
-    ASSERT_EQ( ax.use_count() , size_t(4) );
-    ASSERT_EQ( const_ax.use_count() , ax.use_count() );
+    const_aView4 const_ax = ax;
+    ASSERT_EQ( ax.use_count(), size_t( 4 ) );
+    ASSERT_EQ( const_ax.use_count(), ax.use_count() );
 
     ASSERT_FALSE( ax.data() == 0 );
     ASSERT_FALSE( const_ax.data() == 0 ); // referenceable ptr
     ASSERT_FALSE( unmanaged_ax.data() == 0 );
     ASSERT_FALSE( unmanaged_ax_from_ptr_dx.data() == 0 );
     ASSERT_FALSE( ay.data() == 0 );
-//    ASSERT_NE( ax , ay );
+//    ASSERT_NE( ax, ay );
 //    Above test results in following runtime error from gtest:
 //    Expected: (ax) != (ay), actual: 32-byte object <30-01 D0-A0 D8-7F 00-00 00-31 44-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00> vs 32-byte object <80-01 D0-A0 D8-7F 00-00 00-A1 4A-0C 01-00 00-00 E8-03 00-00 00-00 00-00 69-00 00-00 00-00 00-00>
 
-    ASSERT_EQ( ax.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( ax.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( ax.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( ax.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( ax.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( ax.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( ax.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( ax.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( ay.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( ay.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( ay.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( ay.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( ay.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( ay.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( ay.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( ay.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( unmanaged_ax_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+    ASSERT_EQ( unmanaged_ax_from_ptr_dx.capacity(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
   }
 
-  typedef T DataType[2] ;
+  typedef T DataType[2];
 
   static void
   check_auto_conversion_to_const(
-     const Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > & arg_const ,
-     const Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > & arg )
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg_const,
+     const Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > & arg )
   {
     ASSERT_TRUE( arg_const == arg );
   }
 
   static void run_test_const()
   {
-    typedef Kokkos::View< DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > typeX ;
-    typedef Kokkos::View< const DataType , device , Kokkos::MemoryTraits< Kokkos::Atomic> > const_typeX ;
+    typedef Kokkos::View< DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryTraits<Kokkos::Atomic> > const_typeX;
 
     typeX x( "X" );
-    const_typeX xc = x ;
+    const_typeX xc = x;
 
     //ASSERT_TRUE( xc == x ); // const xc is referenceable, non-const x is not
     //ASSERT_TRUE( x == xc );
 
-    check_auto_conversion_to_const( x , xc );
+    check_auto_conversion_to_const( x, xc );
   }
-
 };
 
-
 //---------------------------------------------------
 //-----------initialization functors-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct InitFunctor_Seq {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
+  view_type input;
+  const long length;
 
-  InitFunctor_Seq( view_type & input_ , const long length_ ) 
-    : input(input_)
-    , length(length_)
+  InitFunctor_Seq( view_type & input_, const long length_ )
+    : input( input_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      input(i) = (T) i ;
+      input( i ) = (T) i;
     }
   }
-
 };
 
-
 template<class T, class execution_space >
 struct InitFunctor_ModTimes {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
-  const long remainder ;
+  view_type input;
+  const long length;
+  const long remainder;
 
-  InitFunctor_ModTimes( view_type & input_ , const long length_ , const long remainder_ ) 
-    : input(input_)
-    , length(length_)
-    , remainder(remainder_)
+  InitFunctor_ModTimes( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      if ( i % (remainder+1) == remainder ) {
-        input(i) = (T)2 ;
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = (T) 2;
       }
       else {
-        input(i) = (T)1 ;
+        input( i ) = (T) 1;
       }
     }
   }
 };
 
-
 template<class T, class execution_space >
 struct InitFunctor_ModShift {
+  typedef Kokkos::View< T*, execution_space > view_type;
 
-  typedef Kokkos::View< T* , execution_space > view_type ;
-
-  view_type input ; 
-  const long length ;
-  const long remainder ;
+  view_type input;
+  const long length;
+  const long remainder;
 
-  InitFunctor_ModShift( view_type & input_ , const long length_ , const long remainder_ ) 
-    : input(input_)
-    , length(length_)
-    , remainder(remainder_)
+  InitFunctor_ModShift( view_type & input_, const long length_, const long remainder_ )
+    : input( input_ )
+    , length( length_ )
+    , remainder( remainder_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const long i ) const {
     if ( i < length ) {
-      if ( i % (remainder+1) == remainder ) {
-        input(i) = 1 ;
+      if ( i % ( remainder + 1 ) == remainder ) {
+        input( i ) = 1;
       }
     }
   }
 };
 
-
 //---------------------------------------------------
 //-----------atomic view plus-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct PlusEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
   // Wrap the result view in an atomic view, use this for operator
-  PlusEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  PlusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) += input(i);
+        even_odd_result( 0 ) += input( i );
       }
       else {
-        even_odd_result(1) += input(i);
+        even_odd_result( 1 ) += input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T PlusEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T PlusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(0, length) , init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  PlusEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  PlusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0) + h_result_view(1) ) ;
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
 }
 
-template<class T>
+template< class T >
 T PlusEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
   T result[2];
+
   if ( N % 2 == 0 ) {
-    const long half_sum_end = (N/2) - 1;
+    const long half_sum_end = ( N / 2 ) - 1;
     const long full_sum_end = N - 1;
-    result[0] = half_sum_end*(half_sum_end + 1)/2 ; //even sum
-    result[1] = ( full_sum_end*(full_sum_end + 1)/2 ) - result[0] ; // odd sum
+    result[0] = half_sum_end * ( half_sum_end + 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end + 1 ) / 2 ) - result[0]; // Odd sum.
   }
   else {
-    const long half_sum_end = (T)(N/2) ;
+    const long half_sum_end = (T) ( N / 2 );
     const long full_sum_end = N - 2;
-    result[0] = half_sum_end*(half_sum_end - 1)/2 ; //even sum
-    result[1] = ( full_sum_end*(full_sum_end - 1)/2 ) - result[0] ; // odd sum
+    result[0] = half_sum_end * ( half_sum_end - 1 ) / 2; // Even sum.
+    result[1] = ( full_sum_end * ( full_sum_end - 1 ) / 2 ) - result[0]; // Odd sum.
   }
 
-  return (T)(result[0] + result[1]);
+  return (T) ( result[0] + result[1] );
 }
 
-template<class T,class DeviceType>
-bool PlusEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool PlusEqualAtomicViewTest( long input_length )
 {
-  T res       = PlusEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = PlusEqualAtomicViewCheck<T>(input_length);
+  T res       = PlusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = PlusEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -469,104 +446,98 @@ bool PlusEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = PlusEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view minus-equal-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct MinusEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  MinusEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  MinusEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) -= input(i);
+        even_odd_result( 0 ) -= input( i );
       }
       else {
-        even_odd_result(1) -= input(i);
+        even_odd_result( 1 ) -= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T MinusEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T MinusEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  MinusEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  MinusEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0) + h_result_view(1) ) ;
+  return (T) ( h_result_view( 0 ) + h_result_view( 1 ) );
 }
 
-template<class T>
+template< class T >
 T MinusEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
   T result[2];
+
   if ( N % 2 == 0 ) {
-    const long half_sum_end = (N/2) - 1;
+    const long half_sum_end = ( N / 2 ) - 1;
     const long full_sum_end = N - 1;
-    result[0] = -1*( half_sum_end*(half_sum_end + 1)/2 ) ; //even sum
-    result[1] = -1*( ( full_sum_end*(full_sum_end + 1)/2 ) + result[0] ) ; // odd sum
+    result[0] = -1 * ( half_sum_end * ( half_sum_end + 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end + 1 ) / 2 ) + result[0] ); // Odd sum.
   }
   else {
-    const long half_sum_end = (long)(N/2) ;
+    const long half_sum_end = (long) ( N / 2 );
     const long full_sum_end = N - 2;
-    result[0] = -1*( half_sum_end*(half_sum_end - 1)/2 ) ; //even sum
-    result[1] = -1*( ( full_sum_end*(full_sum_end - 1)/2 ) + result[0] ) ; // odd sum
+    result[0] = -1 * ( half_sum_end * ( half_sum_end - 1 ) / 2 ); // Even sum.
+    result[1] = -1 * ( ( full_sum_end * ( full_sum_end - 1 ) / 2 ) + result[0] ); // Odd sum.
   }
 
-  return (result[0] + result[1]);
+  return ( result[0] + result[1] );
 }
 
-template<class T,class DeviceType>
-bool MinusEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool MinusEqualAtomicViewTest( long input_length )
 {
-  T res       = MinusEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = MinusEqualAtomicViewCheck<T>(input_length);
+  T res       = MinusEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = MinusEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -574,83 +545,76 @@ bool MinusEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = MinusEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view times-equal-----------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct TimesEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
   // Wrap the result view in an atomic view, use this for operator
-  TimesEqualAtomicViewFunctor( const view_type & input_ , view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  TimesEqualAtomicViewFunctor( const view_type & input_, view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result(0) *= (double)input(i);
+      result( 0 ) *= (double) input( i );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T TimesEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T TimesEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",1) ;
-  deep_copy(result_view, 1.0);
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 1 );
+  deep_copy( result_view, 1.0 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  TimesEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  TimesEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T TimesEqualAtomicViewCheck( const long input_length, const long remainder ) {
-
-  //Analytical result
+  // Analytical result.
   const long N = input_length;
   T result = 1.0;
 
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) { 
+    if ( i % ( remainder + 1 ) == remainder ) {
       result *= 2.0;
     }
     else {
@@ -658,15 +622,15 @@ T TimesEqualAtomicViewCheck( const long input_length, const long remainder ) {
     }
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool TimesEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType>
+bool TimesEqualAtomicViewTest( const long input_length )
 {
   const long remainder = 23;
-  T res       = TimesEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = TimesEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = TimesEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = TimesEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -674,101 +638,93 @@ bool TimesEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = TimesEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view div-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct DivEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  DivEqualAtomicViewFunctor( const view_type & input_ , scalar_view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  DivEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result() /= (double)(input(i));
+      result() /= (double) ( input( i ) );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T DivEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
-  typedef typename scalar_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T DivEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  scalar_view_type result_view("result_view") ;
-  Kokkos::deep_copy(result_view, 12121212121);
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  DivEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  DivEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view()) ;
+  return (T) ( h_result_view() );
 }
 
-template<class T>
-T DivEqualAtomicViewCheck( const long input_length , const long remainder ) {
-
+template< class T >
+T DivEqualAtomicViewCheck( const long input_length, const long remainder ) {
   const long N = input_length;
   T result = 12121212121.0;
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
       result /= 1.0;
     }
     else {
       result /= 2.0;
     }
-
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool DivEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool DivEqualAtomicViewTest( const long input_length )
 {
   const long remainder = 23;
 
-  T res       = DivEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = DivEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = DivEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = DivEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -776,83 +732,76 @@ bool DivEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = DivEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view mod-equal------------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct ModEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
+  typedef Kokkos::View< T, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  ModEqualAtomicViewFunctor( const view_type & input_ , scalar_view_type & result_ , const long length_) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
+  // Wrap the result view in an atomic view, use this for operator.
+  ModEqualAtomicViewFunctor( const view_type & input_, scalar_view_type & result_, const long length_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length && i > 0 ) {
-      result() %= (double)(input(i));
+      result() %= (double) ( input( i ) );
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T ModEqualAtomicView(const long input_length, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T , execution_space > scalar_view_type ;
-  typedef typename scalar_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T ModEqualAtomicView( const long input_length, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T, execution_space > scalar_view_type;
+  typedef typename scalar_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  scalar_view_type result_view("result_view") ;
-  Kokkos::deep_copy(result_view, 12121212121);
+  view_type input( "input_view", length );
+  scalar_view_type result_view( "result_view" );
+  Kokkos::deep_copy( result_view, 12121212121 );
 
-  InitFunctor_ModTimes<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModTimes< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  ModEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  ModEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view()) ;
+  return (T) ( h_result_view() );
 }
 
-template<class T>
-T ModEqualAtomicViewCheck( const long input_length , const long remainder ) {
-
+template< class T >
+T ModEqualAtomicViewCheck( const long input_length, const long remainder ) {
   const long N = input_length;
   T result = 12121212121;
   for ( long i = 2; i < N; ++i ) {
-    if ( i % (remainder+1) == remainder ) {
+    if ( i % ( remainder + 1 ) == remainder ) {
       result %= 1;
     }
     else {
@@ -860,19 +809,18 @@ T ModEqualAtomicViewCheck( const long input_length , const long remainder ) {
     }
   }
 
-  return (T)result;
+  return (T) result;
 }
 
-template<class T, class DeviceType>
-bool ModEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool ModEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "ModEqualAtomicView Error: Type must be integral type for this unit test");
+  static_assert( std::is_integral< T >::value, "ModEqualAtomicView Error: Type must be integral type for this unit test" );
 
   const long remainder = 23;
 
-  T res       = ModEqualAtomicView<T,DeviceType>(input_length, remainder);
-  T resSerial = ModEqualAtomicViewCheck<T>(input_length, remainder);
+  T res       = ModEqualAtomicView< T, DeviceType >( input_length, remainder );
+  T resSerial = ModEqualAtomicViewCheck< T >( input_length, remainder );
 
   bool passed = true;
 
@@ -880,142 +828,134 @@ bool ModEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = ModEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view rs-equal------------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct RSEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T**** , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
 
   const view_type input;
   atomic_view_type result;
   const long length;
   const long value;
 
-  // Wrap the result view in an atomic view, use this for operator
-  RSEqualAtomicViewFunctor( const view_type & input_ , result_view_type & result_ , const long & length_ , const long & value_ ) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
-    , value(value_)
+  // Wrap the result view in an atomic view; operator() updates through it.
+  RSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 4 == 0 ) {
-        result(1,0,0,0) >>= input(i);
+        result( 1, 0, 0, 0 ) >>= input( i );
       }
       else if ( i % 4 == 1 ) {
-        result(0,1,0,0) >>= input(i);
+        result( 0, 1, 0, 0 ) >>= input( i );
       }
       else if ( i % 4 == 2 ) {
-        result(0,0,1,0) >>= input(i);
+        result( 0, 0, 1, 0 ) >>= input( i );
       }
       else if ( i % 4 == 3 ) {
-        result(0,0,0,1) >>= input(i);
+        result( 0, 0, 0, 1 ) >>= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T RSEqualAtomicView(const long input_length, const long value, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
-  typedef typename result_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T RSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  result_view_type result_view("result_view",2,2,2,2) ;
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-    h_result_view(1,0,0,0) = value;
-    h_result_view(0,1,0,0) = value;
-    h_result_view(0,0,1,0) = value;
-    h_result_view(0,0,0,1) = value;
-  Kokkos::deep_copy( result_view , h_result_view );
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  h_result_view( 1, 0, 0, 0 ) = value;
+  h_result_view( 0, 1, 0, 0 ) = value;
+  h_result_view( 0, 0, 1, 0 ) = value;
+  h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
 
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  InitFunctor_ModShift<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
-
-  RSEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length, value);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  RSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  Kokkos::deep_copy(h_result_view, result_view);
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(1,0,0,0)) ; 
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
 }
 
-template<class T>
+template< class T >
 T RSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
-
-  T result[4] ;
-  result[0] = value ;
-  result[1] = value ;
-  result[2] = value ;
-  result[3] = value ;
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
 
   T * input = new T[input_length];
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % (remainder+1) == remainder ) {
-        input[i] = 1;
-      }
-      else {
-        input[i] = 0;
-      }
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
   }
 
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % 4 == 0 ) {
-        result[0] >>= input[i];
-      }
-      else if ( i % 4 == 1 ) {
-        result[1] >>= input[i];
-      }
-      else if ( i % 4 == 2 ) {
-        result[2] >>= input[i];
-      }
-      else if ( i % 4 == 3 ) {
-        result[3] >>= input[i];
-      }
+    if ( i % 4 == 0 ) {
+      result[0] >>= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] >>= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] >>= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] >>= input[i];
+    }
   }
+
   delete [] input;
 
-  return (T)result[0]; 
+  return (T) result[0];
 }
 
-template<class T, class DeviceType>
-bool RSEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool RSEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "RSEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "RSEqualAtomicViewTest: Must be integral type for test" );
 
   const long remainder = 61042; //prime - 1
-  const long value =  1073741825; //  2^30+1
-  T res       = RSEqualAtomicView<T,DeviceType>(input_length, value, remainder);
-  T resSerial = RSEqualAtomicViewCheck<T>(input_length, value, remainder);
+  const long value = 1073741825; //  2^30+1
+  T res       = RSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = RSEqualAtomicViewCheck< T >( input_length, value, remainder );
 
   bool passed = true;
 
@@ -1023,142 +963,134 @@ bool RSEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RSEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //------------atomic view ls-equal------------------
 //---------------------------------------------------
 
 template<class T, class execution_space >
 struct LSEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T**** , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
+  typedef Kokkos::View< T****, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
 
   view_type input;
   atomic_view_type result;
   const long length;
   const long value;
 
-  // Wrap the result view in an atomic view, use this for operator
-  LSEqualAtomicViewFunctor( const view_type & input_ , result_view_type & result_ , const long & length_ , const long & value_ ) 
-    : input(input_)
-    , result(result_)
-    , length(length_)
-    , value(value_)
+  // Wrap the result view in an atomic view; operator() updates through it.
+  LSEqualAtomicViewFunctor( const view_type & input_, result_view_type & result_, const long & length_, const long & value_ )
+    : input( input_ )
+    , result( result_ )
+    , length( length_ )
+    , value( value_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 4 == 0 ) {
-        result(1,0,0,0) <<= input(i);
+        result( 1, 0, 0, 0 ) <<= input( i );
       }
       else if ( i % 4 == 1 ) {
-        result(0,1,0,0) <<= input(i);
+        result( 0, 1, 0, 0 ) <<= input( i );
       }
       else if ( i % 4 == 2 ) {
-        result(0,0,1,0) <<= input(i);
+        result( 0, 0, 1, 0 ) <<= input( i );
       }
       else if ( i % 4 == 3 ) {
-        result(0,0,0,1) <<= input(i);
+        result( 0, 0, 0, 1 ) <<= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T LSEqualAtomicView(const long input_length, const long value, const long remainder) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef Kokkos::View< T**** , execution_space > result_view_type ;
-  typedef typename result_view_type::HostMirror host_scalar_view_type ;
+template< class T, class execution_space >
+T LSEqualAtomicView( const long input_length, const long value, const long remainder ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef Kokkos::View< T****, execution_space > result_view_type;
+  typedef typename result_view_type::HostMirror host_scalar_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  result_view_type result_view("result_view",2,2,2,2) ;
-  host_scalar_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-    h_result_view(1,0,0,0) = value;
-    h_result_view(0,1,0,0) = value;
-    h_result_view(0,0,1,0) = value;
-    h_result_view(0,0,0,1) = value;
-  Kokkos::deep_copy( result_view , h_result_view );
+  view_type input( "input_view", length );
+  result_view_type result_view( "result_view", 2, 2, 2, 2 );
+  host_scalar_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+    h_result_view( 1, 0, 0, 0 ) = value;
+    h_result_view( 0, 1, 0, 0 ) = value;
+    h_result_view( 0, 0, 1, 0 ) = value;
+    h_result_view( 0, 0, 0, 1 ) = value;
+  Kokkos::deep_copy( result_view, h_result_view );
 
-  InitFunctor_ModShift<T, execution_space> init_f( input , length , remainder ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_ModShift< T, execution_space > init_f( input, length, remainder );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  LSEqualAtomicViewFunctor<T,execution_space> functor(input, result_view, length, value);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  LSEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length, value );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  Kokkos::deep_copy(h_result_view, result_view);
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(1,0,0,0)) ; 
+  return (T) ( h_result_view( 1, 0, 0, 0 ) );
 }
 
-template<class T>
+template< class T >
 T LSEqualAtomicViewCheck( const long input_length, const long value, const long remainder ) {
-
-  T result[4] ;
-  result[0] = value ;
-  result[1] = value ;
-  result[2] = value ;
-  result[3] = value ;
+  T result[4];
+  result[0] = value;
+  result[1] = value;
+  result[2] = value;
+  result[3] = value;
 
   T * input = new T[input_length];
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % (remainder+1) == remainder ) {
-        input[i] = 1;
-      }
-      else {
-        input[i] = 0;
-      }
+    if ( i % ( remainder + 1 ) == remainder ) {
+      input[i] = 1;
+    }
+    else {
+      input[i] = 0;
+    }
   }
 
   for ( long i = 0; i < input_length; ++i ) {
-      if ( i % 4 == 0 ) {
-        result[0] <<= input[i];
-      }
-      else if ( i % 4 == 1 ) {
-        result[1] <<= input[i];
-      }
-      else if ( i % 4 == 2 ) {
-        result[2] <<= input[i];
-      }
-      else if ( i % 4 == 3 ) {
-        result[3] <<= input[i];
-      }
+    if ( i % 4 == 0 ) {
+      result[0] <<= input[i];
+    }
+    else if ( i % 4 == 1 ) {
+      result[1] <<= input[i];
+    }
+    else if ( i % 4 == 2 ) {
+      result[2] <<= input[i];
+    }
+    else if ( i % 4 == 3 ) {
+      result[3] <<= input[i];
+    }
   }
 
   delete [] input;
 
-  return (T)result[0]; 
+  return (T) result[0];
 }
 
-template<class T, class DeviceType>
-bool LSEqualAtomicViewTest(const long input_length)
+template< class T, class DeviceType >
+bool LSEqualAtomicViewTest( const long input_length )
 {
-
-  static_assert( std::is_integral<T>::value, "LSEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "LSEqualAtomicViewTest: Must be integral type for test" );
 
   const long remainder = 61042; //prime - 1
-  const long value =  1; //  2^30+1
-  T res       = LSEqualAtomicView<T,DeviceType>(input_length, value, remainder);
-  T resSerial = LSEqualAtomicViewCheck<T>(input_length, value, remainder);
+  const long value = 1; // Initial value to be left-shifted.
+  T res       = LSEqualAtomicView< T, DeviceType >( input_length, value, remainder );
+  T resSerial = LSEqualAtomicViewCheck< T >( input_length, value, remainder );
 
   bool passed = true;
 
@@ -1166,104 +1098,96 @@ bool LSEqualAtomicViewTest(const long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = RSEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view and-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct AndEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  AndEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view; operator() updates through it.
+  AndEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) &= input(i);
+        even_odd_result( 0 ) &= input( i );
       }
       else {
-        even_odd_result(1) &= input(i);
+        even_odd_result( 1 ) &= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T AndEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T AndEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
-  Kokkos::deep_copy(result_view, 1);
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
+  Kokkos::deep_copy( result_view, 1 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  AndEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  AndEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T AndEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
-  T result[2] = {1};
+  T result[2] = { 1 };
   for ( long i = 0; i < N; ++i ) {
     if ( N % 2 == 0 ) {
-      result[0] &= (T)i;
+      result[0] &= (T) i;
     }
     else {
-      result[1] &= (T)i;
+      result[1] &= (T) i;
     }
   }
 
-  return (result[0]);
+  return ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool AndEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool AndEqualAtomicViewTest( long input_length )
 {
+  static_assert( std::is_integral< T >::value, "AndEqualAtomicViewTest: Must be integral type for test" );
 
-  static_assert( std::is_integral<T>::value, "AndEqualAtomicViewTest: Must be integral type for test");
-
-  T res       = AndEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = AndEqualAtomicViewCheck<T>(input_length);
+  T res       = AndEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = AndEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1271,103 +1195,96 @@ bool AndEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = AndEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view or-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct OrEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  OrEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view; operator() updates through it.
+  OrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) |= input(i);
+        even_odd_result( 0 ) |= input( i );
       }
       else {
-        even_odd_result(1) |= input(i);
+        even_odd_result( 1 ) |= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T OrEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T OrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  OrEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  OrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T OrEqualAtomicViewCheck( const long input_length ) {
 
   const long N = input_length;
-  T result[2] = {0};
+  T result[2] = { 0 };
   for ( long i = 0; i < N; ++i ) {
     if ( i % 2 == 0 ) {
-      result[0] |= (T)i;
+      result[0] |= (T) i;
     }
     else {
-      result[1] |= (T)i;
+      result[1] |= (T) i;
     }
   }
 
-  return (T)(result[0]);
+  return (T) ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool OrEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool OrEqualAtomicViewTest( long input_length )
 {
-  
-  static_assert( std::is_integral<T>::value, "OrEqualAtomicViewTest: Must be integral type for test");
+  static_assert( std::is_integral< T >::value, "OrEqualAtomicViewTest: Must be integral type for test" );
 
-  T res       = OrEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = OrEqualAtomicViewCheck<T>(input_length);
+  T res       = OrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = OrEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1375,103 +1292,95 @@ bool OrEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = OrEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 //---------------------------------------------------
 //-----------atomic view xor-equal-----------------
 //---------------------------------------------------
 
-template<class T, class execution_space >
+template< class T, class execution_space >
 struct XOrEqualAtomicViewFunctor {
-
-  typedef Kokkos::View< T* , execution_space ,  Kokkos::MemoryTraits< Kokkos::Atomic > >  atomic_view_type ;
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
+  typedef Kokkos::View< T*, execution_space, Kokkos::MemoryTraits<Kokkos::Atomic> > atomic_view_type;
+  typedef Kokkos::View< T*, execution_space > view_type;
 
   view_type input;
   atomic_view_type even_odd_result;
   const long length;
 
-  // Wrap the result view in an atomic view, use this for operator
-  XOrEqualAtomicViewFunctor( const view_type & input_ , view_type & even_odd_result_ , const long length_) 
-    : input(input_)
-    , even_odd_result(even_odd_result_)
-    , length(length_)
+  // Wrap the result view in an atomic view; operator() updates through it.
+  XOrEqualAtomicViewFunctor( const view_type & input_, view_type & even_odd_result_, const long length_ )
+    : input( input_ )
+    , even_odd_result( even_odd_result_ )
+    , length( length_ )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const long i) const {
+  void operator()( const long i ) const {
     if ( i < length ) {
       if ( i % 2 == 0 ) {
-        even_odd_result(0) ^= input(i);
+        even_odd_result( 0 ) ^= input( i );
       }
       else {
-        even_odd_result(1) ^= input(i);
+        even_odd_result( 1 ) ^= input( i );
       }
     }
   }
-
 };
 
-
-template<class T, class execution_space >
-T XOrEqualAtomicView(const long input_length) {
-
-  typedef Kokkos::View< T* , execution_space > view_type ;
-  typedef typename view_type::HostMirror host_view_type ;
+template< class T, class execution_space >
+T XOrEqualAtomicView( const long input_length ) {
+  typedef Kokkos::View< T*, execution_space > view_type;
+  typedef typename view_type::HostMirror host_view_type;
 
   const long length = input_length;
 
-  view_type input("input_view",length) ;
-  view_type result_view("result_view",2) ;
+  view_type input( "input_view", length );
+  view_type result_view( "result_view", 2 );
 
-  InitFunctor_Seq<T, execution_space> init_f( input , length ) ;
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), init_f );
+  InitFunctor_Seq< T, execution_space > init_f( input, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), init_f );
 
-  XOrEqualAtomicViewFunctor<T,execution_space> functor(input, result_view,length);
-  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, length), functor);
+  XOrEqualAtomicViewFunctor< T, execution_space > functor( input, result_view, length );
+  Kokkos::parallel_for( Kokkos::RangePolicy< execution_space >( 0, length ), functor );
   Kokkos::fence();
 
-  host_view_type h_result_view = Kokkos::create_mirror_view(result_view);
-  Kokkos::deep_copy(h_result_view, result_view);
+  host_view_type h_result_view = Kokkos::create_mirror_view( result_view );
+  Kokkos::deep_copy( h_result_view, result_view );
 
-  return (T) (h_result_view(0)) ;
+  return (T) ( h_result_view( 0 ) );
 }
 
-template<class T>
+template< class T >
 T XOrEqualAtomicViewCheck( const long input_length ) {
-
   const long N = input_length;
-  T result[2] = {0};
+  T result[2] = { 0 };
   for ( long i = 0; i < N; ++i ) {
     if ( i % 2 == 0 ) {
-      result[0] ^= (T)i;
+      result[0] ^= (T) i;
     }
     else {
-      result[1] ^= (T)i;
+      result[1] ^= (T) i;
     }
   }
 
-  return (T)(result[0]);
+  return (T) ( result[0] );
 }
 
-template<class T,class DeviceType>
-bool XOrEqualAtomicViewTest(long input_length)
+template< class T, class DeviceType >
+bool XOrEqualAtomicViewTest( long input_length )
 {
+  static_assert( std::is_integral< T >::value, "XOrEqualAtomicViewTest: Must be integral type for test" );
 
-  static_assert( std::is_integral<T>::value, "XOrEqualAtomicViewTest: Must be integral type for test");
-
-  T res       = XOrEqualAtomicView<T,DeviceType>(input_length);
-  T resSerial = XOrEqualAtomicViewCheck<T>(input_length);
+  T res       = XOrEqualAtomicView< T, DeviceType >( input_length );
+  T resSerial = XOrEqualAtomicViewCheck< T >( input_length );
 
   bool passed = true;
 
@@ -1479,54 +1388,52 @@ bool XOrEqualAtomicViewTest(long input_length)
     passed = false;
 
     std::cout << "Loop<"
-              << typeid(T).name()
+              << typeid( T ).name()
               << ">( test = XOrEqualAtomicViewTest"
               << " FAILED : "
               << resSerial << " != " << res
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 }
 
-
 // inc/dec?
 
-
 //---------------------------------------------------
 //--------------atomic_test_control------------------
 //---------------------------------------------------
 
-template<class T,class DeviceType>
-bool AtomicViewsTestIntegralType( const int length , int test )
+template< class T, class DeviceType >
+bool AtomicViewsTestIntegralType( const int length, int test )
 {
-  static_assert( std::is_integral<T>::value, "TestAtomicViews Error: Non-integral type passed into IntegralType tests");
-
-  switch (test) {
-    case 1: return PlusEqualAtomicViewTest<T,DeviceType>( length );
-    case 2: return MinusEqualAtomicViewTest<T,DeviceType>( length );
-    case 3: return RSEqualAtomicViewTest<T,DeviceType>( length );
-    case 4: return LSEqualAtomicViewTest<T,DeviceType>( length );
-    case 5: return ModEqualAtomicViewTest<T,DeviceType>( length );
-    case 6: return AndEqualAtomicViewTest<T,DeviceType>( length );
-    case 7: return OrEqualAtomicViewTest<T,DeviceType>( length );
-    case 8: return XOrEqualAtomicViewTest<T,DeviceType>( length );
+  static_assert( std::is_integral< T >::value, "TestAtomicViews Error: Non-integral type passed into IntegralType tests" );
+
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return RSEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return LSEqualAtomicViewTest< T, DeviceType >( length );
+    case 5: return ModEqualAtomicViewTest< T, DeviceType >( length );
+    case 6: return AndEqualAtomicViewTest< T, DeviceType >( length );
+    case 7: return OrEqualAtomicViewTest< T, DeviceType >( length );
+    case 8: return XOrEqualAtomicViewTest< T, DeviceType >( length );
   }
+
   return 0;
 }
 
-
-template<class T,class DeviceType>
-bool AtomicViewsTestNonIntegralType( const int length , int test )
+template< class T, class DeviceType >
+bool AtomicViewsTestNonIntegralType( const int length, int test )
 {
-  switch (test) {
-    case 1: return PlusEqualAtomicViewTest<T,DeviceType>( length );
-    case 2: return MinusEqualAtomicViewTest<T,DeviceType>( length );
-    case 3: return TimesEqualAtomicViewTest<T,DeviceType>( length );
-    case 4: return DivEqualAtomicViewTest<T,DeviceType>( length );
+  switch ( test ) {
+    case 1: return PlusEqualAtomicViewTest< T, DeviceType >( length );
+    case 2: return MinusEqualAtomicViewTest< T, DeviceType >( length );
+    case 3: return TimesEqualAtomicViewTest< T, DeviceType >( length );
+    case 4: return DivEqualAtomicViewTest< T, DeviceType >( length );
   }
+
   return 0;
 }
 
-} // namespace
-
+} // namespace TestAtomicViews
diff --git a/lib/kokkos/core/unit_test/TestCXX11.hpp b/lib/kokkos/core/unit_test/TestCXX11.hpp
index d6dde5e963e1f0706fecd333b56dd9e8ed181d0e..e2ad623d9c89cef44c4e55a9096d3dba6796adf6 100644
--- a/lib/kokkos/core/unit_test/TestCXX11.hpp
+++ b/lib/kokkos/core/unit_test/TestCXX11.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,283 +36,294 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
 #include <Kokkos_Core.hpp>
 
 namespace TestCXX11 {
 
-template<class DeviceType>
-struct FunctorAddTest{
-  typedef Kokkos::View<double**,DeviceType> view_type;
-  view_type a_, b_;
+template< class DeviceType >
+struct FunctorAddTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
   typedef DeviceType execution_space;
-  FunctorAddTest(view_type & a, view_type &b):a_(a),b_(b) {}
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_, b_;
+
+  FunctorAddTest( view_type & a, view_type & b ) : a_( a ), b_( b ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-    b_(i,0) = a_(i,1) + a_(i,2);
-    b_(i,1) = a_(i,0) - a_(i,3);
-    b_(i,2) = a_(i,4) + a_(i,0);
-    b_(i,3) = a_(i,2) - a_(i,1);
-    b_(i,4) = a_(i,3) + a_(i,4);
+  void operator() ( const int& i ) const {
+    b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+    b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+    b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+    b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+    b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
   }
 
-  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_member & dev) const {
-    const int begin = dev.league_rank() * 4 ;
-    const int end   = begin + 4 ;
-    for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-      b_(i,0) = a_(i,1) + a_(i,2);
-      b_(i,1) = a_(i,0) - a_(i,3);
-      b_(i,2) = a_(i,4) + a_(i,0);
-      b_(i,3) = a_(i,2) - a_(i,1);
-      b_(i,4) = a_(i,3) + a_(i,4);
+  void operator() ( const team_member & dev ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      b_( i, 0 ) = a_( i, 1 ) + a_( i, 2 );
+      b_( i, 1 ) = a_( i, 0 ) - a_( i, 3 );
+      b_( i, 2 ) = a_( i, 4 ) + a_( i, 0 );
+      b_( i, 3 ) = a_( i, 2 ) - a_( i, 1 );
+      b_( i, 4 ) = a_( i, 3 ) + a_( i, 4 );
     }
   }
 };
 
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double AddTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-
-  Kokkos::View<double**,DeviceType> a("A",100,5);
-  Kokkos::View<double**,DeviceType> b("B",100,5);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for  ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
-  if(PWRTest==false)
-    Kokkos::parallel_for(100,FunctorAddTest<DeviceType>(a,b));
-  else
-    Kokkos::parallel_for(policy_type(25,Kokkos::AUTO),FunctorAddTest<DeviceType>(a,b));
-  Kokkos::deep_copy(h_b,b);
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, FunctorAddTest< DeviceType >( a, b ) );
+  }
+  else {
+    Kokkos::parallel_for( policy_type( 25, Kokkos::AUTO ), FunctorAddTest< DeviceType >( a, b ) );
+  }
+  Kokkos::deep_copy( h_b, b );
 
   double result = 0;
-  for(int i=0;i<100;i++) {
-      for(int j=0;j<5;j++)
-         result += h_b(i,j);
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
     }
+  }
 
   return result;
 }
 
-
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-template<class DeviceType, bool PWRTest>
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
 double AddTestLambda() {
-
-  Kokkos::View<double**,DeviceType> a("A",100,5);
-  Kokkos::View<double**,DeviceType> b("B",100,5);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
-  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
-
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  Kokkos::View< double**, DeviceType > a( "A", 100, 5 );
+  Kokkos::View< double**, DeviceType > b( "B", 100, 5 );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_a = Kokkos::create_mirror_view( a );
+  typename Kokkos::View< double**, DeviceType >::HostMirror h_b = Kokkos::create_mirror_view( b );
+
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
-
-  if(PWRTest==false) {
-    Kokkos::parallel_for(100,KOKKOS_LAMBDA(const int& i)  {
-      b(i,0) = a(i,1) + a(i,2);
-      b(i,1) = a(i,0) - a(i,3);
-      b(i,2) = a(i,4) + a(i,0);
-      b(i,3) = a(i,2) - a(i,1);
-      b(i,4) = a(i,3) + a(i,4);
+  Kokkos::deep_copy( a, h_a );
+
+  if ( PWRTest == false ) {
+    Kokkos::parallel_for( 100, KOKKOS_LAMBDA( const int & i ) {
+      b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+      b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+      b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+      b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+      b( i, 4 ) = a( i, 3 ) + a( i, 4 );
     });
-  } else {
-    typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-    typedef typename policy_type::member_type team_member ;
-
-    policy_type policy(25,Kokkos::AUTO);
-
-    Kokkos::parallel_for(policy,KOKKOS_LAMBDA(const team_member & dev)  {
-      const int begin = dev.league_rank() * 4 ;
-      const int end   = begin + 4 ;
-      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-        b(i,0) = a(i,1) + a(i,2);
-        b(i,1) = a(i,0) - a(i,3);
-        b(i,2) = a(i,4) + a(i,0);
-        b(i,3) = a(i,2) - a(i,1);
-        b(i,4) = a(i,3) + a(i,4);
+  }
+  else {
+    typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+    typedef typename policy_type::member_type team_member;
+
+    policy_type policy( 25, Kokkos::AUTO );
+
+    Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const team_member & dev ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        b( i, 0 ) = a( i, 1 ) + a( i, 2 );
+        b( i, 1 ) = a( i, 0 ) - a( i, 3 );
+        b( i, 2 ) = a( i, 4 ) + a( i, 0 );
+        b( i, 3 ) = a( i, 2 ) - a( i, 1 );
+        b( i, 4 ) = a( i, 3 ) + a( i, 4 );
       }
     });
   }
-  Kokkos::deep_copy(h_b,b);
+  Kokkos::deep_copy( h_b, b );
 
   double result = 0;
-  for(int i=0;i<100;i++) {
-      for(int j=0;j<5;j++)
-         result += h_b(i,j);
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+      result += h_b( i, j );
     }
+  }
 
   return result;
 }
-
 #else
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double AddTestLambda() {
-  return AddTestFunctor<DeviceType,PWRTest>();
+  return AddTestFunctor< DeviceType, PWRTest >();
 }
 #endif
 
-
-template<class DeviceType>
-struct FunctorReduceTest{
-  typedef Kokkos::View<double**,DeviceType> view_type;
-  view_type a_;
+template< class DeviceType >
+struct FunctorReduceTest {
+  typedef Kokkos::View< double**, DeviceType > view_type;
   typedef DeviceType execution_space;
   typedef double value_type;
-  FunctorReduceTest(view_type & a):a_(a) {}
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type team_member;
+
+  view_type a_;
+
+  FunctorReduceTest( view_type & a ) : a_( a ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, value_type& sum) const {
-    sum += a_(i,1) + a_(i,2);
-    sum += a_(i,0) - a_(i,3);
-    sum += a_(i,4) + a_(i,0);
-    sum += a_(i,2) - a_(i,1);
-    sum += a_(i,3) + a_(i,4);
+  void operator() ( const int & i, value_type & sum ) const {
+    sum += a_( i, 1 ) + a_( i, 2 );
+    sum += a_( i, 0 ) - a_( i, 3 );
+    sum += a_( i, 4 ) + a_( i, 0 );
+    sum += a_( i, 2 ) - a_( i, 1 );
+    sum += a_( i, 3 ) + a_( i, 4 );
   }
 
-  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
-
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_member & dev, value_type& sum) const {
-    const int begin = dev.league_rank() * 4 ;
-    const int end   = begin + 4 ;
-    for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-      sum += a_(i,1) + a_(i,2);
-      sum += a_(i,0) - a_(i,3);
-      sum += a_(i,4) + a_(i,0);
-      sum += a_(i,2) - a_(i,1);
-      sum += a_(i,3) + a_(i,4);
+  void operator() ( const team_member & dev, value_type & sum ) const {
+    const int begin = dev.league_rank() * 4;
+    const int end   = begin + 4;
+    for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+      sum += a_( i, 1 ) + a_( i, 2 );
+      sum += a_( i, 0 ) - a_( i, 3 );
+      sum += a_( i, 4 ) + a_( i, 0 );
+      sum += a_( i, 2 ) - a_( i, 1 );
+      sum += a_( i, 3 ) + a_( i, 4 );
     }
   }
+
   KOKKOS_INLINE_FUNCTION
-  void init(value_type& update) const {update = 0.0;}
+  void init( value_type & update ) const { update = 0.0; }
+
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& update, volatile value_type const& input) const {update += input;}
+  void join( volatile value_type & update, volatile value_type const & input ) const { update += input; }
 };
 
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double ReduceTestFunctor() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-  typedef Kokkos::View<double**,DeviceType> view_type ;
-  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
-
-  view_type a("A",100,5);
-  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
   double result = 0.0;
-  if(PWRTest==false)
-    Kokkos::parallel_reduce(100,FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
-  else
-    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
+  else {
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), FunctorReduceTest< DeviceType >( a ), unmanaged_result( & result ) );
+  }
 
   return result;
 }
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-template<class DeviceType, bool PWRTest>
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+template< class DeviceType, bool PWRTest >
 double ReduceTestLambda() {
+  typedef Kokkos::TeamPolicy< DeviceType > policy_type;
+  typedef Kokkos::View< double**, DeviceType > view_type;
+  typedef Kokkos::View< double, typename view_type::host_mirror_space, Kokkos::MemoryUnmanaged > unmanaged_result;
 
-  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
-  typedef Kokkos::View<double**,DeviceType> view_type ;
-  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
-
-  view_type a("A",100,5);
-  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+  view_type a( "A", 100, 5 );
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view( a );
 
-  for(int i=0;i<100;i++) {
-    for(int j=0;j<5;j++)
-       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  for ( int i = 0; i < 100; i++ ) {
+    for ( int j = 0; j < 5; j++ ) {
+       h_a( i, j ) = 0.1 * i / ( 1.1 * j + 1.0 ) + 0.5 * j;
+    }
   }
-  Kokkos::deep_copy(a,h_a);
+  Kokkos::deep_copy( a, h_a );
 
   double result = 0.0;
 
-  if(PWRTest==false) {
-    Kokkos::parallel_reduce(100,KOKKOS_LAMBDA(const int& i, double& sum)  {
-      sum += a(i,1) + a(i,2);
-      sum += a(i,0) - a(i,3);
-      sum += a(i,4) + a(i,0);
-      sum += a(i,2) - a(i,1);
-      sum += a(i,3) + a(i,4);
+  if ( PWRTest == false ) {
+    Kokkos::parallel_reduce( 100, KOKKOS_LAMBDA( const int & i, double & sum ) {
+      sum += a( i, 1 ) + a( i, 2 );
+      sum += a( i, 0 ) - a( i, 3 );
+      sum += a( i, 4 ) + a( i, 0 );
+      sum += a( i, 2 ) - a( i, 1 );
+      sum += a( i, 3 ) + a( i, 4 );
     }, unmanaged_result( & result ) );
-  } else {
-    typedef typename policy_type::member_type team_member ;
-    Kokkos::parallel_reduce(policy_type(25,Kokkos::AUTO),KOKKOS_LAMBDA(const team_member & dev, double& sum)  {
-      const int begin = dev.league_rank() * 4 ;
-      const int end   = begin + 4 ;
-      for ( int i = begin + dev.team_rank() ; i < end ; i += dev.team_size() ) {
-        sum += a(i,1) + a(i,2);
-        sum += a(i,0) - a(i,3);
-        sum += a(i,4) + a(i,0);
-        sum += a(i,2) - a(i,1);
-        sum += a(i,3) + a(i,4);
+  }
+  else {
+    typedef typename policy_type::member_type team_member;
+    Kokkos::parallel_reduce( policy_type( 25, Kokkos::AUTO ), KOKKOS_LAMBDA( const team_member & dev, double & sum ) {
+      const int begin = dev.league_rank() * 4;
+      const int end   = begin + 4;
+      for ( int i = begin + dev.team_rank(); i < end; i += dev.team_size() ) {
+        sum += a( i, 1 ) + a( i, 2 );
+        sum += a( i, 0 ) - a( i, 3 );
+        sum += a( i, 4 ) + a( i, 0 );
+        sum += a( i, 2 ) - a( i, 1 );
+        sum += a( i, 3 ) + a( i, 4 );
       }
     }, unmanaged_result( & result ) );
   }
 
   return result;
 }
-
 #else
-template<class DeviceType, bool PWRTest>
+template< class DeviceType, bool PWRTest >
 double ReduceTestLambda() {
-  return ReduceTestFunctor<DeviceType,PWRTest>();
+  return ReduceTestFunctor< DeviceType, PWRTest >();
 }
 #endif
 
-template<class DeviceType>
-double TestVariantLambda(int test) {
-  switch (test) {
-    case 1: return AddTestLambda<DeviceType,false>();
-    case 2: return AddTestLambda<DeviceType,true>();
-    case 3: return ReduceTestLambda<DeviceType,false>();
-    case 4: return ReduceTestLambda<DeviceType,true>();
+template< class DeviceType >
+double TestVariantLambda( int test ) {
+  switch ( test ) {
+    case 1: return AddTestLambda< DeviceType, false >();
+    case 2: return AddTestLambda< DeviceType, true >();
+    case 3: return ReduceTestLambda< DeviceType, false >();
+    case 4: return ReduceTestLambda< DeviceType, true >();
   }
+
   return 0;
 }
 
-
-template<class DeviceType>
-double TestVariantFunctor(int test) {
-  switch (test) {
-    case 1: return AddTestFunctor<DeviceType,false>();
-    case 2: return AddTestFunctor<DeviceType,true>();
-    case 3: return ReduceTestFunctor<DeviceType,false>();
-    case 4: return ReduceTestFunctor<DeviceType,true>();
+template< class DeviceType >
+double TestVariantFunctor( int test ) {
+  switch ( test ) {
+    case 1: return AddTestFunctor< DeviceType, false >();
+    case 2: return AddTestFunctor< DeviceType, true >();
+    case 3: return ReduceTestFunctor< DeviceType, false >();
+    case 4: return ReduceTestFunctor< DeviceType, true >();
   }
+
   return 0;
 }
 
-template<class DeviceType>
-bool Test(int test) {
-
+template< class DeviceType >
+bool Test( int test ) {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  double res_functor = TestVariantFunctor<DeviceType>(test);
-  double res_lambda = TestVariantLambda<DeviceType>(test);
+  double res_functor = TestVariantFunctor< DeviceType >( test );
+  double res_lambda = TestVariantLambda< DeviceType >( test );
 
-  char testnames[5][256] = {" "
-                            ,"AddTest","AddTest TeamPolicy"
-                            ,"ReduceTest","ReduceTest TeamPolicy"
+  char testnames[5][256] = { " "
+                           , "AddTest", "AddTest TeamPolicy"
+                           , "ReduceTest", "ReduceTest TeamPolicy"
                            };
   bool passed = true;
 
@@ -322,13 +333,13 @@ bool Test(int test) {
     std::cout << "CXX11 ( test = '"
               << testnames[test] << "' FAILED : "
               << res_functor << " != " << res_lambda
-              << std::endl ;
+              << std::endl;
   }
 
-  return passed ;
+  return passed;
 #else
   return true;
 #endif
 }
 
-}
+} // namespace TestCXX11
diff --git a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
index 359e17a44f1642d630b97987f8d049fc3217a9fb..b53b42b8e05bc906c17f2ad59bdf1ebb9fd62ef7 100644
--- a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
+++ b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,10 +36,11 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
+
 #include <Kokkos_Core.hpp>
 
 #ifndef TESTCXX11DEDUCTION_HPP
@@ -52,43 +53,40 @@ struct TestReductionDeductionTagB {};
 
 template < class ExecSpace >
 struct TestReductionDeductionFunctor {
-
   // KOKKOS_INLINE_FUNCTION
-  // void operator()( long i , long & value ) const
-  // { value += i + 1 ; }
+  // void operator()( long i, long & value ) const
+  // { value += i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( TestReductionDeductionTagA , long i , long & value ) const
+  void operator()( TestReductionDeductionTagA, long i, long & value ) const
   { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TestReductionDeductionTagB & , const long i , long & value ) const
-  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ) ; }
-
+  void operator()( const TestReductionDeductionTagB &, const long i, long & value ) const
+  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ); }
 };
 
 template< class ExecSpace >
 void test_reduction_deduction()
 {
-  typedef TestReductionDeductionFunctor< ExecSpace > Functor ;
+  typedef TestReductionDeductionFunctor< ExecSpace > Functor;
 
-  const long N = 50 ;
-  // const long answer  = N % 2 ? ( N * ((N+1)/2 )) : ( (N/2) * (N+1) );
-  const long answerA = N % 2 ? ( (2*N) * (((2*N)+1)/2 )) : ( ((2*N)/2) * ((2*N)+1) );
-  const long answerB = N % 2 ? ( (3*N) * (((3*N)+1)/2 )) : ( ((3*N)/2) * ((3*N)+1) );
-  long result = 0 ;
+  const long N = 50;
+  // const long answer  = N % 2 ? ( N * ( ( N + 1 ) / 2 ) ) : ( ( N / 2 ) * ( N + 1 ) );
+  const long answerA = N % 2 ? ( ( 2 * N ) * ( ( ( 2 * N ) + 1 ) / 2 ) ) : ( ( ( 2 * N ) / 2 ) * ( ( 2 * N ) + 1 ) );
+  const long answerB = N % 2 ? ( ( 3 * N ) * ( ( ( 3 * N ) + 1 ) / 2 ) ) : ( ( ( 3 * N ) / 2 ) * ( ( 3 * N ) + 1 ) );
+  long result = 0;
 
-  // Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>(0,N) , Functor() , result );
-  // ASSERT_EQ( answer , result );
-  
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagA>(0,N) , Functor() , result );
-  ASSERT_EQ( answerA , result );
-  
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagB>(0,N) , Functor() , result );
-  ASSERT_EQ( answerB , result );
-}
+  // Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), Functor(), result );
+  // ASSERT_EQ( answer, result );
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagA >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerA, result );
 
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TestReductionDeductionTagB >( 0, N ), Functor(), result );
+  ASSERT_EQ( answerB, result );
 }
 
-#endif
+} // namespace TestCXX11
 
+#endif
diff --git a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
index 5add656a4d7aaa7b70bc247a9ed3af1599e27211..45554383446ec13794f9e22bb0819477a7bdb278 100644
--- a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
+++ b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -47,17 +47,17 @@
 
 namespace TestCompilerMacros {
 
-template<class DEVICE_TYPE>
+template< class DEVICE_TYPE >
 struct AddFunctor {
   typedef DEVICE_TYPE execution_space;
-  typedef typename Kokkos::View<int**,execution_space> type;
-  type a,b;
+  typedef typename Kokkos::View< int**, execution_space > type;
+  type a, b;
   int length;
 
-  AddFunctor(type a_, type b_):a(a_),b(b_),length(a.dimension_1()) {}
+  AddFunctor( type a_, type b_ ) : a( a_ ), b( b_ ), length( a.dimension_1() ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(int i) const {
+  void operator()( int i ) const {
 #ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
     #pragma unroll
 #endif
@@ -75,21 +75,23 @@ struct AddFunctor {
     #pragma simd
 #endif
 #endif
-    for(int j=0;j<length;j++)
-      a(i,j) += b(i,j);
+    for ( int j = 0; j < length; j++ ) {
+      a( i, j ) += b( i, j );
+    }
   }
 };
 
-template<class DeviceType>
+template< class DeviceType >
 bool Test() {
-  typedef typename Kokkos::View<int**,DeviceType> type;
-  type a("A",1024,128);
-  type b("B",1024,128);
+  typedef typename Kokkos::View< int**, DeviceType > type;
+  type a( "A", 1024, 128 );
+  type b( "B", 1024, 128 );
 
-  AddFunctor<DeviceType> f(a,b);
-  Kokkos::parallel_for(1024,f);
+  AddFunctor< DeviceType > f( a, b );
+  Kokkos::parallel_for( 1024, f );
   DeviceType::fence();
+
   return true;
 }
 
-}
+} // namespace TestCompilerMacros
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
index 7e08f67e69721dc803f1ea4a23cbe3328af391dc..f85a35c096516fe77c39cfaaa1778a9d5bb895ef 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
@@ -45,13 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestAtomic.hpp>
-
 #include <TestViewAPI.hpp>
-
 #include <TestReduce.hpp>
 #include <TestScan.hpp>
 #include <TestTeam.hpp>
@@ -78,24 +75,25 @@ protected:
 
 TEST_F( defaultdevicetype, host_space_access )
 {
-  typedef Kokkos::HostSpace::execution_space host_exec_space ;
-  typedef Kokkos::Device< host_exec_space , Kokkos::HostSpace > device_space ;
-  typedef Kokkos::Impl::HostMirror< Kokkos::DefaultExecutionSpace >::Space mirror_space ;
+  typedef Kokkos::HostSpace::execution_space host_exec_space;
+  typedef Kokkos::Device< host_exec_space, Kokkos::HostSpace > device_space;
+  typedef Kokkos::Impl::HostMirror< Kokkos::DefaultExecutionSpace >::Space mirror_space;
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< host_exec_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< host_exec_space, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< device_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< device_space, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< mirror_space , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< mirror_space, Kokkos::HostSpace >::accessible, "" );
 }
 
-TEST_F( defaultdevicetype, view_api) {
-  TestViewAPI< double , Kokkos::DefaultExecutionSpace >();
+TEST_F( defaultdevicetype, view_api )
+{
+  TestViewAPI< double, Kokkos::DefaultExecutionSpace >();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index 7778efde301bb9fd8856c9743bfcaaea2d7b3095..401da58a5838d7cab5adaf38a00d4231f51721d2 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,376 +44,425 @@
 #include <gtest/gtest.h>
 
 #include <Kokkos_Core.hpp>
+
 #ifdef KOKKOS_ENABLE_OPENMP
 #include <omp.h>
 #endif
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 namespace Test {
 
 namespace Impl {
 
-  char** init_kokkos_args(bool do_threads,bool do_numa,bool do_device,bool do_other, int& nargs, Kokkos::InitArguments& init_args) {
-    nargs = (do_threads?1:0) +
-            (do_numa?1:0) +
-            (do_device?1:0) +
-            (do_other?4:0);
-    char** args_kokkos = new char*[nargs];
-    for(int i = 0; i < nargs; i++)
-      args_kokkos[i] = new char[20];
+char** init_kokkos_args( bool do_threads, bool do_numa, bool do_device, bool do_other, int & nargs, Kokkos::InitArguments & init_args ) {
+  nargs = ( do_threads ? 1 : 0 ) +
+          ( do_numa ? 1 : 0 ) +
+          ( do_device ? 1 : 0 ) +
+          ( do_other ? 4 : 0 );
 
-    int threads_idx = do_other?1:0;
-    int numa_idx = (do_other?3:0) + (do_threads?1:0);
-    int device_idx = (do_other?3:0) + (do_threads?1:0) + (do_numa?1:0);
+  char** args_kokkos = new char*[nargs];
+  for ( int i = 0; i < nargs; i++ ) {
+    args_kokkos[i] = new char[20];
+  }
 
+  int threads_idx = do_other ? 1 : 0;
+  int numa_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 );
+  int device_idx = ( do_other ? 3 : 0 ) + ( do_threads ? 1 : 0 ) + ( do_numa ? 1 : 0 );
 
-    if(do_threads) {
-      int nthreads = 3;
+  if ( do_threads ) {
+    int nthreads = 3;
 
 #ifdef KOKKOS_ENABLE_OPENMP
-      if(omp_get_max_threads() < 3)
-        nthreads = omp_get_max_threads();
+    if ( omp_get_max_threads() < 3 )
+      nthreads = omp_get_max_threads();
 #endif
 
-      if(Kokkos::hwloc::available())  {
-        if(Kokkos::hwloc::get_available_threads_per_core()<3)
-            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
-                       * Kokkos::hwloc::get_available_numa_count();
-      }
-
-#ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        nthreads = 1;
-      }
-#endif
-      init_args.num_threads = nthreads;
-      sprintf(args_kokkos[threads_idx],"--threads=%i",nthreads);
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 )
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_numa) {
-      int numa = 1;
-      if(Kokkos::hwloc::available())
-        numa = Kokkos::hwloc::get_available_numa_count();
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        numa = 1;
-      }
-#endif
-
-      init_args.num_numa = numa;
-      sprintf(args_kokkos[numa_idx],"--numa=%i",numa);
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
     }
+#endif
 
-    if(do_device) {
+    init_args.num_threads = nthreads;
+    sprintf( args_kokkos[threads_idx], "--threads=%i", nthreads );
+  }
 
-      init_args.device_id = 0;
-      sprintf(args_kokkos[device_idx],"--device=%i",0);
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_other) {
-      sprintf(args_kokkos[0],"--dummyarg=1");
-      sprintf(args_kokkos[threads_idx+(do_threads?1:0)],"--dummy2arg");
-      sprintf(args_kokkos[threads_idx+(do_threads?1:0)+1],"dummy3arg");
-      sprintf(args_kokkos[device_idx+(do_device?1:0)],"dummy4arg=1");
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
     }
+#endif
 
+    init_args.num_numa = numa;
+    sprintf( args_kokkos[numa_idx], "--numa=%i", numa );
+  }
 
-    return args_kokkos;
+  if ( do_device ) {
+    init_args.device_id = 0;
+    sprintf( args_kokkos[device_idx], "--device=%i", 0 );
   }
 
-  Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, bool do_device) {
-    Kokkos::InitArguments args;
+  if ( do_other ) {
+    sprintf( args_kokkos[0], "--dummyarg=1" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) ], "--dummy2arg" );
+    sprintf( args_kokkos[ threads_idx + ( do_threads ? 1 : 0 ) + 1 ], "dummy3arg" );
+    sprintf( args_kokkos[ device_idx + ( do_device ? 1 : 0 ) ], "dummy4arg=1" );
+  }
+
+  return args_kokkos;
+}
+
+Kokkos::InitArguments init_initstruct( bool do_threads, bool do_numa, bool do_device ) {
+  Kokkos::InitArguments args;
 
-    if(do_threads) {
-      int nthreads = 3;
+  if ( do_threads ) {
+    int nthreads = 3;
 
 #ifdef KOKKOS_ENABLE_OPENMP
-      if(omp_get_max_threads() < 3)
-        nthreads = omp_get_max_threads();
+    if ( omp_get_max_threads() < 3 ) {
+      nthreads = omp_get_max_threads();
+    }
 #endif
 
-      if(Kokkos::hwloc::available())  {
-        if(Kokkos::hwloc::get_available_threads_per_core()<3)
-            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
-                       * Kokkos::hwloc::get_available_numa_count();
+    if ( Kokkos::hwloc::available() ) {
+      if ( Kokkos::hwloc::get_available_threads_per_core() < 3 ) {
+        nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                   * Kokkos::hwloc::get_available_numa_count();
       }
+    }
+
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        nthreads = 1;
-      }
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      nthreads = 1;
+    }
 #endif
 
-      args.num_threads = nthreads;
+    args.num_threads = nthreads;
+  }
+
+  if ( do_numa ) {
+    int numa = 1;
+    if ( Kokkos::hwloc::available() ) {
+      numa = Kokkos::hwloc::get_available_numa_count();
     }
 
-    if(do_numa) {
-      int numa = 1;
-      if(Kokkos::hwloc::available())
-        numa = Kokkos::hwloc::get_available_numa_count();
 #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
-         std::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
-        numa = 1;
-      }
-#endif
-      args.num_numa = numa;
+    if ( std::is_same< Kokkos::Serial, Kokkos::DefaultExecutionSpace >::value ||
+         std::is_same< Kokkos::Serial, Kokkos::DefaultHostExecutionSpace >::value ) {
+      numa = 1;
     }
+#endif
 
-    if(do_device) {
-      args.device_id = 0;
-    }
+    args.num_numa = numa;
+  }
 
-    return args;
+  if ( do_device ) {
+    args.device_id = 0;
   }
 
-  void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
-    ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1);
-    ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1);
-
-    //Figure out the number of threads the HostSpace ExecutionSpace should have initialized to
-    int expected_nthreads = argstruct.num_threads;
-    if(expected_nthreads<1) {
-      if(Kokkos::hwloc::available()) {
-        expected_nthreads = Kokkos::hwloc::get_available_numa_count()
-                          * Kokkos::hwloc::get_available_cores_per_numa()
-                          * Kokkos::hwloc::get_available_threads_per_core();
-      } else {
-        #ifdef KOKKOS_ENABLE_OPENMP
-        if(std::is_same<Kokkos::HostSpace::execution_space,Kokkos::OpenMP>::value) {
-          expected_nthreads = omp_get_max_threads();
-        } else
-        #endif
-          expected_nthreads = 1;
+  return args;
+}
+
+void check_correct_initialization( const Kokkos::InitArguments & argstruct ) {
+  ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1 );
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1 );
+
+  // Figure out the number of threads the HostSpace ExecutionSpace should have initialized to.
+  int expected_nthreads = argstruct.num_threads;
 
+  if ( expected_nthreads < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                        * Kokkos::hwloc::get_available_cores_per_numa()
+                        * Kokkos::hwloc::get_available_threads_per_core();
+    }
+    else {
+#ifdef KOKKOS_ENABLE_OPENMP
+      if ( std::is_same< Kokkos::HostSpace::execution_space, Kokkos::OpenMP >::value ) {
+        expected_nthreads = omp_get_max_threads();
       }
-      #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
-         std::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value ) 
+      else
+#endif
         expected_nthreads = 1;
-      #endif
     }
 
-    int expected_numa = argstruct.num_numa;
-    if(expected_numa<1) {
-      if(Kokkos::hwloc::available()) {
-        expected_numa = Kokkos::hwloc::get_available_numa_count();
-      } else {
-        expected_numa = 1;
-      }
-      #ifdef KOKKOS_ENABLE_SERIAL
-      if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
-         std::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value )
-        expected_numa = 1;
-      #endif
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value ) {
+      expected_nthreads = 1;
     }
-    ASSERT_EQ(Kokkos::HostSpace::execution_space::thread_pool_size(),expected_nthreads);
+#endif
+  }
 
-#ifdef KOKKOS_ENABLE_CUDA
-    if(std::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value) {
-      int device;
-      cudaGetDevice( &device );
-      int expected_device = argstruct.device_id;
-      if(argstruct.device_id<0) {
-        expected_device = 0;
-      }
-      ASSERT_EQ(expected_device,device);
+  int expected_numa = argstruct.num_numa;
+
+  if ( expected_numa < 1 ) {
+    if ( Kokkos::hwloc::available() ) {
+      expected_numa = Kokkos::hwloc::get_available_numa_count();
+    }
+    else {
+      expected_numa = 1;
     }
+
+#ifdef KOKKOS_ENABLE_SERIAL
+    if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ||
+         std::is_same< Kokkos::DefaultHostExecutionSpace, Kokkos::Serial >::value )
+      expected_numa = 1;
 #endif
   }
 
-  //ToDo: Add check whether correct number of threads are actually started
-  void test_no_arguments() {
-    Kokkos::initialize();
-    check_correct_initialization(Kokkos::InitArguments());
-    Kokkos::finalize();
-  }
+  ASSERT_EQ( Kokkos::HostSpace::execution_space::thread_pool_size(), expected_nthreads );
 
-  void test_commandline_args(int nargs, char** args, const Kokkos::InitArguments& argstruct) {
-    Kokkos::initialize(nargs,args);
-    check_correct_initialization(argstruct);
-    Kokkos::finalize();
-  }
 
-  void test_initstruct_args(const Kokkos::InitArguments& args) {
-    Kokkos::initialize(args);
-    check_correct_initialization(args);
-    Kokkos::finalize();
+#ifdef KOKKOS_ENABLE_CUDA
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Cuda >::value ) {
+    int device;
+    cudaGetDevice( &device );
+
+    int expected_device = argstruct.device_id;
+    if ( argstruct.device_id < 0 ) {
+      expected_device = 0;
+    }
+
+    ASSERT_EQ( expected_device, device );
   }
+#endif
+}
+
+// TODO: Add check whether correct number of threads are actually started.
+void test_no_arguments() {
+  Kokkos::initialize();
+  check_correct_initialization( Kokkos::InitArguments() );
+  Kokkos::finalize();
 }
 
+void test_commandline_args( int nargs, char** args, const Kokkos::InitArguments & argstruct ) {
+  Kokkos::initialize( nargs, args );
+  check_correct_initialization( argstruct );
+  Kokkos::finalize();
+}
+
+void test_initstruct_args( const Kokkos::InitArguments & args ) {
+  Kokkos::initialize( args );
+  check_correct_initialization( args );
+  Kokkos::finalize();
+}
+
+} // namespace Impl
+
 class defaultdevicetypeinit : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {
-  }
+  static void SetUpTestCase() {}
 
-  static void TearDownTestCase()
-  {
-  }
+  static void TearDownTestCase() {}
 };
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_01
-TEST_F( defaultdevicetypeinit, no_args) {
+TEST_F( defaultdevicetypeinit, no_args )
+{
   Impl::test_no_arguments();
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_02
-TEST_F( defaultdevicetypeinit, commandline_args_empty) {
+TEST_F( defaultdevicetypeinit, commandline_args_empty )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_03
-TEST_F( defaultdevicetypeinit, commandline_args_other) {
+TEST_F( defaultdevicetypeinit, commandline_args_other )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,false,true,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, false, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_04
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,false,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, false, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_05
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,false,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, false, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_06
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_07
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,false,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_08
-TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_numa_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,true,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, true, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_09
-TEST_F( defaultdevicetypeinit, commandline_args_device) {
+TEST_F( defaultdevicetypeinit, commandline_args_device )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(false,false,true,false,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( false, false, true, false, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_10
-TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other )
+{
   Kokkos::InitArguments argstruct;
   int nargs = 0;
-  char** args = Impl::init_kokkos_args(true,true,true,true,nargs, argstruct);
-  Impl::test_commandline_args(nargs,args,argstruct);
-  for(int i = 0; i < nargs; i++)
+  char** args = Impl::init_kokkos_args( true, true, true, true, nargs, argstruct );
+  Impl::test_commandline_args( nargs, args, argstruct );
+
+  for ( int i = 0; i < nargs; i++ ) {
     delete [] args[i];
+  }
   delete [] args;
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_11
-TEST_F( defaultdevicetypeinit, initstruct_default) {
+TEST_F( defaultdevicetypeinit, initstruct_default )
+{
   Kokkos::InitArguments args;
-  Impl::test_initstruct_args(args);
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_12
-TEST_F( defaultdevicetypeinit, initstruct_nthreads) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,false,false);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, false );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_13
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,true,false);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, false );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_14
-TEST_F( defaultdevicetypeinit, initstruct_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(false,false,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( false, false, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_15
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,false,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, false, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
 #ifdef KOKKOS_DEFAULTDEVICETYPE_INIT_TEST_16
-TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) {
-  Kokkos::InitArguments args = Impl::init_initstruct(true,true,true);
-  Impl::test_initstruct_args(args);
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device )
+{
+  Kokkos::InitArguments args = Impl::init_initstruct( true, true, true );
+  Impl::test_initstruct_args( args );
 }
 #endif
 
-
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
index dd148a062446f253bbcbc854b775eefd85debf79..4fdfa959107becae384ffa5c5e09d444e9299670 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_a.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_a) {
+TEST_F( defaultdevicetype, reduce_instantiation_a )
+{
   TestReduceCombinatoricalInstantiation<>::execute_a();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
index c8edfdd5c39d575400408e8dbf5fb3cdd2005d66..841f34e03dd1f9900d304a8f6e889a5d30dc2a65 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_b.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_b) {
+TEST_F( defaultdevicetype, reduce_instantiation_b )
+{
   TestReduceCombinatoricalInstantiation<>::execute_b();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
index 405d49a9b891619f3d823a5559e7751b8f3b885b..602863be3852a603d6c8e803752ad4a67709c0d5 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_c.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,12 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestReduce.hpp>
 
-
 namespace Test {
 
 class defaultdevicetype : public ::testing::Test {
@@ -66,11 +64,11 @@ protected:
   }
 };
 
-
-TEST_F( defaultdevicetype, reduce_instantiation_c) {
+TEST_F( defaultdevicetype, reduce_instantiation_c )
+{
   TestReduceCombinatoricalInstantiation<>::execute_c();
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
index 426cc4f06c6157d37db40ea2feeceac242710ea0..5d3665b905434d1310dc51e430940b17690baac1 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType_d.cpp
@@ -45,13 +45,10 @@
 
 #include <Kokkos_Core.hpp>
 
-#if !defined(KOKKOS_ENABLE_CUDA) || defined(__CUDACC__)
-//----------------------------------------------------------------------------
+#if !defined( KOKKOS_ENABLE_CUDA ) || defined( __CUDACC__ )
 
 #include <TestAtomic.hpp>
-
 #include <TestViewAPI.hpp>
-
 #include <TestReduce.hpp>
 #include <TestScan.hpp>
 #include <TestTeam.hpp>
@@ -76,162 +73,165 @@ protected:
   }
 };
 
-TEST_F( defaultdevicetype, test_utilities) {
+TEST_F( defaultdevicetype, test_utilities )
+{
   test_utilities();
 }
 
-TEST_F( defaultdevicetype, long_reduce) {
-  TestReduce< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce )
+{
+  TestReduce< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, double_reduce) {
-  TestReduce< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, double_reduce )
+{
+  TestReduce< double, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-TEST_F( defaultdevicetype, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+TEST_F( defaultdevicetype, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::DefaultExecutionSpace >( 100000 );
 }
 
-
-TEST_F( defaultdevicetype , atomics )
+TEST_F( defaultdevicetype, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::DefaultExecutionSpace >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::DefaultExecutionSpace >( 100, 3 ) ) );
 }
 
-/*TEST_F( defaultdevicetype , view_remap )
+/*TEST_F( defaultdevicetype, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::DefaultExecutionSpace > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::DefaultExecutionSpace > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::DefaultExecutionSpace > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
-}*/
-
-//----------------------------------------------------------------------------
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::DefaultExecutionSpace > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::DefaultExecutionSpace > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::DefaultExecutionSpace > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+  for ( size_t i3 = 0; i3 < N3; ++i3 ) {
+    for ( size_t i2 = 0; i2 < N2; ++i2 ) {
+      for ( size_t i1 = 0; i1 < N1; ++i1 ) {
+        for ( size_t i0 = 0; i0 < N0; ++i0 ) {
+          input( i0, i1, i2, i3 ) = ++value;
+        }
+      }
+    }
+  }
 
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+  for ( size_t i3 = 0; i3 < N3; ++i3 ) {
+    for ( size_t i2 = 0; i2 < N2; ++i2 ) {
+      for ( size_t i1 = 0; i1 < N1; ++i1 ) {
+        for ( size_t i0 = 0; i0 < N0; ++i0 ) {
+          ++value;
+          ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+        }
+      }
+    }
+  }
+}*/
 
-TEST_F( defaultdevicetype , view_aggregate )
+TEST_F( defaultdevicetype, view_aggregate )
 {
   TestViewAggregate< Kokkos::DefaultExecutionSpace >();
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( defaultdevicetype , scan )
+TEST_F( defaultdevicetype, scan )
 {
-  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1 , 1000 );
+  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1, 1000 );
   TestScan< Kokkos::DefaultExecutionSpace >( 1000000 );
   TestScan< Kokkos::DefaultExecutionSpace >( 10000000 );
   Kokkos::DefaultExecutionSpace::fence();
 }
 
-
-//----------------------------------------------------------------------------
-
-TEST_F( defaultdevicetype , compiler_macros )
+TEST_F( defaultdevicetype, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::DefaultExecutionSpace >() ) );
 }
 
-
-//----------------------------------------------------------------------------
-TEST_F( defaultdevicetype , cxx11 )
+TEST_F( defaultdevicetype, cxx11 )
 {
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(1) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(2) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(3) ) );
-  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 1 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 2 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 3 ) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >( 4 ) ) );
 }
 
-TEST_F( defaultdevicetype , team_vector )
+#if !defined(KOKKOS_CUDA_CLANG_WORKAROUND) && !defined(KOKKOS_ARCH_PASCAL)
+TEST_F( defaultdevicetype, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >( 5 ) ) );
 }
+#endif
 
-TEST_F( defaultdevicetype , malloc )
+TEST_F( defaultdevicetype, malloc )
 {
-  int* data = (int*) Kokkos::kokkos_malloc(100*sizeof(int));
-  ASSERT_NO_THROW(data = (int*) Kokkos::kokkos_realloc(data,120*sizeof(int)));
-  Kokkos::kokkos_free(data);
+  int* data = (int*) Kokkos::kokkos_malloc( 100 * sizeof( int ) );
+  ASSERT_NO_THROW( data = (int*) Kokkos::kokkos_realloc( data, 120 * sizeof( int ) ) );
+  Kokkos::kokkos_free( data );
 
-  int* data2 = (int*) Kokkos::kokkos_malloc(0);
-  ASSERT_TRUE(data2==NULL);
-  Kokkos::kokkos_free(data2);
+  int* data2 = (int*) Kokkos::kokkos_malloc( 0 );
+  ASSERT_TRUE( data2 == NULL );
+  Kokkos::kokkos_free( data2 );
 }
 
-} // namespace test
+} // namespace Test
 
 #endif
diff --git a/lib/kokkos/core/unit_test/TestHWLOC.cpp b/lib/kokkos/core/unit_test/TestHWLOC.cpp
index 1637dec5de4ff762cfbd259ee47932b5e85eb4d0..d03d9b816f9c3ac3ee85b61886baa243e5160714 100644
--- a/lib/kokkos/core/unit_test/TestHWLOC.cpp
+++ b/lib/kokkos/core/unit_test/TestHWLOC.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -44,26 +44,24 @@
 #include <gtest/gtest.h>
 
 #include <iostream>
+
 #include <Kokkos_hwloc.hpp>
 
 namespace Test {
 
 class hwloc : public ::testing::Test {
 protected:
-  static void SetUpTestCase()
-  {}
+  static void SetUpTestCase() {}
 
-  static void TearDownTestCase()
-  {}
+  static void TearDownTestCase() {}
 };
 
-TEST_F( hwloc, query)
+TEST_F( hwloc, query )
 {
   std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]"
             << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]"
             << " PU[" << Kokkos::hwloc::get_available_threads_per_core()  << "]"
-            << std::endl ;
-}
-
+            << std::endl;
 }
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
index 9894d1ce697c1f109163f7711e62f12cfceef703..1dc349cc1268e680aabc0859a771c7a786a388de 100644
--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -47,509 +47,1675 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template <typename ExecSpace >
 struct TestMDRange_2D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType**, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
 
-  using DataType     = int ;
-  using ViewType     = typename Kokkos::View< DataType** ,  ExecSpace > ;
-  using HostViewType = typename ViewType::HostMirror ;
+  ViewType input_view;
 
-  ViewType input_view ;
+  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view( "input_view", N0, N1 ) {}
 
-  TestMDRange_2D( const DataType N0, const DataType N1 ) : input_view("input_view", N0, N1) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j ) const
+  {
+    input_view( i, j ) = 1;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , const int j ) const
+  void operator()( const int i, const int j, double &lsum ) const
   {
-    input_view(i,j) = 1;
+    lsum += input_view( i, j ) * 2;
   }
 
+  // Tagged operator: selected when the policy's work tag is InitTag.
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j ) const
+  {
+    input_view( i, j ) = 3;
+  }
 
-  static void test_for2( const int64_t N0, const int64_t N1 )
+  static void test_reduce2( const int N0, const int N1 )
   {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 2, 6 } } );
+
+      TestMDRange_2D functor( N0, N1 );
 
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 );
+    }
+  } // end test_reduce2
+
+  static void test_for2( const int N0, const int N1 )
+  {
     using namespace Kokkos::Experimental;
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> >;
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, InitTag > range_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Default Layouts + InitTag op() + Default Tile: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "No info: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 4, 4 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "D D: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {3,3} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Left , Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {7,7} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 7, 7 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "L R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {16,16} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 16, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R L: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0}, {N0,N1}, {5,16} );
-      TestMDRange_2D functor(N0,N1);
+      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 5, 16 } } );
+      TestMDRange_2D functor( N0, N1 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          if ( h_view(i,j) != 1 ) {
-            ++counter;
-          }
-        }}
-      if ( counter != 0 )
-        printf(" Errors in test_for2; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      {
+        if ( h_view( i, j ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "R R: Errors in test_for2; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
-
-  } //end test_for2
-}; //MDRange_2D
+  } // end test_for2
+}; // MDRange_2D
 
 template <typename ExecSpace >
 struct TestMDRange_3D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType***, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
 
-  using DataType = int ;
-  using ViewType     = typename Kokkos::View< DataType*** ,  ExecSpace > ;
-  using HostViewType = typename ViewType::HostMirror ;
+  ViewType input_view;
 
-  ViewType input_view ;
+  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0, N1, N2 ) {}
 
-  TestMDRange_3D( const DataType N0, const DataType N1, const DataType N2 ) : input_view("input_view", N0, N1, N2) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k ) const
+  {
+    input_view( i, j, k ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, double &lsum ) const
+  {
+    lsum += input_view( i, j, k ) * 2;
+  }
 
+  // tagged operators
+  struct InitTag {};
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , const int j , const int k ) const
+  void operator()( const InitTag &, const int i, const int j, const int k ) const
   {
-    input_view(i,j,k) = 1;
+    input_view( i, j, k ) = 3;
   }
 
-  static void test_for3( const int64_t N0, const int64_t N1, const int64_t N2 )
+  static void test_reduce3( const int N0, const int N1, const int N2 )
   {
     using namespace Kokkos::Experimental;
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 6 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+      double sum = 0.0;
+      md_parallel_reduce( range, functor, sum );
+
+      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+    }
+  } // end test_reduce3
+
+  static void test_for3( const int N0, const int N1, const int N2 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Default, Iterate::Default >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 5, 7 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Default>, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 8, 8, 8 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Flat, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 2, 4, 2 } } );
+      TestMDRange_3D functor( N0, N1, N2 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      {
+        if ( h_view( i, j, k ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for3; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for3
+};
+
+template <typename ExecSpace >
+struct TestMDRange_4D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+
+  TestMDRange_4D( const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0, N1, N2, N3 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l ) const
+  {
+    input_view( i, j, k, l ) = 3;
+  }
+
+  static void test_for4( const int N0, const int N1, const int N2, const int N3 )
+  {
+    using namespace Kokkos::Experimental;
+
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4> > range_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Flat >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Left, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {3,5,7} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Left >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {8,8,8} );
-      TestMDRange_3D functor(N0,N1,N2);
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
     {
-      using range_type = MDRangePolicy< ExecSpace, Rank<3, Iterate::Right, Iterate::Right >, Kokkos::IndexType<int> >;
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
 
-      range_type range( {0,0,0}, {N0,N1,N2}, {2,4,2} );
-      TestMDRange_3D functor(N0,N1,N2);
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
 
       md_parallel_for( range, functor );
 
       HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
-      Kokkos::deep_copy( h_view , functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i=0; i<N0; ++i ) {
-        for ( int j=0; j<N1; ++j ) {
-          for ( int k=0; k<N2; ++k ) {
-          if ( h_view(i,j,k) != 1 ) {
-            ++counter;
-          }
-        }}}
-      if ( counter != 0 )
-        printf(" Errors in test_for3; mismatches = %d\n\n",counter);
-      ASSERT_EQ( counter , 0 );
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
     }
 
-  } //end test_for3
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 4, 4, 4, 4 } } );
+
+      TestMDRange_4D functor( N0, N1, N2, N3 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      {
+        if ( h_view( i, j, k, l ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for4; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  } // end test_for4
 };
 
-} /* namespace */
-} /* namespace Test */
+template <typename ExecSpace >
+struct TestMDRange_5D {
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType*****, ExecSpace >;
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view;
+
+  TestMDRange_5D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0, N1, N2, N3, N4 ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m ) * 2;
+  }
+
+  // tagged operators
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m ) const
+  {
+    input_view( i, j, k, l, m ) = 3;
+  }
+
+  static void test_for5( const int N0, const int N1, const int N2, const int N3, const int N4 )
+  { // Runs md_parallel_for over a rank-5 range with eight MDRangePolicy variants and checks every element on the host.
+    using namespace Kokkos::Experimental;
+    // Case 1: Rank<5> with no tile argument; every element is expected to be 1.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view ); // h_view is what the loops below inspect
+
+      int counter = 0; // number of elements that differ from the expected value
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 2: InitTag work tag with explicit tiles { 3, 3, 3, 3, 7 }; every element is expected to be 3.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 7 } } );
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 3: Rank<5> with IndexType<int> and tiles { 4, 4, 4, 2, 2 }; expected value is 1 (as in all remaining cases).
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 4: same tiles, with Iterate::Default spelled out for both Rank iteration parameters.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 5: Rank<5, Iterate::Left, Iterate::Left>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 6: Rank<5, Iterate::Left, Iterate::Right>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 7: Rank<5, Iterate::Right, Iterate::Left>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 8: Rank<5, Iterate::Right, Iterate::Right>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 4, 4, 4, 2, 2 } } );
+
+      TestMDRange_5D functor( N0, N1, N2, N3, N4 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      {
+        if ( h_view( i, j, k, l, m ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for5; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
+
+template <typename ExecSpace >
+struct TestMDRange_6D { // Functor plus test driver for rank-6 MDRangePolicy used with md_parallel_for.
+  using DataType     = int;
+  using ViewType     = typename Kokkos::View< DataType******, ExecSpace >; // six-index view of DataType
+  using HostViewType = typename ViewType::HostMirror;
+
+  ViewType input_view; // written by the operator() overloads, read back by test_for6
+  // Creates input_view, labelled "input_view", with extents N0..N5.
+  TestMDRange_6D( const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0, N1, N2, N3, N4, N5 ) {}
+  // Untagged kernel body: stores 1 at ( i, j, k, l, m, n ).
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 1;
+  }
+  // Overload with an accumulator: adds twice the element at ( i, j, k, l, m, n ) to lsum. test_for6 does not use it.
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i, const int j, const int k, const int l, const int m, const int n, double &lsum ) const
+  {
+    lsum += input_view( i, j, k, l, m, n ) * 2;
+  }
+
+  // Tagged operators: the InitTag overload stores 3 instead of 1, which is what the InitTag case of test_for6 checks for.
+  struct InitTag {};
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const InitTag &, const int i, const int j, const int k, const int l, const int m, const int n ) const
+  {
+    input_view( i, j, k, l, m, n ) = 3;
+  }
+
+  static void test_for6( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
+  { // Runs md_parallel_for over a rank-6 range with eight MDRangePolicy variants and checks every element on the host.
+    using namespace Kokkos::Experimental;
+    // Case 1: Rank<6> with no tile argument; every element is expected to be 1.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6> > range_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } } );
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 ); // each case builds its own functor, and with it a new input_view
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view ); // h_view is what the loops below inspect
+
+      int counter = 0; // number of elements that differ from the expected value
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + No Tile: Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 2: InitTag work tag with explicit tiles { 3, 3, 3, 3, 2, 3 }; every element is expected to be 3.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int>, InitTag > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); // tile dims 3,3,3,3,3,3 are more than cuda can handle with debugging
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 3 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 3: Rank<6> with IndexType<int> and tiles { 4, 4, 4, 2, 2, 2 }; expected value is 1 (as in all remaining cases).
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 4: same tiles, with Iterate::Default spelled out for both Rank iteration parameters.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Default, Iterate::Default>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 5: Rank<6, Iterate::Left, Iterate::Left>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Left, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 6: Rank<6, Iterate::Left, Iterate::Right>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Left, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 7: Rank<6, Iterate::Right, Iterate::Left>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Right, Iterate::Left>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+    // Case 8: Rank<6, Iterate::Right, Iterate::Right>.
+    {
+      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6, Iterate::Right, Iterate::Right>, Kokkos::IndexType<int> > range_type;
+      typedef typename range_type::tile_type tile_type;
+      typedef typename range_type::point_type point_type;
+
+      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 4, 4, 4, 2, 2, 2 } } );
+
+      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
+
+      md_parallel_for( range, functor );
+
+      HostViewType h_view = Kokkos::create_mirror_view( functor.input_view );
+      Kokkos::deep_copy( h_view, functor.input_view );
+
+      int counter = 0;
+      for ( int i = 0; i < N0; ++i )
+      for ( int j = 0; j < N1; ++j )
+      for ( int k = 0; k < N2; ++k )
+      for ( int l = 0; l < N3; ++l )
+      for ( int m = 0; m < N4; ++m )
+      for ( int n = 0; n < N5; ++n )
+      {
+        if ( h_view( i, j, k, l, m, n ) != 1 ) {
+          ++counter;
+        }
+      }
+
+      if ( counter != 0 ) {
+        printf( " Errors in test_for6; mismatches = %d\n\n", counter );
+      }
+
+      ASSERT_EQ( counter, 0 );
+    }
+  }
+};
 
-/*--------------------------------------------------------------------------*/
+} // namespace
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
index 868e64e9da5e46ee0d06f59736a0f4b20d576ee0..925f0e35ed6d12d3a822daa63421827fe636c86c 100644
--- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp
+++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp
@@ -156,7 +156,7 @@ struct fill_memory {
   void operator()( size_type i ) const
   {
     if ( i % STRIDE == 0 ) {
-      *m_pointers[i / STRIDE].ptr = i / STRIDE ;
+      *m_pointers[i / STRIDE].ptr = i / STRIDE;
     }
   }
 };
@@ -493,12 +493,12 @@ T smallest_power2_ge( T val )
   // Find the most significant nonzero bit.
   int first_nonzero_bit = Kokkos::Impl::bit_scan_reverse( val );
 
-  // If val is an integral power of 2, ceil( log2(val) ) is equal to the
+  // If val is an integral power of 2, ceil( log2( val ) ) is equal to the
   // most significant nonzero bit.  Otherwise, you need to add 1.
   int lg2_size = first_nonzero_bit +
                  !Kokkos::Impl::is_integral_power_of_two( val );
 
-  return T(1) << T(lg2_size);
+  return T( 1 ) << T( lg2_size );
 }
 
 // This test makes allocation requests for multiple sizes and interleaves
@@ -547,7 +547,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   phase1_size = ( ( phase1_size + num_chunk_sizes - 1 ) / num_chunk_sizes ) *
                 num_chunk_sizes;
 
-  // Make sure the phase 2 size is multiples of (2 * num_chunk_sizes).
+  // Make sure the phase 2 size is multiples of ( 2 * num_chunk_sizes ).
   phase2_size =
     ( ( phase2_size + 2 * num_chunk_sizes - 1 ) / ( 2 * num_chunk_sizes ) ) *
     2 * num_chunk_sizes;
@@ -567,7 +567,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // each chunk size.
   work_view phase1_work( "Phase 1 Work", phase1_size );
   typename work_view::HostMirror host_phase1_work =
-    create_mirror_view(phase1_work);
+    create_mirror_view( phase1_work );
 
   size_t inner_size = phase1_size / num_chunk_sizes;
   unsigned chunk_size = base_chunk_size;
@@ -589,7 +589,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // deallocations with an equal number of allocations for each chunk size.
   work_view phase2_work( "Phase 2 Work", phase2_size );
   typename work_view::HostMirror host_phase2_work =
-    create_mirror_view(phase2_work);
+    create_mirror_view( phase2_work );
 
   inner_size = half_phase2_size / num_chunk_sizes;
   chunk_size = base_chunk_size;
@@ -614,7 +614,7 @@ void test_mempool2( unsigned base_chunk_size, size_t num_chunk_sizes,
   // Initialize the phase 3 work view with all deallocations.
   work_view phase3_work( "Phase 3 Work", phase3_size );
   typename work_view::HostMirror host_phase3_work =
-    create_mirror_view(phase3_work);
+    create_mirror_view( phase3_work );
 
   inner_size = phase3_size / num_chunk_sizes;
 
diff --git a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
index 1bb45481c9b76d6dde29ff9e9d192d5ae4531829..6f2ca6a61c34b84f96cefd1195a6a11e2a6d32d1 100644
--- a/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
+++ b/lib/kokkos/core/unit_test/TestPolicyConstruction.hpp
@@ -48,7 +48,7 @@
 #include <sstream>
 #include <iostream>
 
-struct SomeTag{};
+struct SomeTag {}; // Empty tag type; passed as a RangePolicy template argument below and expected back as work_tag.
 
 template< class ExecutionSpace >
 class TestRangePolicyConstruction {
@@ -56,179 +56,194 @@ public:
   TestRangePolicyConstruction() {
     test_compile_time_parameters();
   }
+
 private:
   void test_compile_time_parameters() {
     {
       Kokkos::Impl::expand_variadic();
-      Kokkos::Impl::expand_variadic(1,2,3);
+      Kokkos::Impl::expand_variadic( 1, 2, 3 );
     }
+
     {
       typedef Kokkos::RangePolicy<> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::RangePolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::RangePolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::RangePolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
   }
 };
@@ -240,258 +255,274 @@ public:
     test_compile_time_parameters();
     test_run_time_parameters();
   }
+
 private:
   void test_compile_time_parameters() {
     {
       typedef Kokkos::TeamPolicy<> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace> policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Static>    >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Static>    >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,ExecutionSpace,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, ExecutionSpace, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,ExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, ExecutionSpace                      >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace                      >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,typename execution_space::size_type >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace        >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      typename execution_space::size_type >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,void                                >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::IndexType<long>, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        void                                >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,SomeTag > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, SomeTag > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
+
     {
-      typedef Kokkos::TeamPolicy<SomeTag,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > policy_t;
-      typedef typename policy_t::execution_space execution_space;
-      typedef typename policy_t::index_type      index_type;
-      typedef typename policy_t::schedule_type   schedule_type;
-      typedef typename policy_t::work_tag        work_tag;
-
-      ASSERT_TRUE((std::is_same<execution_space ,Kokkos::DefaultExecutionSpace       >::value));
-      ASSERT_TRUE((std::is_same<index_type      ,long                                >::value));
-      ASSERT_TRUE((std::is_same<schedule_type   ,Kokkos::Schedule<Kokkos::Dynamic>   >::value));
-      ASSERT_TRUE((std::is_same<work_tag        ,SomeTag                             >::value));
+      typedef Kokkos::TeamPolicy< SomeTag, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > policy_t;
+      typedef typename policy_t::execution_space  execution_space;
+      typedef typename policy_t::index_type       index_type;
+      typedef typename policy_t::schedule_type    schedule_type;
+      typedef typename policy_t::work_tag         work_tag;
+
+      ASSERT_TRUE( ( std::is_same< execution_space, Kokkos::DefaultExecutionSpace       >::value ) );
+      ASSERT_TRUE( ( std::is_same< index_type,      long                                >::value ) );
+      ASSERT_TRUE( ( std::is_same< schedule_type,   Kokkos::Schedule<Kokkos::Dynamic>   >::value ) );
+      ASSERT_TRUE( ( std::is_same< work_tag,        SomeTag                             >::value ) );
     }
   }
 
 
-  template<class policy_t>
+  template< class policy_t >
   void test_run_time_parameters_type() {
     int league_size = 131;
-    int team_size = 4<policy_t::execution_space::concurrency()?4:policy_t::execution_space::concurrency();
+    int team_size = 4 < policy_t::execution_space::concurrency() ? 4 : policy_t::execution_space::concurrency();
     int chunk_size = 4;
     int per_team_scratch = 1024;
     int per_thread_scratch = 16;
-    int scratch_size = per_team_scratch + per_thread_scratch*team_size;
-    policy_t p1(league_size,team_size);
-    ASSERT_EQ  (p1.league_size() , league_size);
-    ASSERT_EQ  (p1.team_size()   , team_size);
-    ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(0), 0);
-
-    policy_t p2 = p1.set_chunk_size(chunk_size);
-    ASSERT_EQ  (p1.league_size() , league_size);
-    ASSERT_EQ  (p1.team_size()   , team_size);
-    ASSERT_TRUE(p1.chunk_size()  > 0);
-    ASSERT_EQ  (p1.scratch_size(0), 0);
-
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-
-    policy_t p3 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p3.league_size() , league_size);
-    ASSERT_EQ  (p3.team_size()   , team_size);
-    ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
-
-    policy_t p4 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p4.league_size() , league_size);
-    ASSERT_EQ  (p4.team_size()   , team_size);
-    ASSERT_EQ  (p4.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p4.scratch_size(0), per_thread_scratch*team_size);
-
-    policy_t p5 = p2.set_scratch_size(0,Kokkos::PerThread(per_thread_scratch),Kokkos::PerTeam(per_team_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p5.league_size() , league_size);
-    ASSERT_EQ  (p5.team_size()   , team_size);
-    ASSERT_EQ  (p5.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p5.scratch_size(0), scratch_size);
-
-    policy_t p6 = p2.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p2.league_size() , league_size);
-    ASSERT_EQ  (p2.team_size()   , team_size);
-    ASSERT_EQ  (p2.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p2.scratch_size(0), 0);
-    ASSERT_EQ  (p6.league_size() , league_size);
-    ASSERT_EQ  (p6.team_size()   , team_size);
-    ASSERT_EQ  (p6.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p6.scratch_size(0), scratch_size);
-
-    policy_t p7 = p3.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch),Kokkos::PerThread(per_thread_scratch));
-    ASSERT_EQ  (p3.league_size() , league_size);
-    ASSERT_EQ  (p3.team_size()   , team_size);
-    ASSERT_EQ  (p3.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p3.scratch_size(0), per_team_scratch);
-    ASSERT_EQ  (p7.league_size() , league_size);
-    ASSERT_EQ  (p7.team_size()   , team_size);
-    ASSERT_EQ  (p7.chunk_size()  , chunk_size);
-    ASSERT_EQ  (p7.scratch_size(0), scratch_size);
-}
+    int scratch_size = per_team_scratch + per_thread_scratch * team_size;
+
+    policy_t p1( league_size, team_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    policy_t p2 = p1.set_chunk_size( chunk_size );
+    ASSERT_EQ  ( p1.league_size(),     league_size                    );
+    ASSERT_EQ  ( p1.team_size(),       team_size                      );
+    ASSERT_TRUE( p1.chunk_size()  > 0                                 );
+    ASSERT_EQ  ( p1.scratch_size( 0 ), 0                              );
+
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+
+    policy_t p3 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+
+    policy_t p4 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p4.league_size(),     league_size                    );
+    ASSERT_EQ  ( p4.team_size(),       team_size                      );
+    ASSERT_EQ  ( p4.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p4.scratch_size( 0 ), per_thread_scratch * team_size );
+
+    policy_t p5 = p2.set_scratch_size( 0, Kokkos::PerThread( per_thread_scratch ), Kokkos::PerTeam( per_team_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p5.league_size(),     league_size                    );
+    ASSERT_EQ  ( p5.team_size(),       team_size                      );
+    ASSERT_EQ  ( p5.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p5.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p6 = p2.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p2.league_size(),     league_size                    );
+    ASSERT_EQ  ( p2.team_size(),       team_size                      );
+    ASSERT_EQ  ( p2.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p2.scratch_size( 0 ), 0                              );
+    ASSERT_EQ  ( p6.league_size(),     league_size                    );
+    ASSERT_EQ  ( p6.team_size(),       team_size                      );
+    ASSERT_EQ  ( p6.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p6.scratch_size( 0 ), scratch_size                   );
+
+    policy_t p7 = p3.set_scratch_size( 0, Kokkos::PerTeam( per_team_scratch ), Kokkos::PerThread( per_thread_scratch ) );
+    ASSERT_EQ  ( p3.league_size(),     league_size                    );
+    ASSERT_EQ  ( p3.team_size(),       team_size                      );
+    ASSERT_EQ  ( p3.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p3.scratch_size( 0 ), per_team_scratch               );
+    ASSERT_EQ  ( p7.league_size(),     league_size                    );
+    ASSERT_EQ  ( p7.team_size(),       team_size                      );
+    ASSERT_EQ  ( p7.chunk_size(),      chunk_size                     );
+    ASSERT_EQ  ( p7.scratch_size( 0 ), scratch_size                   );
+  }
+
   void test_run_time_parameters() {
-    test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace> >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<ExecutionSpace,Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long> > >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > >();
-    test_run_time_parameters_type<Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,Kokkos::IndexType<long>,ExecutionSpace,SomeTag > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace> >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::IndexType<long>, ExecutionSpace, Kokkos::Schedule<Kokkos::Dynamic> > >();
+    test_run_time_parameters_type< Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>, Kokkos::IndexType<long>, ExecutionSpace, SomeTag > >();
   }
 };
diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp
deleted file mode 100644
index a465f39ca8ab428b72b68c103ec3989c92fb670f..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestQthread.cpp
+++ /dev/null
@@ -1,287 +0,0 @@
-/*
-//@HEADER
-// ************************************************************************
-//
-//                        Kokkos v. 2.0
-//              Copyright (2014) Sandia Corporation
-//
-// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
-// the U.S. Government retains certain rights in this software.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// 1. Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright
-// notice, this list of conditions and the following disclaimer in the
-// documentation and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the Corporation nor the names of the
-// contributors may be used to endorse or promote products derived from
-// this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
-// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
-// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-//
-// ************************************************************************
-//@HEADER
-*/
-
-#include <gtest/gtest.h>
-
-#include <Kokkos_Core.hpp>
-#include <Kokkos_Qthread.hpp>
-
-//----------------------------------------------------------------------------
-
-#include <TestAtomic.hpp>
-
-#include <TestViewAPI.hpp>
-#include <TestViewOfClass.hpp>
-
-#include <TestTeam.hpp>
-#include <TestRange.hpp>
-#include <TestReduce.hpp>
-#include <TestScan.hpp>
-#include <TestAggregate.hpp>
-#include <TestCompilerMacros.hpp>
-#include <TestTaskScheduler.hpp>
-// #include <TestTeamVector.hpp>
-
-namespace Test {
-
-class qthread : public ::testing::Test {
-protected:
-  static void SetUpTestCase()
-  {
-    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
-    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
-    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
-
-    int threads_count = std::max( 1u , numa_count )
-                      * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
-    Kokkos::Qthread::initialize( threads_count );
-    Kokkos::Qthread::print_configuration( std::cout , true );
-  }
-
-  static void TearDownTestCase()
-  {
-    Kokkos::Qthread::finalize();
-  }
-};
-
-TEST_F( qthread , compiler_macros )
-{
-  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthread >() ) );
-}
-
-TEST_F( qthread, view_impl) {
-  test_view_impl< Kokkos::Qthread >();
-}
-
-TEST_F( qthread, view_api) {
-  TestViewAPI< double , Kokkos::Qthread >();
-}
-
-TEST_F( qthread , view_nested_view )
-{
-  ::Test::view_nested_view< Kokkos::Qthread >();
-}
-
-TEST_F( qthread , range_tag )
-{
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-}
-
-TEST_F( qthread , team_tag )
-{
-  TestTeamPolicy< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
-  TestTeamPolicy< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
-}
-
-TEST_F( qthread, long_reduce) {
-  TestReduce< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, double_reduce) {
-  TestReduce< double ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Qthread >( 1000000 );
-}
-
-TEST_F( qthread, team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 );
-}
-
-TEST_F( qthread, team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 1000000 );
-}
-
-
-TEST_F( qthread , atomics )
-{
-  const int loop_count = 1e4 ;
-
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,3) ) );
-
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,3) ) );
-
-#if defined( KOKKOS_ENABLE_ASM )
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,3) ) );
-#endif
-
-}
-
-TEST_F( qthread , view_remap )
-{
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Qthread > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Qthread > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Qthread > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , view_aggregate )
-{
-  TestViewAggregate< Kokkos::Qthread >();
-}
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , scan )
-{
-  TestScan< Kokkos::Qthread >::test_range( 1 , 1000 );
-  TestScan< Kokkos::Qthread >( 1000000 );
-  TestScan< Kokkos::Qthread >( 10000000 );
-  Kokkos::Qthread::fence();
-}
-
-TEST_F( qthread, team_shared ) {
-  TestSharedTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >();
-}
-
-TEST_F( qthread, shmem_size) {
-  TestShmemSize< Kokkos::Qthread >();
-}
-
-TEST_F( qthread , team_scan )
-{
-  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Qthread , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-}
-
-#if 0 /* disable */
-TEST_F( qthread , team_vector )
-{
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(4) ) );
-}
-#endif
-
-//----------------------------------------------------------------------------
-
-TEST_F( qthread , task_policy )
-{
-  TestTaskScheduler::test_task_dep< Kokkos::Qthread >( 10 );
-  for ( long i = 0 ; i < 25 ; ++i ) TestTaskScheduler::test_fib< Kokkos::Qthread >(i);
-  for ( long i = 0 ; i < 35 ; ++i ) TestTaskScheduler::test_fib2< Kokkos::Qthread >(i);
-}
-
-TEST_F( qthread , task_team )
-{
-  TestTaskScheduler::test_task_team< Kokkos::Qthread >(1000);
-}
-
-//----------------------------------------------------------------------------
-
-} // namespace test
-
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
index e342e844c7665650732a38e49063abee626a4a8c..90411a57a0c9c871f946dd3a8b04b4af0554b380 100644
--- a/lib/kokkos/core/unit_test/TestRange.hpp
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -45,198 +45,204 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestRange {
+  typedef int value_type; ///< typedef required for the parallel_reduce
 
-  typedef int value_type ; ///< typedef required for the parallel_reduce
-
-  typedef Kokkos::View<int*,ExecSpace> view_type ;
+  typedef Kokkos::View< int*, ExecSpace > view_type;
 
-  view_type m_flags ;
+  view_type m_flags;
 
   struct VerifyInitTag {};
   struct ResetTag {};
   struct VerifyResetTag {};
 
   TestRange( const size_t N )
-    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags"), N )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ), N )
     {}
 
   static void test_for( const size_t N )
-    {
-      TestRange functor(N);
+  {
+    TestRange functor( N );
 
-      typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
+    typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyInitTag>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyInitTag >( 0, N ), functor );
 
-      Kokkos::deep_copy( host_flags , functor.m_flags );
+    Kokkos::deep_copy( host_flags, functor.m_flags );
 
-      size_t error_count = 0 ;
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        if ( int(i) != host_flags(i) ) ++error_count ;
-      }
-      ASSERT_EQ( error_count , size_t(0) );
+    size_t error_count = 0;
+    for ( size_t i = 0; i < N; ++i ) {
+      if ( int( i ) != host_flags( i ) ) ++error_count;
+    }
+    ASSERT_EQ( error_count, size_t( 0 ) );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType,ResetTag>(0,N) , functor );
-      Kokkos::parallel_for( std::string("TestKernelFor") , Kokkos::RangePolicy<ExecSpace,ScheduleType,VerifyResetTag>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType, ResetTag >( 0, N ), functor );
+    Kokkos::parallel_for( std::string( "TestKernelFor" ), Kokkos::RangePolicy< ExecSpace, ScheduleType, VerifyResetTag >( 0, N ), functor );
 
-      Kokkos::deep_copy( host_flags , functor.m_flags );
+    Kokkos::deep_copy( host_flags, functor.m_flags );
 
-      error_count = 0 ;
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        if ( int(2*i) != host_flags(i) ) ++error_count ;
-      }
-      ASSERT_EQ( error_count , size_t(0) );
+    error_count = 0;
+    for ( size_t i = 0; i < N; ++i ) {
+      if ( int( 2 * i ) != host_flags( i ) ) ++error_count;
     }
+    ASSERT_EQ( error_count, size_t( 0 ) );
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const int i ) const
-    { m_flags(i) = i ; }
+  { m_flags( i )  = i; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyInitTag & , const int i ) const
-    { if ( i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+  void operator()( const VerifyInitTag &, const int i ) const
+  {
+    if ( i != m_flags( i ) ) {
+      printf( "TestRange::test_for error at %d != %d\n", i, m_flags( i ) );
+    }
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const ResetTag & , const int i ) const
-    { m_flags(i) = 2 * m_flags(i); }
+  void operator()( const ResetTag &, const int i ) const
+  { m_flags( i ) = 2 * m_flags( i ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyResetTag & , const int i ) const
-    { if ( 2 * i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+  void operator()( const VerifyResetTag &, const int i ) const
+  {
+    if ( 2 * i != m_flags( i ) )
+    {
+      printf( "TestRange::test_for error at %d != %d\n", i, m_flags( i ) );
+    }
+  }
 
   //----------------------------------------
 
   struct OffsetTag {};
 
   static void test_reduce( const size_t N )
-    {
-      TestRange functor(N);
-      int total = 0 ;
+  {
+    TestRange functor( N );
+    int total = 0;
 
-      Kokkos::parallel_for(    Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
 
-      Kokkos::parallel_reduce( "TestKernelReduce" , Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor , total );
-      // sum( 0 .. N-1 )
-      ASSERT_EQ( size_t((N-1)*(N)/2) , size_t(total) );
+    Kokkos::parallel_reduce( "TestKernelReduce", Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor, total );
+    // sum( 0 .. N-1 )
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) / 2 ), size_t( total ) );
 
-      Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor , total );
-      // sum( 1 .. N )
-      ASSERT_EQ( size_t((N)*(N+1)/2) , size_t(total) );
-    }
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), functor, total );
+    // sum( 1 .. N )
+    ASSERT_EQ( size_t( ( N ) * ( N + 1 ) / 2 ), size_t( total ) );
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int i , value_type & update ) const
-    { update += m_flags(i); }
+  void operator()( const int i, value_type & update ) const
+  { update += m_flags( i ); }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const OffsetTag & , const int i , value_type & update ) const
-    { update += 1 + m_flags(i); }
+  void operator()( const OffsetTag &, const int i, value_type & update ) const
+  { update += 1 + m_flags( i ); }
 
   //----------------------------------------
 
   static void test_scan( const size_t N )
-    {
-      TestRange functor(N);
+  {
+    TestRange functor( N );
 
-      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ScheduleType>(0,N) , functor );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, ScheduleType >( 0, N ), functor );
 
-      Kokkos::parallel_scan( "TestKernelScan" , Kokkos::RangePolicy<ExecSpace,ScheduleType,OffsetTag>(0,N) , functor );
-    }
+    Kokkos::parallel_scan( "TestKernelScan", Kokkos::RangePolicy< ExecSpace, ScheduleType, OffsetTag>( 0, N ), functor );
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const OffsetTag & , const int i , value_type & update , bool final ) const
-    {
-      update += m_flags(i);
+  void operator()( const OffsetTag &, const int i, value_type & update, bool final ) const
+  {
+    update += m_flags( i ); // Running total now includes entry i.
 
-      if ( final ) {
-        if ( update != (i*(i+1))/2 ) {
-          printf("TestRange::test_scan error %d : %d != %d\n",i,(i*(i+1))/2,m_flags(i));
-        }
+    if ( final ) {
+      if ( update != ( i * ( i + 1 ) ) / 2 ) {
+        printf( "TestRange::test_scan error %d : %d != %d\n", i, ( i * ( i + 1 ) ) / 2, update ); // Print the value that was actually compared.
       }
     }
+  }
 
-  static void test_dynamic_policy( const size_t N ) {
-
-
-    typedef Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
+  static void test_dynamic_policy( const size_t N )
+  {
+    typedef Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> > policy_t;
 
     {
-      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
-      Kokkos::View<int*,ExecSpace> a("A",N);
-
-      Kokkos::parallel_for( policy_t(0,N),
-          KOKKOS_LAMBDA (const typename policy_t::member_type& i) {
-        for(int k=0; k<(i<N/2?1:10000); k++ )
-          a(i)++;
-        count(ExecSpace::hardware_thread_id())++;
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace > a( "A", N );
+
+      Kokkos::parallel_for( policy_t( 0, N ), KOKKOS_LAMBDA ( const typename policy_t::member_type& i ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
       });
 
       int error = 0;
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
-        lsum += ( a(i)!= (i<N/2?1:10000) );
-      },error);
-      ASSERT_EQ(error,0);
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
 
-      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<size_t>(4*ExecSpace::concurrency())) ) {
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<size_t>( 4 * ExecSpace::concurrency() ) ) ) {
         size_t min = N;
         size_t max = 0;
-        for(int t=0; t<ExecSpace::concurrency(); t++) {
-          if(count(t)<min) min = count(t);
-          if(count(t)>max) max = count(t);
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
         }
-        ASSERT_TRUE(min<max);
-        //if(ExecSpace::concurrency()>2)
-        //  ASSERT_TRUE(2*min<max);
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
       }
-      
     }
 
     {
-      Kokkos::View<size_t*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > count("Count",ExecSpace::concurrency());
-      Kokkos::View<int*,ExecSpace> a("A",N);
+      Kokkos::View< size_t*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > count( "Count", ExecSpace::concurrency() );
+      Kokkos::View< int*, ExecSpace> a( "A", N );
 
       int sum = 0;
-      Kokkos::parallel_reduce( policy_t(0,N),
-          KOKKOS_LAMBDA (const typename policy_t::member_type& i, int& lsum) {
-        for(int k=0; k<(i<N/2?1:10000); k++ )
-          a(i)++;
-        count(ExecSpace::hardware_thread_id())++;
+      Kokkos::parallel_reduce( policy_t( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        for ( int k = 0; k < ( i < N / 2 ? 1 : 10000 ); k++ ) {
+          a( i )++;
+        }
+        count( ExecSpace::hardware_thread_id() )++;
         lsum++;
-      },sum);
-      ASSERT_EQ(sum,N);
+      }, sum );
+      ASSERT_EQ( sum, N );
 
       int error = 0;
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N), KOKKOS_LAMBDA(const typename policy_t::member_type& i, int& lsum) {
-        lsum += ( a(i)!= (i<N/2?1:10000) );
-      },error);
-      ASSERT_EQ(error,0);
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), KOKKOS_LAMBDA( const typename policy_t::member_type & i, int & lsum ) {
+        lsum += ( a( i ) != ( i < N / 2 ? 1 : 10000 ) );
+      }, error );
+      ASSERT_EQ( error, 0 );
 
-      if( ( ExecSpace::concurrency()>(int)1) && (N>static_cast<size_t>(4*ExecSpace::concurrency())) ) {
+      if ( ( ExecSpace::concurrency() > (int) 1 ) && ( N > static_cast<size_t>( 4 * ExecSpace::concurrency() ) ) ) {
         size_t min = N;
         size_t max = 0;
-        for(int t=0; t<ExecSpace::concurrency(); t++) {
-          if(count(t)<min) min = count(t);
-          if(count(t)>max) max = count(t);
+        for ( int t = 0; t < ExecSpace::concurrency(); t++ ) {
+          if ( count( t ) < min ) min = count( t );
+          if ( count( t ) > max ) max = count( t );
         }
-        ASSERT_TRUE(min<max);
-        //if(ExecSpace::concurrency()>2)
-        //  ASSERT_TRUE(2*min<max);
+        ASSERT_TRUE( min < max );
+
+        //if ( ExecSpace::concurrency() > 2 ) {
+        //  ASSERT_TRUE( 2 * min < max );
+        //}
       }
     }
-
   }
 };
 
-} /* namespace */
-} /* namespace Test */
-
-/*--------------------------------------------------------------------------*/
+} // namespace
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
index 645fc9e31b3b1cf86d06779304343cc93cc2242a..7e77dadf6249fe3eaa763c0c9848b93965379e7e 100644
--- a/lib/kokkos/core/unit_test/TestReduce.hpp
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -48,24 +48,23 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class ReduceFunctor
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   struct value_type {
-    ScalarType value[3] ;
+    ScalarType value[3];
   };
 
-  const size_type nwork ;
+  const size_type nwork;
 
-  ReduceFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+  ReduceFunctor( const size_type & arg_nwork )
+    : nwork( arg_nwork ) {}
 
   ReduceFunctor( const ReduceFunctor & rhs )
     : nwork( rhs.nwork ) {}
@@ -74,66 +73,63 @@ public:
   KOKKOS_INLINE_FUNCTION
   void init( value_type & dst ) const
   {
-    dst.value[0] = 0 ;
-    dst.value[1] = 0 ;
-    dst.value[2] = 0 ;
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
   }
 */
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile value_type & dst ,
+  void join( volatile value_type & dst,
              const volatile value_type & src ) const
   {
-    dst.value[0] += src.value[0] ;
-    dst.value[1] += src.value[1] ;
-    dst.value[2] += src.value[2] ;
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , value_type & dst ) const
+  void operator()( size_type iwork, value_type & dst ) const
   {
-    dst.value[0] += 1 ;
-    dst.value[1] += iwork + 1 ;
-    dst.value[2] += nwork - iwork ;
+    dst.value[0] += 1;
+    dst.value[1] += iwork + 1;
+    dst.value[2] += nwork - iwork;
   }
 };
 
 template< class DeviceType >
-class ReduceFunctorFinal : public ReduceFunctor< long , DeviceType > {
+class ReduceFunctorFinal : public ReduceFunctor< long, DeviceType > {
 public:
-
-  typedef typename ReduceFunctor< long , DeviceType >::value_type value_type ;
+  typedef typename ReduceFunctor< long, DeviceType >::value_type value_type;
 
   ReduceFunctorFinal( const size_t n )
-    : ReduceFunctor<long,DeviceType>(n)
-    {}
+    : ReduceFunctor< long, DeviceType >( n ) {}
 
   KOKKOS_INLINE_FUNCTION
   void final( value_type & dst ) const
   {
-    dst.value[0] = - dst.value[0] ;
-    dst.value[1] = - dst.value[1] ;
-    dst.value[2] = - dst.value[2] ;
+    dst.value[0] = -dst.value[0];
+    dst.value[1] = -dst.value[1];
+    dst.value[2] = -dst.value[2];
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class RuntimeReduceFunctor
 {
 public:
   // Required for functor:
-  typedef DeviceType  execution_space ;
-  typedef ScalarType  value_type[] ;
-  const unsigned      value_count ;
-
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
 
   // Unit test details:
 
-  typedef typename execution_space::size_type  size_type ;
+  typedef typename execution_space::size_type size_type;
 
-  const size_type     nwork ;
+  const size_type     nwork;
 
-  RuntimeReduceFunctor( const size_type arg_nwork ,
+  RuntimeReduceFunctor( const size_type arg_nwork,
                         const size_type arg_count )
     : value_count( arg_count )
     , nwork( arg_nwork ) {}
@@ -141,247 +137,251 @@ public:
   KOKKOS_INLINE_FUNCTION
   void init( ScalarType dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile ScalarType dst[] ,
+  void join( volatile ScalarType dst[],
              const volatile ScalarType src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , ScalarType dst[] ) const
+  void operator()( size_type iwork, ScalarType dst[] ) const
   {
-    const size_type tmp[3] = { 1 , iwork + 1 , nwork - iwork };
+    const size_type tmp[3] = { 1, iwork + 1, nwork - iwork };
 
-    for ( size_type i = 0 ; i < value_count ; ++i ) {
+    for ( size_type i = 0; i < value_count; ++i ) {
       dst[i] += tmp[ i % 3 ];
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class RuntimeReduceMinMax
 {
 public:
   // Required for functor:
-  typedef DeviceType  execution_space ;
-  typedef ScalarType  value_type[] ;
-  const unsigned      value_count ;
+  typedef DeviceType  execution_space;
+  typedef ScalarType  value_type[];
+  const unsigned      value_count;
 
   // Unit test details:
 
-  typedef typename execution_space::size_type  size_type ;
+  typedef typename execution_space::size_type size_type;
 
-  const size_type     nwork ;
-  const ScalarType    amin ;
-  const ScalarType    amax ;
+  const size_type     nwork;
+  const ScalarType    amin;
+  const ScalarType    amax;
 
-  RuntimeReduceMinMax( const size_type arg_nwork ,
+  RuntimeReduceMinMax( const size_type arg_nwork,
                        const size_type arg_count )
     : value_count( arg_count )
     , nwork( arg_nwork )
-    , amin( std::numeric_limits<ScalarType>::min() )
-    , amax( std::numeric_limits<ScalarType>::max() )
+    , amin( std::numeric_limits< ScalarType >::min() )
+    , amax( std::numeric_limits< ScalarType >::max() )
     {}
 
   KOKKOS_INLINE_FUNCTION
   void init( ScalarType dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) {
-      dst[i] = i % 2 ? amax : amin ;
+    for ( unsigned i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? amax : amin;
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile ScalarType dst[] ,
+  void join( volatile ScalarType dst[],
              const volatile ScalarType src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) {
+    for ( unsigned i = 0; i < value_count; ++i ) {
       dst[i] = i % 2 ? ( dst[i] < src[i] ? dst[i] : src[i] )  // min
                      : ( dst[i] > src[i] ? dst[i] : src[i] ); // max
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_type iwork , ScalarType dst[] ) const
+  void operator()( size_type iwork, ScalarType dst[] ) const
   {
-    const ScalarType tmp[2] = { ScalarType(iwork + 1)
-                              , ScalarType(nwork - iwork) };
+    const ScalarType tmp[2] = { ScalarType( iwork + 1 )
+                              , ScalarType( nwork - iwork ) };
 
-    for ( size_type i = 0 ; i < value_count ; ++i ) {
-      dst[i] = i % 2 ? ( dst[i] < tmp[i%2] ? dst[i] : tmp[i%2] )
-                     : ( dst[i] > tmp[i%2] ? dst[i] : tmp[i%2] );
+    for ( size_type i = 0; i < value_count; ++i ) {
+      dst[i] = i % 2 ? ( dst[i] < tmp[i % 2] ? dst[i] : tmp[i % 2] )
+                     : ( dst[i] > tmp[i % 2] ? dst[i] : tmp[i % 2] );
     }
   }
 };
 
 template< class DeviceType >
-class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long , DeviceType > {
+class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long, DeviceType > {
 public:
+  typedef RuntimeReduceFunctor< long, DeviceType > base_type;
+  typedef typename base_type::value_type value_type;
+  typedef long scalar_type;
 
-  typedef RuntimeReduceFunctor< long , DeviceType > base_type ;
-  typedef typename base_type::value_type value_type ;
-  typedef long scalar_type ;
-
-  RuntimeReduceFunctorFinal( const size_t theNwork , const size_t count ) : base_type(theNwork,count) {}
+  RuntimeReduceFunctorFinal( const size_t theNwork, const size_t count )
+    : base_type( theNwork, count ) {}
 
   KOKKOS_INLINE_FUNCTION
   void final( value_type dst ) const
   {
-    for ( unsigned i = 0 ; i < base_type::value_count ; ++i ) {
-      dst[i] = - dst[i] ;
+    for ( unsigned i = 0; i < base_type::value_count; ++i ) {
+      dst[i] = -dst[i];
     }
   }
 };
+
 } // namespace Test
 
 namespace {
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduce
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduce( const size_type & nwork )
   {
-    run_test(nwork);
-    run_test_final(nwork);
+    run_test( nwork );
+    run_test_final( nwork );
   }
 
   void run_test( const size_type & nwork )
   {
-    typedef Test::ReduceFunctor< ScalarType , execution_space > functor_type ;
-    typedef typename functor_type::value_type value_type ;
+    typedef Test::ReduceFunctor< ScalarType, execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
       }
     }
   }
 
   void run_test_final( const size_type & nwork )
   {
-    typedef Test::ReduceFunctorFinal< execution_space > functor_type ;
-    typedef typename functor_type::value_type value_type ;
+    typedef Test::ReduceFunctorFinal< execution_space > functor_type;
+    typedef typename functor_type::value_type value_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , - result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i].value[j] );
       }
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduceDynamic
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduceDynamic( const size_type nwork )
   {
-    run_test_dynamic(nwork);
-    run_test_dynamic_minmax(nwork);
-    run_test_dynamic_final(nwork);
+    run_test_dynamic( nwork );
+    run_test_dynamic_minmax( nwork );
+    run_test_dynamic_final( nwork );
   }
 
   void run_test_dynamic( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
-    ScalarType result[ Repeat ][ Count ] ;
+    ScalarType result[ Repeat ][ Count ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i][j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i][j] );
       }
     }
   }
 
   void run_test_dynamic_minmax( const size_type nwork )
   {
-    typedef Test::RuntimeReduceMinMax< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceMinMax< ScalarType, execution_space > functor_type;
 
     enum { Count = 2 };
     enum { Repeat = 100 };
 
-    ScalarType result[ Repeat ][ Count ] ;
+    ScalarType result[ Repeat ][ Count ];
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "Reduce", nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "Reduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
         if ( nwork == 0 )
         {
-          ScalarType amin( std::numeric_limits<ScalarType>::min() );
-          ScalarType amax( std::numeric_limits<ScalarType>::max() );
-          const ScalarType correct = (j%2) ? amax : amin;
-          ASSERT_EQ( (ScalarType) correct , result[i][j] );
-        } else {
-          const unsigned long correct = j % 2 ? 1 : nwork ;
-          ASSERT_EQ( (ScalarType) correct , result[i][j] );
+          ScalarType amin( std::numeric_limits< ScalarType >::min() );
+          ScalarType amax( std::numeric_limits< ScalarType >::max() );
+          const ScalarType correct = ( j % 2 ) ? amax : amin;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
+        }
+        else {
+          const unsigned long correct = j % 2 ? 1 : nwork;
+          ASSERT_EQ( (ScalarType) correct, result[i][j] );
         }
       }
     }
@@ -389,169 +389,172 @@ public:
 
   void run_test_dynamic_final( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
-    typename functor_type::scalar_type result[ Repeat ][ Count ] ;
+    typename functor_type::scalar_type result[ Repeat ][ Count ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      if(i%2==0)
-        Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
-      else
-        Kokkos::parallel_reduce( "TestKernelReduce" , nwork , functor_type(nwork,Count) , result[i] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      if ( i % 2 == 0 ) {
+        Kokkos::parallel_reduce( nwork, functor_type( nwork, Count ), result[i] );
+      }
+      else {
+        Kokkos::parallel_reduce( "TestKernelReduce", nwork, functor_type( nwork, Count ), result[i] );
+      }
 
     }
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , - result[i][j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, -result[i][j] );
       }
     }
   }
 };
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestReduceDynamicView
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef typename execution_space::size_type size_type ;
-
-  //------------------------------------
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
   TestReduceDynamicView( const size_type nwork )
   {
-    run_test_dynamic_view(nwork);
+    run_test_dynamic_view( nwork );
   }
 
   void run_test_dynamic_view( const size_type nwork )
   {
-    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef Test::RuntimeReduceFunctor< ScalarType, execution_space > functor_type;
 
-    typedef Kokkos::View< ScalarType* , DeviceType > result_type ;
-    typedef typename result_type::HostMirror result_host_type ;
+    typedef Kokkos::View< ScalarType*, DeviceType > result_type;
+    typedef typename result_type::HostMirror result_host_type;
 
-    const unsigned CountLimit = 23 ;
+    const unsigned CountLimit = 23;
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    for ( unsigned count = 0 ; count < CountLimit ; ++count ) {
+    for ( unsigned count = 0; count < CountLimit; ++count ) {
 
-      result_type result("result",count);
+      result_type result( "result", count );
       result_host_type host_result = Kokkos::create_mirror( result );
 
       // Test result to host pointer:
 
-      std::string str("TestKernelReduce");
-      if(count%2==0)
-        Kokkos::parallel_reduce( nw , functor_type(nw,count) , host_result.ptr_on_device() );
-      else
-        Kokkos::parallel_reduce( str , nw , functor_type(nw,count) , host_result.ptr_on_device() );
+      std::string str( "TestKernelReduce" );
+      if ( count % 2 == 0 ) {
+        Kokkos::parallel_reduce( nw, functor_type( nw, count ), host_result.ptr_on_device() );
+      }
+      else {
+        Kokkos::parallel_reduce( str, nw, functor_type( nw, count ), host_result.ptr_on_device() );
+      }
 
-      for ( unsigned j = 0 ; j < count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( host_result(j), (ScalarType) correct );
-        host_result(j) = 0 ;
+      for ( unsigned j = 0; j < count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( host_result( j ), (ScalarType) correct );
+        host_result( j ) = 0;
       }
     }
   }
 };
-}
+
+} // namespace
 
 // Computes y^T*A*x
-// (modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
+// ( modified from kokkos-tutorials/GTC2016/Exercises/ThreeLevelPar )
 
 #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA )
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestTripleNestedReduce
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
-  //------------------------------------
-
-  TestTripleNestedReduce( const size_type & nrows , const size_type & ncols
-                        , const size_type & team_size , const size_type & vector_length )
+  TestTripleNestedReduce( const size_type & nrows, const size_type & ncols
+                        , const size_type & team_size, const size_type & vector_length )
   {
-    run_test( nrows , ncols , team_size, vector_length );
+    run_test( nrows, ncols, team_size, vector_length );
   }
 
-  void run_test( const size_type & nrows , const size_type & ncols
+  void run_test( const size_type & nrows, const size_type & ncols
                , const size_type & team_size, const size_type & vector_length )
   {
     //typedef Kokkos::LayoutLeft Layout;
     typedef Kokkos::LayoutRight Layout;
 
-    typedef Kokkos::View<ScalarType* , DeviceType>            ViewVector;
-    typedef Kokkos::View<ScalarType** , Layout , DeviceType>   ViewMatrix;
-    ViewVector y( "y" , nrows );
-    ViewVector x( "x" , ncols );
-    ViewMatrix A( "A" , nrows , ncols );
+    typedef Kokkos::View< ScalarType*, DeviceType >            ViewVector;
+    typedef Kokkos::View< ScalarType**, Layout, DeviceType >   ViewMatrix;
+
+    ViewVector y( "y", nrows );
+    ViewVector x( "x", ncols );
+    ViewMatrix A( "A", nrows, ncols );
 
     typedef Kokkos::RangePolicy<DeviceType> range_policy;
 
-    // Initialize y vector
-    Kokkos::parallel_for( range_policy( 0 , nrows ) , KOKKOS_LAMBDA( const int i ) { y( i ) = 1; } );
+    // Initialize y vector.
+    Kokkos::parallel_for( range_policy( 0, nrows ), KOKKOS_LAMBDA ( const int i ) { y( i ) = 1; } );
 
-    // Initialize x vector
-    Kokkos::parallel_for( range_policy( 0 , ncols ) , KOKKOS_LAMBDA( const int i ) { x( i ) = 1; } );
+    // Initialize x vector.
+    Kokkos::parallel_for( range_policy( 0, ncols ), KOKKOS_LAMBDA ( const int i ) { x( i ) = 1; } );
 
-    typedef Kokkos::TeamPolicy<DeviceType>                        team_policy;
-    typedef typename Kokkos::TeamPolicy<DeviceType>::member_type  member_type;
+    typedef Kokkos::TeamPolicy< DeviceType >                        team_policy;
+    typedef typename Kokkos::TeamPolicy< DeviceType >::member_type  member_type;
 
-    // Initialize A matrix, note 2D indexing computation
-    Kokkos::parallel_for( team_policy( nrows , Kokkos::AUTO ) , KOKKOS_LAMBDA( const member_type& teamMember ) {
+    // Initialize A matrix, note 2D indexing computation.
+    Kokkos::parallel_for( team_policy( nrows, Kokkos::AUTO ), KOKKOS_LAMBDA ( const member_type & teamMember ) {
       const int j = teamMember.league_rank();
-      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , ncols ) , [&] ( const int i ) {
-        A( j , i ) = 1;
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, ncols ), [&] ( const int i ) {
+        A( j, i ) = 1;
       } );
     } );
 
-    // Three level parallelism kernel to force caching of vector x
+    // Three level parallelism kernel to force caching of vector x.
     ScalarType result = 0.0;
     int chunk_size = 128;
-    Kokkos::parallel_reduce( team_policy( nrows/chunk_size , team_size , vector_length ) , KOKKOS_LAMBDA ( const member_type& teamMember , double &update ) {
+    Kokkos::parallel_reduce( team_policy( nrows / chunk_size, team_size, vector_length ),
+                             KOKKOS_LAMBDA ( const member_type & teamMember, double & update ) {
       const int row_start = teamMember.league_rank() * chunk_size;
       const int row_end   = row_start + chunk_size;
-      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember , row_start , row_end ) , [&] ( const int i ) {
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( teamMember, row_start, row_end ), [&] ( const int i ) {
         ScalarType sum_i = 0.0;
-        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember , ncols ) , [&] ( const int j , ScalarType &innerUpdate ) {
-          innerUpdate += A( i , j ) * x( j );
-        } , sum_i );
-        Kokkos::single( Kokkos::PerThread( teamMember ) , [&] () {
+        Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( teamMember, ncols ), [&] ( const int j, ScalarType &innerUpdate ) {
+          innerUpdate += A( i, j ) * x( j );
+        }, sum_i );
+        Kokkos::single( Kokkos::PerThread( teamMember ), [&] () {
           update += y( i ) * sum_i;
         } );
       } );
-    } , result );
+    }, result );
 
-    const ScalarType solution= ( ScalarType ) nrows * ( ScalarType ) ncols;
-    ASSERT_EQ( solution , result );
+    const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols;
+    ASSERT_EQ( solution, result );
   }
 };
 
-#else /* #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA ) */
+#else // #if ( ! defined( KOKKOS_ENABLE_CUDA ) ) || defined( KOKKOS_ENABLE_CUDA_LAMBDA )
 
-template< typename ScalarType , class DeviceType >
+template< typename ScalarType, class DeviceType >
 class TestTripleNestedReduce
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef typename execution_space::size_type size_type ;
+  typedef DeviceType execution_space;
+  typedef typename execution_space::size_type size_type;
 
-  TestTripleNestedReduce( const size_type & , const size_type
-                        , const size_type & , const size_type )
-  { }
+  TestTripleNestedReduce( const size_type &, const size_type
+                        , const size_type &, const size_type )
+  {}
 };
 
 #endif
@@ -559,38 +562,38 @@ public:
 //--------------------------------------------------------------------------
 
 namespace Test {
+
 namespace ReduceCombinatorical {
 
-template<class Scalar,class Space = Kokkos::HostSpace>
+template< class Scalar, class Space = Kokkos::HostSpace >
 struct AddPlus {
 public:
-  //Required
+  // Required.
   typedef AddPlus reducer_type;
   typedef Scalar value_type;
 
-  typedef Kokkos::View<value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+  typedef Kokkos::View< value_type, Space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
 
 private:
   result_view_type result;
 
 public:
+  AddPlus( value_type & result_ ) : result( &result_ ) {}
 
-  AddPlus(value_type& result_):result(&result_) {}
-
-  //Required
+  // Required.
   KOKKOS_INLINE_FUNCTION
-  void join(value_type& dest, const value_type& src)  const {
+  void join( value_type & dest, const value_type & src ) const {
     dest += src + 1;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& dest, const volatile value_type& src) const {
+  void join( volatile value_type & dest, const volatile value_type & src ) const {
     dest += src + 1;
   }
 
-  //Optional
+  // Optional.
   KOKKOS_INLINE_FUNCTION
-  void init( value_type& val)  const {
+  void init( value_type & val )  const {
     val = value_type();
   }
 
@@ -599,624 +602,651 @@ public:
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalar;
 
 template<>
-struct FunctorScalar<0>{
-  FunctorScalar(Kokkos::View<double> r):result(r) {}
-  Kokkos::View<double> result;
+struct FunctorScalar< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i,double& update) const {
-    update+=i;
+  void operator()( const int & i, double & update ) const {
+    update += i;
   }
 };
 
 template<>
-struct FunctorScalar<1>{
-  FunctorScalar(Kokkos::View<double> r):result(r) {}
-  Kokkos::View<double> result;
-
+struct FunctorScalar< 1 > {
   typedef Kokkos::TeamPolicy<>::member_type team_type;
+
+  Kokkos::View< double > result;
+
+  FunctorScalar( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarInit;
 
 template<>
-struct FunctorScalarInit<0> {
-  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarInit< 0 > {
+  Kokkos::View< double > result;
 
-  Kokkos::View<double> result;
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarInit<1> {
-  FunctorScalarInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarFinal;
 
-
 template<>
-struct FunctorScalarFinal<0> {
-  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarFinal< 0 > {
   Kokkos::View<double> result;
+
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
 template<>
-struct FunctorScalarFinal<1> {
-  FunctorScalarFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
+  FunctorScalarFinal( Kokkos::View< double > r ) : result( r ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team, double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
+
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoin;
 
 template<>
-struct FunctorScalarJoin<0> {
-  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarJoin< 0 > {
   Kokkos::View<double> result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 };
 
 template<>
-struct FunctorScalarJoin<1> {
-  FunctorScalarJoin(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoin< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoin( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinFinal;
 
 template<>
-struct FunctorScalarJoinFinal<0> {
-  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinal< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
 
-  Kokkos::View<double> result;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
 template<>
-struct FunctorScalarJoinFinal<1> {
-  FunctorScalarJoinFinal(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinal< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinal( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinInit;
 
 template<>
-struct FunctorScalarJoinInit<0> {
-  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinInit< 0 > {
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
 
-  Kokkos::View<double> result;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarJoinInit<1> {
-  FunctorScalarJoinInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
-template<int ISTEAM>
+template< int ISTEAM >
 struct FunctorScalarJoinFinalInit;
 
 template<>
-struct FunctorScalarJoinFinalInit<0> {
-  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
-
+struct FunctorScalarJoinFinalInit< 0 > {
   Kokkos::View<double> result;
 
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i, double& update)  const {
+  void operator()( const int & i, double & update ) const {
     update += i;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
 
 template<>
-struct FunctorScalarJoinFinalInit<1> {
-  FunctorScalarJoinFinalInit(Kokkos::View<double> r):result(r) {}
+struct FunctorScalarJoinFinalInit< 1 > {
+  typedef Kokkos::TeamPolicy<>::member_type team_type;
 
-  Kokkos::View<double> result;
+  Kokkos::View< double > result;
+
+  FunctorScalarJoinFinalInit( Kokkos::View< double > r ) : result( r ) {}
 
-  typedef Kokkos::TeamPolicy<>::member_type team_type;
   KOKKOS_INLINE_FUNCTION
-  void operator() (const team_type& team,double& update) const {
-    update+=1.0/team.team_size()*team.league_rank();
+  void operator()( const team_type & team, double & update ) const {
+    update += 1.0 / team.team_size() * team.league_rank();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile double& dst, const volatile double& update) const {
+  void join( volatile double & dst, const volatile double & update ) const {
     dst += update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void final(double& update) const {
+  void final( double & update ) const {
     result() = update;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& update) const {
+  void init( double & update ) const {
     update = 0.0;
   }
 };
+
 struct Functor1 {
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i,double& update) const {
-    update+=i;
+  void operator()( const int & i, double & update ) const {
+    update += i;
   }
 };
 
 struct Functor2 {
   typedef double value_type[];
+
   const unsigned value_count;
 
-  Functor2(unsigned n):value_count(n){}
+  Functor2( unsigned n ) : value_count( n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const unsigned& i,double update[]) const {
-    for(unsigned j=0;j<value_count;j++)
-      update[j]+=i;
+  void operator()( const unsigned & i, double update[] ) const {
+    for ( unsigned j = 0; j < value_count; j++ ) {
+      update[j] += i;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
   void init( double dst[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile double dst[] ,
+  void join( volatile double dst[],
              const volatile double src[] ) const
   {
-    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+    for ( unsigned i = 0; i < value_count; ++i ) dst[i] += src[i];
   }
 };
 
-}
-}
+} // namespace ReduceCombinatorical
+
+} // namespace Test
 
 namespace Test {
 
-template<class ExecSpace = Kokkos::DefaultExecutionSpace>
+template< class ExecSpace = Kokkos::DefaultExecutionSpace >
 struct TestReduceCombinatoricalInstantiation {
-  template<class ... Args>
-  static void CallParallelReduce(Args... args) {
-    Kokkos::parallel_reduce(args...);
+  template< class ... Args >
+  static void CallParallelReduce( Args... args ) {
+    Kokkos::parallel_reduce( args... );
   }
 
-  template<class ... Args>
-  static void AddReturnArgument(Args... args) {
-    Kokkos::View<double,Kokkos::HostSpace> result_view("ResultView");
-    double expected_result = 1000.0*999.0/2.0;
+  template< class ... Args >
+  static void AddReturnArgument( Args... args ) {
+    Kokkos::View< double, Kokkos::HostSpace > result_view( "ResultView" );
+    double expected_result = 1000.0 * 999.0 / 2.0;
 
     double value = 0;
-    Kokkos::parallel_reduce(args...,value);
-    ASSERT_EQ(expected_result,value);
+    Kokkos::parallel_reduce( args..., value );
+    ASSERT_EQ( expected_result, value );
 
     result_view() = 0;
-    CallParallelReduce(args...,result_view);
-    ASSERT_EQ(expected_result,result_view());
+    CallParallelReduce( args..., result_view );
+    ASSERT_EQ( expected_result, result_view() );
 
     value = 0;
-    CallParallelReduce(args...,Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>(&value));
-    ASSERT_EQ(expected_result,value);
+    CallParallelReduce( args..., Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >( &value ) );
+    ASSERT_EQ( expected_result, value );
 
     result_view() = 0;
-    const Kokkos::View<double,Kokkos::HostSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> result_view_const_um = result_view;
-    CallParallelReduce(args...,result_view_const_um);
-    ASSERT_EQ(expected_result,result_view_const_um());
+    const Kokkos::View< double, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_const_um = result_view;
+    CallParallelReduce( args..., result_view_const_um );
+    ASSERT_EQ( expected_result, result_view_const_um() );
 
     value = 0;
-    CallParallelReduce(args...,Test::ReduceCombinatorical::AddPlus<double>(value));
-    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<value);
-    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<=value);
-    else
-      ASSERT_EQ(expected_result,value);
+    CallParallelReduce( args..., Test::ReduceCombinatorical::AddPlus< double >( value ) );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
 
     value = 0;
-    Test::ReduceCombinatorical::AddPlus<double> add(value);
-    CallParallelReduce(args...,add);
-    if((Kokkos::DefaultExecutionSpace::concurrency() > 1) && (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<value);
-    else if((Kokkos::DefaultExecutionSpace::concurrency() > 1) || (ExecSpace::concurrency()>1))
-      ASSERT_TRUE(expected_result<=value);
-    else
-      ASSERT_EQ(expected_result,value);
+    Test::ReduceCombinatorical::AddPlus< double > add( value );
+    CallParallelReduce( args..., add );
+    if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) && ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result < value );
+    }
+    else if ( ( Kokkos::DefaultExecutionSpace::concurrency() > 1 ) || ( ExecSpace::concurrency() > 1 ) ) {
+      ASSERT_TRUE( expected_result <= value );
+    }
+    else {
+      ASSERT_EQ( expected_result, value );
+    }
   }
 
-
-  template<class ... Args>
-  static void AddLambdaRange(void*,Args... args) {
-    AddReturnArgument(args...,  KOKKOS_LAMBDA (const int&i , double& lsum) {
+  template< class ... Args >
+  static void AddLambdaRange( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const int & i, double & lsum ) {
       lsum += i;
     });
   }
 
-  template<class ... Args>
-  static void AddLambdaTeam(void*,Args... args) {
-    AddReturnArgument(args..., KOKKOS_LAMBDA (const Kokkos::TeamPolicy<>::member_type& team, double& update) {
-      update+=1.0/team.team_size()*team.league_rank();
+  template< class ... Args >
+  static void AddLambdaTeam( void*, Args... args ) {
+    AddReturnArgument( args..., KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type & team, double & update ) {
+      update += 1.0 / team.team_size() * team.league_rank();
     });
   }
 
-  template<class ... Args>
-  static void AddLambdaRange(Kokkos::InvalidType,Args... args) {
-  }
+  template< class ... Args >
+  static void AddLambdaRange( Kokkos::InvalidType, Args... args ) {}
 
-  template<class ... Args>
-  static void AddLambdaTeam(Kokkos::InvalidType,Args... args) {
-  }
+  template< class ... Args >
+  static void AddLambdaTeam( Kokkos::InvalidType, Args... args ) {}
 
-  template<int ISTEAM, class ... Args>
-  static void AddFunctor(Args... args) {
-    Kokkos::View<double> result_view("FunctorView");
-    auto h_r = Kokkos::create_mirror_view(result_view);
-    Test::ReduceCombinatorical::FunctorScalar<ISTEAM> functor(result_view);
-    double expected_result = 1000.0*999.0/2.0;
+  template< int ISTEAM, class ... Args >
+  static void AddFunctor( Args... args ) {
+    Kokkos::View< double > result_view( "FunctorView" );
+    auto h_r = Kokkos::create_mirror_view( result_view );
+    Test::ReduceCombinatorical::FunctorScalar< ISTEAM > functor( result_view );
+    double expected_result = 1000.0 * 999.0 / 2.0;
 
-    AddReturnArgument(args..., functor);
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalar<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarInit<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoin<ISTEAM>(result_view));
-    AddReturnArgument(args..., Test::ReduceCombinatorical::FunctorScalarJoinInit<ISTEAM>(result_view));
+    AddReturnArgument( args..., functor );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalar< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarInit< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoin< ISTEAM >( result_view ) );
+    AddReturnArgument( args..., Test::ReduceCombinatorical::FunctorScalarJoinInit< ISTEAM >( result_view ) );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarFinal<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinal< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
 
     h_r() = 0;
-    Kokkos::deep_copy(result_view,h_r);
-    CallParallelReduce(args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit<ISTEAM>(result_view));
-    Kokkos::deep_copy(h_r,result_view);
-    ASSERT_EQ(expected_result,h_r());
+    Kokkos::deep_copy( result_view, h_r );
+    CallParallelReduce( args..., Test::ReduceCombinatorical::FunctorScalarJoinFinalInit< ISTEAM >( result_view ) );
+    Kokkos::deep_copy( h_r, result_view );
+    ASSERT_EQ( expected_result, h_r() );
   }
 
-  template<class ... Args>
-  static void AddFunctorLambdaRange(Args... args) {
-    AddFunctor<0,Args...>(args...);
-    #ifdef  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    AddLambdaRange(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
-    #endif
+  template< class ... Args >
+  static void AddFunctorLambdaRange( Args... args ) {
+    AddFunctor< 0, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaRange( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
   }
 
-  template<class ... Args>
-  static void AddFunctorLambdaTeam(Args... args) {
-    AddFunctor<1,Args...>(args...);
-    #ifdef  KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    AddLambdaTeam(typename std::conditional<std::is_same<ExecSpace,Kokkos::DefaultExecutionSpace>::value,void*,Kokkos::InvalidType>::type(), args...);
-    #endif
+  template< class ... Args >
+  static void AddFunctorLambdaTeam( Args... args ) {
+    AddFunctor< 1, Args... >( args... );
+#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+    AddLambdaTeam( typename std::conditional< std::is_same<ExecSpace, Kokkos::DefaultExecutionSpace>::value, void*, Kokkos::InvalidType >::type(), args... );
+#endif
   }
 
-  template<class ... Args>
-  static void AddPolicy(Args... args) {
+  template< class ... Args >
+  static void AddPolicy( Args... args ) {
     int N = 1000;
-    Kokkos::RangePolicy<ExecSpace> policy(0,N);
+    Kokkos::RangePolicy< ExecSpace > policy( 0, N );
 
-    AddFunctorLambdaRange(args...,1000);
-    AddFunctorLambdaRange(args...,N);
-    AddFunctorLambdaRange(args...,policy);
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace>(0,N));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(0,N).set_chunk_size(10));
-    AddFunctorLambdaRange(args...,Kokkos::RangePolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(0,N).set_chunk_size(10));
+    AddFunctorLambdaRange( args..., 1000 );
+    AddFunctorLambdaRange( args..., N );
+    AddFunctorLambdaRange( args..., policy );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( 0, N ).set_chunk_size( 10 ) );
+    AddFunctorLambdaRange( args..., Kokkos::RangePolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( 0, N ).set_chunk_size( 10 ) );
 
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace>(N,Kokkos::AUTO));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Static> >(N,Kokkos::AUTO).set_chunk_size(10));
-    AddFunctorLambdaTeam(args...,Kokkos::TeamPolicy<ExecSpace,Kokkos::Schedule<Kokkos::Dynamic> >(N,Kokkos::AUTO).set_chunk_size(10));
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Static> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
+    AddFunctorLambdaTeam( args..., Kokkos::TeamPolicy< ExecSpace, Kokkos::Schedule<Kokkos::Dynamic> >( N, Kokkos::AUTO ).set_chunk_size( 10 ) );
   }
 
-
   static void execute_a() {
     AddPolicy();
   }
 
   static void execute_b() {
-    std::string s("Std::String");
-    AddPolicy(s.c_str());
-    AddPolicy("Char Constant");
+    std::string s( "Std::String" );
+    AddPolicy( s.c_str() );
+    AddPolicy( "Char Constant" );
   }
 
   static void execute_c() {
-    std::string s("Std::String");
-    AddPolicy(s);
+    std::string s( "Std::String" );
+    AddPolicy( s );
   }
 };
 
-template<class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace>
+template< class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace >
 struct TestReducers {
-
   struct SumFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value += values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value += values( i );
     }
   };
 
   struct ProdFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value *= values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value *= values( i );
     }
   };
 
   struct MinFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      if(values(i) < value)
-        value = values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) < value ) value = values( i );
     }
   };
 
   struct MaxFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      if(values(i) > value)
-        value = values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      if ( values( i ) > value ) value = values( i );
     }
   };
 
   struct MinLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type& value) const {
-      if(values(i) < value.val) {
-        value.val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) < value.val ) {
+        value.val = values( i );
         value.loc = i;
       }
     }
   };
 
   struct MaxLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type& value) const {
-      if(values(i) > value.val) {
-        value.val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.val ) {
+        value.val = values( i );
         value.loc = i;
       }
     }
   };
 
   struct MinMaxLocFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i,
-        typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type& value) const {
-      if(values(i) > value.max_val) {
-        value.max_val = values(i);
+    void operator()( const int & i, typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type & value ) const {
+      if ( values( i ) > value.max_val ) {
+        value.max_val = values( i );
         value.max_loc = i;
       }
-      if(values(i) < value.min_val) {
-        value.min_val = values(i);
+
+      if ( values( i ) < value.min_val ) {
+        value.min_val = values( i );
         value.min_loc = i;
       }
     }
   };
 
   struct BAndFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value & values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value & values( i );
     }
   };
 
   struct BOrFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value | values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value | values( i );
     }
   };
 
   struct BXorFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value ^ values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value ^ values( i );
     }
   };
 
   struct LAndFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value && values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value && values( i );
     }
   };
 
   struct LOrFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value || values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value || values( i );
     }
   };
 
   struct LXorFunctor {
-    Kokkos::View<const Scalar*,ExecSpace> values;
+    Kokkos::View< const Scalar*, ExecSpace > values;
+
     KOKKOS_INLINE_FUNCTION
-    void operator() (const int& i, Scalar& value) const {
-      value = value ? (!values(i)) : values(i);
+    void operator()( const int & i, Scalar & value ) const {
+      value = value ? ( !values( i ) ) : values( i );
     }
   };
 
-  static void test_sum(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_sum( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_sum = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100);
-      reference_sum += h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100 );
+      reference_sum += h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     SumFunctor f;
     f.values = values;
@@ -1224,556 +1254,669 @@ struct TestReducers {
 
     {
       Scalar sum_scalar = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_scalar(sum_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(sum_scalar,reference_sum);
+      Kokkos::Experimental::Sum< Scalar > reducer_scalar( sum_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( sum_scalar, reference_sum );
+
       Scalar sum_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(sum_scalar_view,reference_sum);
+      ASSERT_EQ( sum_scalar_view, reference_sum );
     }
+
     {
       Scalar sum_scalar_init = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_scalar_init(sum_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(sum_scalar_init,reference_sum);
+      Kokkos::Experimental::Sum< Scalar > reducer_scalar_init( sum_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( sum_scalar_init, reference_sum );
+
       Scalar sum_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(sum_scalar_init_view,reference_sum);
+      ASSERT_EQ( sum_scalar_init_view, reference_sum );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace> sum_view( "View" );
       sum_view() = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_view(sum_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Sum< Scalar > reducer_view( sum_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar sum_view_scalar = sum_view();
-      ASSERT_EQ(sum_view_scalar,reference_sum);
+      ASSERT_EQ( sum_view_scalar, reference_sum );
+
       Scalar sum_view_view = reducer_view.result_view()();
-      ASSERT_EQ(sum_view_view,reference_sum);
+      ASSERT_EQ( sum_view_view, reference_sum );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> sum_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > sum_view_init( "View" );
       sum_view_init() = init;
-      Kokkos::Experimental::Sum<Scalar> reducer_view_init(sum_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Sum< Scalar > reducer_view_init( sum_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar sum_view_init_scalar = sum_view_init();
-      ASSERT_EQ(sum_view_init_scalar,reference_sum);
+      ASSERT_EQ( sum_view_init_scalar, reference_sum );
+
       Scalar sum_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(sum_view_init_view,reference_sum);
+      ASSERT_EQ( sum_view_init_view, reference_sum );
     }
   }
 
-  static void test_prod(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_prod( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_prod = 1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%4+1);
-      reference_prod *= h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 4 + 1 );
+      reference_prod *= h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     ProdFunctor f;
     f.values = values;
     Scalar init = 1;
 
-    if(std::is_arithmetic<Scalar>::value)
+    if ( std::is_arithmetic< Scalar >::value )
     {
       Scalar prod_scalar = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_scalar(prod_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(prod_scalar,reference_prod);
+      Kokkos::Experimental::Prod< Scalar > reducer_scalar( prod_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( prod_scalar, reference_prod );
+
       Scalar prod_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(prod_scalar_view,reference_prod);
+      ASSERT_EQ( prod_scalar_view, reference_prod );
     }
+
     {
       Scalar prod_scalar_init = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_scalar_init(prod_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(prod_scalar_init,reference_prod);
+      Kokkos::Experimental::Prod< Scalar > reducer_scalar_init( prod_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( prod_scalar_init, reference_prod );
+
       Scalar prod_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(prod_scalar_init_view,reference_prod);
+      ASSERT_EQ( prod_scalar_init_view, reference_prod );
     }
 
-    if(std::is_arithmetic<Scalar>::value)
+    if ( std::is_arithmetic< Scalar >::value )
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > prod_view( "View" );
       prod_view() = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_view(prod_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Prod< Scalar > reducer_view( prod_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar prod_view_scalar = prod_view();
-      ASSERT_EQ(prod_view_scalar,reference_prod);
+      ASSERT_EQ( prod_view_scalar, reference_prod );
+
       Scalar prod_view_view = reducer_view.result_view()();
-      ASSERT_EQ(prod_view_view,reference_prod);
+      ASSERT_EQ( prod_view_view, reference_prod );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> prod_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > prod_view_init( "View" );
       prod_view_init() = init;
-      Kokkos::Experimental::Prod<Scalar> reducer_view_init(prod_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Prod< Scalar > reducer_view_init( prod_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar prod_view_init_scalar = prod_view_init();
-      ASSERT_EQ(prod_view_init_scalar,reference_prod);
+      ASSERT_EQ( prod_view_init_scalar, reference_prod );
+
       Scalar prod_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(prod_view_init_view,reference_prod);
+      ASSERT_EQ( prod_view_init_view, reference_prod );
     }
   }
 
-  static void test_min(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_min = std::numeric_limits<Scalar>::max();
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)<reference_min)
-        reference_min = h_values(i);
+  static void test_min( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) reference_min = h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MinFunctor f;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::max();
+    Scalar init = std::numeric_limits< Scalar >::max();
 
     {
       Scalar min_scalar = init;
-      Kokkos::Experimental::Min<Scalar> reducer_scalar(min_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(min_scalar,reference_min);
+      Kokkos::Experimental::Min< Scalar > reducer_scalar( min_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( min_scalar, reference_min );
+
       Scalar min_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(min_scalar_view,reference_min);
+      ASSERT_EQ( min_scalar_view, reference_min );
     }
+
     {
       Scalar min_scalar_init = init;
-      Kokkos::Experimental::Min<Scalar> reducer_scalar_init(min_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(min_scalar_init,reference_min);
+      Kokkos::Experimental::Min< Scalar > reducer_scalar_init( min_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( min_scalar_init, reference_min );
+
       Scalar min_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(min_scalar_init_view,reference_min);
+      ASSERT_EQ( min_scalar_init_view, reference_min );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> min_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > min_view( "View" );
       min_view() = init;
-      Kokkos::Experimental::Min<Scalar> reducer_view(min_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Min< Scalar > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar min_view_scalar = min_view();
-      ASSERT_EQ(min_view_scalar,reference_min);
+      ASSERT_EQ( min_view_scalar, reference_min );
+
       Scalar min_view_view = reducer_view.result_view()();
-      ASSERT_EQ(min_view_view,reference_min);
+      ASSERT_EQ( min_view_view, reference_min );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> min_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > min_view_init( "View" );
       min_view_init() = init;
-      Kokkos::Experimental::Min<Scalar> reducer_view_init(min_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Min< Scalar > reducer_view_init( min_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar min_view_init_scalar = min_view_init();
-      ASSERT_EQ(min_view_init_scalar,reference_min);
+      ASSERT_EQ( min_view_init_scalar, reference_min );
+
       Scalar min_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(min_view_init_view,reference_min);
+      ASSERT_EQ( min_view_init_view, reference_min );
     }
   }
 
-  static void test_max(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_max = std::numeric_limits<Scalar>::min();
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000+1);
-      if(h_values(i)>reference_max)
-        reference_max = h_values(i);
+  static void test_max( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+
+      if ( h_values( i ) > reference_max ) reference_max = h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MaxFunctor f;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::min();
+    Scalar init = std::numeric_limits< Scalar >::min();
 
     {
       Scalar max_scalar = init;
-      Kokkos::Experimental::Max<Scalar> reducer_scalar(max_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(max_scalar,reference_max);
+      Kokkos::Experimental::Max< Scalar > reducer_scalar( max_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( max_scalar, reference_max );
+
       Scalar max_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(max_scalar_view,reference_max);
+      ASSERT_EQ( max_scalar_view, reference_max );
     }
+
     {
       Scalar max_scalar_init = init;
-      Kokkos::Experimental::Max<Scalar> reducer_scalar_init(max_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(max_scalar_init,reference_max);
+      Kokkos::Experimental::Max< Scalar > reducer_scalar_init( max_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( max_scalar_init, reference_max );
+
       Scalar max_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(max_scalar_init_view,reference_max);
+      ASSERT_EQ( max_scalar_init_view, reference_max );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> max_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > max_view( "View" );
       max_view() = init;
-      Kokkos::Experimental::Max<Scalar> reducer_view(max_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::Max< Scalar > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar max_view_scalar = max_view();
-      ASSERT_EQ(max_view_scalar,reference_max);
+      ASSERT_EQ( max_view_scalar, reference_max );
+
       Scalar max_view_view = reducer_view.result_view()();
-      ASSERT_EQ(max_view_view,reference_max);
+      ASSERT_EQ( max_view_view, reference_max );
     }
+
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> max_view_init("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > max_view_init( "View" );
       max_view_init() = init;
-      Kokkos::Experimental::Max<Scalar> reducer_view_init(max_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::Experimental::Max< Scalar > reducer_view_init( max_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       Scalar max_view_init_scalar = max_view_init();
-      ASSERT_EQ(max_view_init_scalar,reference_max);
+      ASSERT_EQ( max_view_init_scalar, reference_max );
+
       Scalar max_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(max_view_init_view,reference_max);
+      ASSERT_EQ( max_view_init_view, reference_max );
     }
   }
 
-  static void test_minloc(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_min = std::numeric_limits<Scalar>::max();
+  static void test_minloc( int N ) {
+    typedef typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_min = std::numeric_limits< Scalar >::max();
     int reference_loc = -1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)<reference_min) {
-        reference_min = h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) < reference_min ) {
+        reference_min = h_values( i );
         reference_loc = i;
-      } else if (h_values(i) == reference_min) {
-        // make min unique
-        h_values(i) += std::numeric_limits<Scalar>::epsilon();
+      }
+      else if ( h_values( i ) == reference_min ) {
+        // Duplicate of the running minimum: nudge it up by epsilon, intended to keep the min location unique.
+        h_values( i ) += std::numeric_limits< Scalar >::epsilon();
       }
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MinLocFunctor f;
-    typedef typename Kokkos::Experimental::MinLoc<Scalar,int>::value_type value_type;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::max();
-
+    Scalar init = std::numeric_limits< Scalar >::max();
 
     {
       value_type min_scalar;
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar(min_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(min_scalar.val,reference_min);
-      ASSERT_EQ(min_scalar.loc,reference_loc);
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar( min_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( min_scalar.val, reference_min );
+      ASSERT_EQ( min_scalar.loc, reference_loc );
+
       value_type min_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(min_scalar_view.val,reference_min);
-      ASSERT_EQ(min_scalar_view.loc,reference_loc);
+      ASSERT_EQ( min_scalar_view.val, reference_min );
+      ASSERT_EQ( min_scalar_view.loc, reference_loc );
     }
+
     {
       value_type min_scalar_init;
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_scalar_init(min_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(min_scalar_init.val,reference_min);
-      ASSERT_EQ(min_scalar_init.loc,reference_loc);
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar_init( min_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( min_scalar_init.val, reference_min );
+      ASSERT_EQ( min_scalar_init.loc, reference_loc );
+
       value_type min_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(min_scalar_init_view.val,reference_min);
-      ASSERT_EQ(min_scalar_init_view.loc,reference_loc);
+      ASSERT_EQ( min_scalar_init_view.val, reference_min );
+      ASSERT_EQ( min_scalar_init_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> min_view("View");
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view(min_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::View< value_type, Kokkos::HostSpace > min_view( "View" );
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_view( min_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       value_type min_view_scalar = min_view();
-      ASSERT_EQ(min_view_scalar.val,reference_min);
-      ASSERT_EQ(min_view_scalar.loc,reference_loc);
+      ASSERT_EQ( min_view_scalar.val, reference_min );
+      ASSERT_EQ( min_view_scalar.loc, reference_loc );
+
       value_type min_view_view = reducer_view.result_view()();
-      ASSERT_EQ(min_view_view.val,reference_min);
-      ASSERT_EQ(min_view_view.loc,reference_loc);
+      ASSERT_EQ( min_view_view.val, reference_min );
+      ASSERT_EQ( min_view_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> min_view_init("View");
-      Kokkos::Experimental::MinLoc<Scalar,int> reducer_view_init(min_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::View< value_type, Kokkos::HostSpace > min_view_init( "View" );
+      Kokkos::Experimental::MinLoc< Scalar, int > reducer_view_init( min_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       value_type min_view_init_scalar = min_view_init();
-      ASSERT_EQ(min_view_init_scalar.val,reference_min);
-      ASSERT_EQ(min_view_init_scalar.loc,reference_loc);
+      ASSERT_EQ( min_view_init_scalar.val, reference_min );
+      ASSERT_EQ( min_view_init_scalar.loc, reference_loc );
+
       value_type min_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(min_view_init_view.val,reference_min);
-      ASSERT_EQ(min_view_init_view.loc,reference_loc);
+      ASSERT_EQ( min_view_init_view.val, reference_min );
+      ASSERT_EQ( min_view_init_view.loc, reference_loc );
     }
   }
 
-  static void test_maxloc(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_max = std::numeric_limits<Scalar>::min();
+  static void test_maxloc( int N ) {
+    typedef typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type value_type;
+
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_max = std::numeric_limits< Scalar >::min();
     int reference_loc = -1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000);
-      if(h_values(i)>reference_max) {
-        reference_max = h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 );
+
+      if ( h_values( i ) > reference_max ) {
+        reference_max = h_values( i );
         reference_loc = i;
-      } else if (h_values(i) == reference_max) {
-        // make max unique
-        h_values(i) -= std::numeric_limits<Scalar>::epsilon();
+      }
+      else if ( h_values( i ) == reference_max ) {
+        // Duplicate of the running maximum: nudge it down by epsilon, intended to keep the max location unique.
+        h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
       }
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     MaxLocFunctor f;
-    typedef typename Kokkos::Experimental::MaxLoc<Scalar,int>::value_type value_type;
     f.values = values;
-    Scalar init = std::numeric_limits<Scalar>::min();
-
+    Scalar init = std::numeric_limits< Scalar >::min();
 
     {
       value_type max_scalar;
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar(max_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(max_scalar.val,reference_max);
-      ASSERT_EQ(max_scalar.loc,reference_loc);
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar( max_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( max_scalar.val, reference_max );
+      ASSERT_EQ( max_scalar.loc, reference_loc );
+
       value_type max_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(max_scalar_view.val,reference_max);
-      ASSERT_EQ(max_scalar_view.loc,reference_loc);
+      ASSERT_EQ( max_scalar_view.val, reference_max );
+      ASSERT_EQ( max_scalar_view.loc, reference_loc );
     }
+
     {
       value_type max_scalar_init;
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_scalar_init(max_scalar_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-      ASSERT_EQ(max_scalar_init.val,reference_max);
-      ASSERT_EQ(max_scalar_init.loc,reference_loc);
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar_init( max_scalar_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+      ASSERT_EQ( max_scalar_init.val, reference_max );
+      ASSERT_EQ( max_scalar_init.loc, reference_loc );
+
       value_type max_scalar_init_view = reducer_scalar_init.result_view()();
-      ASSERT_EQ(max_scalar_init_view.val,reference_max);
-      ASSERT_EQ(max_scalar_init_view.loc,reference_loc);
+      ASSERT_EQ( max_scalar_init_view.val, reference_max );
+      ASSERT_EQ( max_scalar_init_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> max_view("View");
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view(max_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::View< value_type, Kokkos::HostSpace > max_view( "View" );
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_view( max_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       value_type max_view_scalar = max_view();
-      ASSERT_EQ(max_view_scalar.val,reference_max);
-      ASSERT_EQ(max_view_scalar.loc,reference_loc);
+      ASSERT_EQ( max_view_scalar.val, reference_max );
+      ASSERT_EQ( max_view_scalar.loc, reference_loc );
+
       value_type max_view_view = reducer_view.result_view()();
-      ASSERT_EQ(max_view_view.val,reference_max);
-      ASSERT_EQ(max_view_view.loc,reference_loc);
+      ASSERT_EQ( max_view_view.val, reference_max );
+      ASSERT_EQ( max_view_view.loc, reference_loc );
     }
+
     {
-      Kokkos::View<value_type,Kokkos::HostSpace> max_view_init("View");
-      Kokkos::Experimental::MaxLoc<Scalar,int> reducer_view_init(max_view_init,init);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+      Kokkos::View< value_type, Kokkos::HostSpace > max_view_init( "View" );
+      Kokkos::Experimental::MaxLoc< Scalar, int > reducer_view_init( max_view_init, init );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
       value_type max_view_init_scalar = max_view_init();
-      ASSERT_EQ(max_view_init_scalar.val,reference_max);
-      ASSERT_EQ(max_view_init_scalar.loc,reference_loc);
+      ASSERT_EQ( max_view_init_scalar.val, reference_max );
+      ASSERT_EQ( max_view_init_scalar.loc, reference_loc );
+
       value_type max_view_init_view = reducer_view_init.result_view()();
-      ASSERT_EQ(max_view_init_view.val,reference_max);
-      ASSERT_EQ(max_view_init_view.loc,reference_loc);
+      ASSERT_EQ( max_view_init_view.val, reference_max );
+      ASSERT_EQ( max_view_init_view.loc, reference_loc );
     }
   }
 
-  static void test_minmaxloc(int N) {
-     Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-     auto h_values = Kokkos::create_mirror_view(values);
-     Scalar reference_max = std::numeric_limits<Scalar>::min();
-     Scalar reference_min = std::numeric_limits<Scalar>::max();
+  static void test_minmaxloc( int N ) {
+     typedef typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type value_type;
+
+     Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+     auto h_values = Kokkos::create_mirror_view( values );
+     Scalar reference_max = std::numeric_limits< Scalar >::min();
+     Scalar reference_min = std::numeric_limits< Scalar >::max();
      int reference_minloc = -1;
      int reference_maxloc = -1;
-     for(int i=0; i<N; i++) {
-       h_values(i) = (Scalar)(rand()%100000);
+
+     for ( int i = 0; i < N; i++ ) {
+       h_values( i ) = (Scalar) ( rand() % 100000 );
      }
-     for(int i=0; i<N; i++) {
-       if(h_values(i)>reference_max) {
-         reference_max = h_values(i);
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) > reference_max ) {
+         reference_max = h_values( i );
          reference_maxloc = i;
-       } else if (h_values(i) == reference_max) {
-         // make max unique
-         h_values(i) -= std::numeric_limits<Scalar>::epsilon();
+       }
+       else if ( h_values( i ) == reference_max ) {
+         // Duplicate of the running maximum: nudge it down by epsilon, intended to keep the max location unique.
+         h_values( i ) -= std::numeric_limits< Scalar >::epsilon();
        }
      }
-     for(int i=0; i<N; i++) {
-       if(h_values(i)<reference_min) {
-         reference_min = h_values(i);
+
+     for ( int i = 0; i < N; i++ ) {
+       if ( h_values( i ) < reference_min ) {
+         reference_min = h_values( i );
          reference_minloc = i;
-       } else if (h_values(i) == reference_min) {
-         // make min unique
-         h_values(i) += std::numeric_limits<Scalar>::epsilon();
+       }
+       else if ( h_values( i ) == reference_min ) {
+         // Duplicate of the running minimum: nudge it up by epsilon, intended to keep the min location unique.
+         h_values( i ) += std::numeric_limits< Scalar >::epsilon();
        }
      }
-     Kokkos::deep_copy(values,h_values);
+
+     Kokkos::deep_copy( values, h_values );
 
      MinMaxLocFunctor f;
-     typedef typename Kokkos::Experimental::MinMaxLoc<Scalar,int>::value_type value_type;
      f.values = values;
-     Scalar init_min = std::numeric_limits<Scalar>::max();
-     Scalar init_max = std::numeric_limits<Scalar>::min();
-
+     Scalar init_min = std::numeric_limits< Scalar >::max();
+     Scalar init_max = std::numeric_limits< Scalar >::min();
 
      {
        value_type minmax_scalar;
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar(minmax_scalar);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-       ASSERT_EQ(minmax_scalar.min_val,reference_min);
-       for(int i=0; i<N; i++) {
-         if((i == minmax_scalar.min_loc) && (h_values(i)==reference_min))
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar( minmax_scalar );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+       ASSERT_EQ( minmax_scalar.min_val, reference_min );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.min_loc ) && ( h_values( i ) == reference_min ) ) {
            reference_minloc = i;
+         }
        }
-       ASSERT_EQ(minmax_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar.max_val,reference_max);
-       for(int i=0; i<N; i++) {
-         if((i == minmax_scalar.max_loc) && (h_values(i)==reference_max))
+
+       ASSERT_EQ( minmax_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar.max_val, reference_max );
+
+       for ( int i = 0; i < N; i++ ) {
+         if ( ( i == minmax_scalar.max_loc ) && ( h_values( i ) == reference_max ) ) {
            reference_maxloc = i;
+         }
        }
-       ASSERT_EQ(minmax_scalar.max_loc,reference_maxloc);
+
+       ASSERT_EQ( minmax_scalar.max_loc, reference_maxloc );
+
        value_type minmax_scalar_view = reducer_scalar.result_view()();
-       ASSERT_EQ(minmax_scalar_view.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_view.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_scalar_view.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_view.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_view.max_loc, reference_maxloc );
      }
+
      {
        value_type minmax_scalar_init;
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_scalar_init(minmax_scalar_init,init_min,init_max);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar_init);
-       ASSERT_EQ(minmax_scalar_init.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_init.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_init.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_init.max_loc,reference_maxloc);
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar_init( minmax_scalar_init, init_min, init_max );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar_init );
+
+       ASSERT_EQ( minmax_scalar_init.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_init.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_init.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_init.max_loc, reference_maxloc );
+
        value_type minmax_scalar_init_view = reducer_scalar_init.result_view()();
-       ASSERT_EQ(minmax_scalar_init_view.min_val,reference_min);
-       ASSERT_EQ(minmax_scalar_init_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_scalar_init_view.max_val,reference_max);
-       ASSERT_EQ(minmax_scalar_init_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_scalar_init_view.min_val, reference_min );
+       ASSERT_EQ( minmax_scalar_init_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_scalar_init_view.max_val, reference_max );
+       ASSERT_EQ( minmax_scalar_init_view.max_loc, reference_maxloc );
      }
+
      {
-       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view("View");
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view(minmax_view);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+       Kokkos::View< value_type, Kokkos::HostSpace > minmax_view( "View" );
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_view( minmax_view );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
        value_type minmax_view_scalar = minmax_view();
-       ASSERT_EQ(minmax_view_scalar.min_val,reference_min);
-       ASSERT_EQ(minmax_view_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_scalar.max_val,reference_max);
-       ASSERT_EQ(minmax_view_scalar.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_scalar.min_val, reference_min );
+       ASSERT_EQ( minmax_view_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_scalar.max_val, reference_max );
+       ASSERT_EQ( minmax_view_scalar.max_loc, reference_maxloc );
+
        value_type minmax_view_view = reducer_view.result_view()();
-       ASSERT_EQ(minmax_view_view.min_val,reference_min);
-       ASSERT_EQ(minmax_view_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_view.max_val,reference_max);
-       ASSERT_EQ(minmax_view_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_view.min_val, reference_min );
+       ASSERT_EQ( minmax_view_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_view.max_val, reference_max );
+       ASSERT_EQ( minmax_view_view.max_loc, reference_maxloc );
      }
+
      {
-       Kokkos::View<value_type,Kokkos::HostSpace> minmax_view_init("View");
-       Kokkos::Experimental::MinMaxLoc<Scalar,int> reducer_view_init(minmax_view_init,init_min,init_max);
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view_init);
+       Kokkos::View< value_type, Kokkos::HostSpace > minmax_view_init( "View" );
+       Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_view_init( minmax_view_init, init_min, init_max );
+       Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view_init );
+
        value_type minmax_view_init_scalar = minmax_view_init();
-       ASSERT_EQ(minmax_view_init_scalar.min_val,reference_min);
-       ASSERT_EQ(minmax_view_init_scalar.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_init_scalar.max_val,reference_max);
-       ASSERT_EQ(minmax_view_init_scalar.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_init_scalar.min_val, reference_min );
+       ASSERT_EQ( minmax_view_init_scalar.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_init_scalar.max_val, reference_max );
+       ASSERT_EQ( minmax_view_init_scalar.max_loc, reference_maxloc );
+
        value_type minmax_view_init_view = reducer_view_init.result_view()();
-       ASSERT_EQ(minmax_view_init_view.min_val,reference_min);
-       ASSERT_EQ(minmax_view_init_view.min_loc,reference_minloc);
-       ASSERT_EQ(minmax_view_init_view.max_val,reference_max);
-       ASSERT_EQ(minmax_view_init_view.max_loc,reference_maxloc);
+       ASSERT_EQ( minmax_view_init_view.min_val, reference_min );
+       ASSERT_EQ( minmax_view_init_view.min_loc, reference_minloc );
+       ASSERT_EQ( minmax_view_init_view.max_val, reference_max );
+       ASSERT_EQ( minmax_view_init_view.max_loc, reference_maxloc );
      }
    }
 
-  static void test_BAnd(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_band = Scalar() | (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%100000+1);
-      reference_band = reference_band & h_values(i);
+  static void test_BAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_band = Scalar() | ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 100000 + 1 );
+      reference_band = reference_band & h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BAndFunctor f;
     f.values = values;
-    Scalar init = Scalar() | (~Scalar());
+    Scalar init = Scalar() | ( ~Scalar() );
 
     {
       Scalar band_scalar = init;
-      Kokkos::Experimental::BAnd<Scalar> reducer_scalar(band_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(band_scalar,reference_band);
+      Kokkos::Experimental::BAnd< Scalar > reducer_scalar( band_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( band_scalar, reference_band );
       Scalar band_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(band_scalar_view,reference_band);
+
+      ASSERT_EQ( band_scalar_view, reference_band );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> band_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > band_view( "View" );
       band_view() = init;
-      Kokkos::Experimental::BAnd<Scalar> reducer_view(band_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BAnd< Scalar > reducer_view( band_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar band_view_scalar = band_view();
-      ASSERT_EQ(band_view_scalar,reference_band);
+      ASSERT_EQ( band_view_scalar, reference_band );
+
       Scalar band_view_view = reducer_view.result_view()();
-      ASSERT_EQ(band_view_view,reference_band);
+      ASSERT_EQ( band_view_view, reference_band );
     }
   }
 
-  static void test_BOr(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_bor = Scalar() & (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)((rand()%100000+1)*2);
-      reference_bor = reference_bor | h_values(i);
+  static void test_BOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_bor = Scalar() & ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( ( rand() % 100000 + 1 ) * 2 );
+      reference_bor = reference_bor | h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BOrFunctor f;
     f.values = values;
-    Scalar init = Scalar() & (~Scalar());
+    Scalar init = Scalar() & ( ~Scalar() );
 
     {
       Scalar bor_scalar = init;
-      Kokkos::Experimental::BOr<Scalar> reducer_scalar(bor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(bor_scalar,reference_bor);
+      Kokkos::Experimental::BOr< Scalar > reducer_scalar( bor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( bor_scalar, reference_bor );
+
       Scalar bor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(bor_scalar_view,reference_bor);
+      ASSERT_EQ( bor_scalar_view, reference_bor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> bor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > bor_view( "View" );
       bor_view() = init;
-      Kokkos::Experimental::BOr<Scalar> reducer_view(bor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BOr< Scalar > reducer_view( bor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar bor_view_scalar = bor_view();
-      ASSERT_EQ(bor_view_scalar,reference_bor);
+      ASSERT_EQ( bor_view_scalar, reference_bor );
+
       Scalar bor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(bor_view_view,reference_bor);
+      ASSERT_EQ( bor_view_view, reference_bor );
     }
   }
 
-  static void test_BXor(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
-    Scalar reference_bxor = Scalar() & (~Scalar());
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)((rand()%100000+1)*2);
-      reference_bxor = reference_bxor ^ h_values(i);
+  static void test_BXor( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
+    Scalar reference_bxor = Scalar() & ( ~Scalar() );
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( ( rand() % 100000 + 1 ) * 2 );
+      reference_bxor = reference_bxor ^ h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     BXorFunctor f;
     f.values = values;
-    Scalar init = Scalar() & (~Scalar());
+    Scalar init = Scalar() & ( ~Scalar() );
 
     {
       Scalar bxor_scalar = init;
-      Kokkos::Experimental::BXor<Scalar> reducer_scalar(bxor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(bxor_scalar,reference_bxor);
+      Kokkos::Experimental::BXor< Scalar > reducer_scalar( bxor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( bxor_scalar, reference_bxor );
+
       Scalar bxor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(bxor_scalar_view,reference_bxor);
+      ASSERT_EQ( bxor_scalar_view, reference_bxor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> bxor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > bxor_view( "View" );
       bxor_view() = init;
-      Kokkos::Experimental::BXor<Scalar> reducer_view(bxor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::BXor< Scalar > reducer_view( bxor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar bxor_view_scalar = bxor_view();
-      ASSERT_EQ(bxor_view_scalar,reference_bxor);
+      ASSERT_EQ( bxor_view_scalar, reference_bxor );
+
       Scalar bxor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(bxor_view_view,reference_bxor);
+      ASSERT_EQ( bxor_view_view, reference_bxor );
     }
   }
 
-  static void test_LAnd(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LAnd( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_land = 1;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_land = reference_land && h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_land = reference_land && h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LAndFunctor f;
     f.values = values;
@@ -1781,34 +1924,39 @@ struct TestReducers {
 
     {
       Scalar land_scalar = init;
-      Kokkos::Experimental::LAnd<Scalar> reducer_scalar(land_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(land_scalar,reference_land);
+      Kokkos::Experimental::LAnd< Scalar > reducer_scalar( land_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( land_scalar, reference_land );
+
       Scalar land_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(land_scalar_view,reference_land);
+      ASSERT_EQ( land_scalar_view, reference_land );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> land_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > land_view( "View" );
       land_view() = init;
-      Kokkos::Experimental::LAnd<Scalar> reducer_view(land_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LAnd< Scalar > reducer_view( land_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar land_view_scalar = land_view();
-      ASSERT_EQ(land_view_scalar,reference_land);
+      ASSERT_EQ( land_view_scalar, reference_land );
+
       Scalar land_view_view = reducer_view.result_view()();
-      ASSERT_EQ(land_view_view,reference_land);
+      ASSERT_EQ( land_view_view, reference_land );
     }
   }
 
-  static void test_LOr(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LOr( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_lor = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_lor = reference_lor || h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_lor = reference_lor || h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LOrFunctor f;
     f.values = values;
@@ -1816,34 +1964,39 @@ struct TestReducers {
 
     {
       Scalar lor_scalar = init;
-      Kokkos::Experimental::LOr<Scalar> reducer_scalar(lor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(lor_scalar,reference_lor);
+      Kokkos::Experimental::LOr< Scalar > reducer_scalar( lor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( lor_scalar, reference_lor );
+
       Scalar lor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(lor_scalar_view,reference_lor);
+      ASSERT_EQ( lor_scalar_view, reference_lor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> lor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > lor_view( "View" );
       lor_view() = init;
-      Kokkos::Experimental::LOr<Scalar> reducer_view(lor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LOr< Scalar > reducer_view( lor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar lor_view_scalar = lor_view();
-      ASSERT_EQ(lor_view_scalar,reference_lor);
+      ASSERT_EQ( lor_view_scalar, reference_lor );
+
       Scalar lor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(lor_view_view,reference_lor);
+      ASSERT_EQ( lor_view_view, reference_lor );
     }
   }
 
-  static void test_LXor(int N) {
-    Kokkos::View<Scalar*,ExecSpace> values("Values",N);
-    auto h_values = Kokkos::create_mirror_view(values);
+  static void test_LXor( int N ) {
+    Kokkos::View< Scalar*, ExecSpace > values( "Values", N );
+    auto h_values = Kokkos::create_mirror_view( values );
     Scalar reference_lxor = 0;
-    for(int i=0; i<N; i++) {
-      h_values(i) = (Scalar)(rand()%2);
-      reference_lxor = reference_lxor ? (!h_values(i)) : h_values(i);
+
+    for ( int i = 0; i < N; i++ ) {
+      h_values( i ) = (Scalar) ( rand() % 2 );
+      reference_lxor = reference_lxor ? ( !h_values( i ) ) : h_values( i );
     }
-    Kokkos::deep_copy(values,h_values);
+    Kokkos::deep_copy( values, h_values );
 
     LXorFunctor f;
     f.values = values;
@@ -1851,57 +2004,59 @@ struct TestReducers {
 
     {
       Scalar lxor_scalar = init;
-      Kokkos::Experimental::LXor<Scalar> reducer_scalar(lxor_scalar);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_scalar);
-      ASSERT_EQ(lxor_scalar,reference_lxor);
+      Kokkos::Experimental::LXor< Scalar > reducer_scalar( lxor_scalar );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar );
+
+      ASSERT_EQ( lxor_scalar, reference_lxor );
+
       Scalar lxor_scalar_view = reducer_scalar.result_view()();
-      ASSERT_EQ(lxor_scalar_view,reference_lxor);
+      ASSERT_EQ( lxor_scalar_view, reference_lxor );
     }
 
     {
-      Kokkos::View<Scalar,Kokkos::HostSpace> lxor_view("View");
+      Kokkos::View< Scalar, Kokkos::HostSpace > lxor_view( "View" );
       lxor_view() = init;
-      Kokkos::Experimental::LXor<Scalar> reducer_view(lxor_view);
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecSpace>(0,N),f,reducer_view);
+      Kokkos::Experimental::LXor< Scalar > reducer_view( lxor_view );
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_view );
+
       Scalar lxor_view_scalar = lxor_view();
-      ASSERT_EQ(lxor_view_scalar,reference_lxor);
+      ASSERT_EQ( lxor_view_scalar, reference_lxor );
+
       Scalar lxor_view_view = reducer_view.result_view()();
-      ASSERT_EQ(lxor_view_view,reference_lxor);
+      ASSERT_EQ( lxor_view_view, reference_lxor );
     }
   }
 
   static void execute_float() {
-    test_sum(10001);
-    test_prod(35);
-    test_min(10003);
-    test_minloc(10003);
-    test_max(10007);
-    test_maxloc(10007);
-    test_minmaxloc(10007);
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
   }
 
   static void execute_integer() {
-    test_sum(10001);
-    test_prod(35);
-    test_min(10003);
-    test_minloc(10003);
-    test_max(10007);
-    test_maxloc(10007);
-    test_minmaxloc(10007);
-    test_BAnd(35);
-    test_BOr(35);
-    test_BXor(35);
-    test_LAnd(35);
-    test_LOr(35);
-    test_LXor(35);
+    test_sum( 10001 );
+    test_prod( 35 );
+    test_min( 10003 );
+    test_minloc( 10003 );
+    test_max( 10007 );
+    test_maxloc( 10007 );
+    test_minmaxloc( 10007 );
+    test_BAnd( 35 );
+    test_BOr( 35 );
+    test_BXor( 35 );
+    test_LAnd( 35 );
+    test_LOr( 35 );
+    test_LXor( 35 );
   }
 
   static void execute_basic() {
-    test_sum(10001);
-    test_prod(35);
+    test_sum( 10001 );
+    test_prod( 35 );
   }
 };
-}
-
-/*--------------------------------------------------------------------------*/
 
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp
index 1a9811a854f85e2b7ef918ff2d1e36b268ae6c28..547e03497601a0a7da8bc3d0027ee9fef603e196 100644
--- a/lib/kokkos/core/unit_test/TestScan.hpp
+++ b/lib/kokkos/core/unit_test/TestScan.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,82 +36,81 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-/*--------------------------------------------------------------------------*/
-
 #include <stdio.h>
 
 namespace Test {
 
-template< class Device , class WorkSpec = size_t >
+template< class Device, class WorkSpec = size_t >
 struct TestScan {
+  typedef  Device    execution_space;
+  typedef  long int  value_type;
 
-  typedef  Device    execution_space ;
-  typedef  long int  value_type ;
-
-  Kokkos::View<int,Device,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  Kokkos::View< int, Device, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const int iwork , value_type & update , const bool final_pass ) const
+  void operator()( const int iwork, value_type & update, const bool final_pass ) const
   {
-    const value_type n = iwork + 1 ;
-    const value_type imbalance = ( (1000 <= n) && (0 == n % 1000) ) ? 1000 : 0 ;
+    const value_type n = iwork + 1;
+    const value_type imbalance = ( ( 1000 <= n ) && ( 0 == n % 1000 ) ) ? 1000 : 0;
 
     // Insert an artificial load imbalance
 
-    for ( value_type i = 0 ; i < imbalance ; ++i ) { ++update ; }
+    for ( value_type i = 0; i < imbalance; ++i ) { ++update; }
 
-    update += n - imbalance ;
+    update += n - imbalance;
 
     if ( final_pass ) {
       const value_type answer = n & 1 ? ( n * ( ( n + 1 ) / 2 ) ) : ( ( n / 2 ) * ( n + 1 ) );
 
       if ( answer != update ) {
         errors()++;
-        if(errors()<20)
-          printf("TestScan(%d,%ld) != %ld\n",iwork,update,answer);
+
+        if ( errors() < 20 ) {
+          printf( "TestScan(%d,%ld) != %ld\n", iwork, update, answer );
+        }
       }
     }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void init( value_type & update ) const { update = 0 ; }
+  void init( value_type & update ) const { update = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile       value_type & update ,
+  void join( volatile       value_type & update,
              volatile const value_type & input ) const
-  { update += input ; }
+  { update += input; }
 
   TestScan( const WorkSpec & N )
-    {
-      Kokkos::View<int,Device > errors_a("Errors");
-      Kokkos::deep_copy(errors_a,0);
-      errors = errors_a;
-      parallel_scan( N , *this );
-    }
+  {
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
+
+    parallel_scan( N , *this );
+  }
 
   TestScan( const WorkSpec & Start , const WorkSpec & N )
-    {
-      typedef Kokkos::RangePolicy<execution_space> exec_policy ;
+  {
+    typedef Kokkos::RangePolicy< execution_space > exec_policy ;
 
-      Kokkos::View<int,Device > errors_a("Errors");
-      Kokkos::deep_copy(errors_a,0);
-      errors = errors_a;
+    Kokkos::View< int, Device > errors_a( "Errors" );
+    Kokkos::deep_copy( errors_a, 0 );
+    errors = errors_a;
 
-      parallel_scan( exec_policy( Start , N ) , *this );
-    }
+    parallel_scan( exec_policy( Start , N ) , *this );
+  }
 
-  static void test_range( const WorkSpec & begin , const WorkSpec & end )
-    {
-      for ( WorkSpec i = begin ; i < end ; ++i ) {
-        (void) TestScan( i );
-      }
+  static void test_range( const WorkSpec & begin, const WorkSpec & end )
+  {
+    for ( WorkSpec i = begin; i < end; ++i ) {
+      (void) TestScan( i );
     }
+  }
 };
 
-}
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
index 291f9f60e4b8050e11b653f3f3ae975f1d1e8c91..6eca6bb38db08d562672d39b32eb22663da9f5b2 100644
--- a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
+++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -54,162 +54,157 @@
 namespace Test {
 
 struct SharedAllocDestroy {
+  volatile int * count;
 
-  volatile int * count ;
-
-  SharedAllocDestroy() = default ;
+  SharedAllocDestroy() = default;
   SharedAllocDestroy( int * arg ) : count( arg ) {}
 
   void destroy_shared_allocation()
-    {
-      Kokkos::atomic_increment( count );
-    }
-
+  {
+    Kokkos::atomic_increment( count );
+  }
 };
 
-template< class MemorySpace , class ExecutionSpace >
+template< class MemorySpace, class ExecutionSpace >
 void test_shared_alloc()
 {
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  typedef const Kokkos::Impl::SharedAllocationHeader                               Header;
+  typedef Kokkos::Impl::SharedAllocationTracker                                    Tracker;
+  typedef Kokkos::Impl::SharedAllocationRecord< void, void >                       RecordBase;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, void >                RecordMemS;
+  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace, SharedAllocDestroy >  RecordFull;
 
-  typedef const Kokkos::Impl::SharedAllocationHeader   Header ;
-  typedef Kokkos::Impl::SharedAllocationTracker  Tracker ;
-  typedef Kokkos::Impl::SharedAllocationRecord< void , void >                       RecordBase ;
-  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace , void >                RecordMemS ;
-  typedef Kokkos::Impl::SharedAllocationRecord< MemorySpace , SharedAllocDestroy >  RecordFull ;
-
-  static_assert( sizeof(Tracker) == sizeof(int*), "SharedAllocationTracker has wrong size!" );
+  static_assert( sizeof( Tracker ) == sizeof( int* ), "SharedAllocationTracker has wrong size!" );
 
-  MemorySpace s ;
+  MemorySpace s;
 
-  const size_t N = 1200 ;
-  const size_t size = 8 ;
+  const size_t N = 1200;
+  const size_t size = 8;
 
   RecordMemS * rarray[ N ];
   Header     * harray[ N ];
 
-  RecordMemS ** const r = rarray ;
-  Header     ** const h = harray ;
+  RecordMemS ** const r = rarray;
+  Header     ** const h = harray;
+
+  Kokkos::RangePolicy< ExecutionSpace > range( 0, N );
 
-  Kokkos::RangePolicy< ExecutionSpace > range(0,N);
-  
-  //----------------------------------------
   {
-  // Since always executed on host space, leave [=]
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      char name[64] ;
-      sprintf(name,"test_%.2d",int(i));
+    // Always executed in host space, so leave the plain [=] capture.
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
 
-      r[i] = RecordMemS::allocate( s , name , size * ( i + 1 ) );
+      r[i] = RecordMemS::allocate( s, name, size * ( i + 1 ) );
       h[i] = Header::get_header( r[i]->data() );
 
-      ASSERT_EQ( r[i]->use_count() , 0 );
+      ASSERT_EQ( r[i]->use_count(), 0 );
 
-      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
 
-      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
-      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
     });
 
     // Sanity check for the whole set of allocation records to which this record belongs.
     RecordBase::is_sane( r[0] );
-    // RecordMemS::print_records( std::cout , s , true );
+    // RecordMemS::print_records( std::cout, s, true );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
         if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
       }
     });
   }
-  //----------------------------------------
+
   {
-    int destroy_count = 0 ;
-    SharedAllocDestroy counter( & destroy_count );
+    int destroy_count = 0;
+    SharedAllocDestroy counter( &destroy_count );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      char name[64] ;
-      sprintf(name,"test_%.2d",int(i));
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      char name[64];
+      sprintf( name, "test_%.2d", int( i ) );
 
-      RecordFull * rec = RecordFull::allocate( s , name , size * ( i + 1 ) );
+      RecordFull * rec = RecordFull::allocate( s, name, size * ( i + 1 ) );
 
-      rec->m_destroy = counter ;
+      rec->m_destroy = counter;
 
-      r[i] = rec ;
+      r[i] = rec;
       h[i] = Header::get_header( r[i]->data() );
 
-      ASSERT_EQ( r[i]->use_count() , 0 );
+      ASSERT_EQ( r[i]->use_count(), 0 );
 
-      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+      for ( size_t j = 0; j < ( i / 10 ) + 1; ++j ) RecordBase::increment( r[i] );
 
-      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
-      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+      ASSERT_EQ( r[i]->use_count(), ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i], RecordMemS::get_record( r[i]->data() ) );
     });
 
     RecordBase::is_sane( r[0] );
 
-    Kokkos::parallel_for( range , [=]( size_t i ){
-      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+    Kokkos::parallel_for( range, [=] ( size_t i ) {
+      while ( 0 != ( r[i] = static_cast< RecordMemS * >( RecordBase::decrement( r[i] ) ) ) ) {
         if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
       }
     });
 
-    ASSERT_EQ( destroy_count , int(N) );
+    ASSERT_EQ( destroy_count, int( N ) );
   }
 
-  //----------------------------------------
   {
-    int destroy_count = 0 ;
+    int destroy_count = 0;
 
     {
-      RecordFull * rec = RecordFull::allocate( s , "test" , size );
+      RecordFull * rec = RecordFull::allocate( s, "test", size );
 
-      // ... Construction of the allocated { rec->data() , rec->size() }
+      // ... Construction of the allocated memory { rec->data(), rec->size() }
 
-      // Copy destruction function object into the allocation record
+      // Copy destruction function object into the allocation record.
       rec->m_destroy = SharedAllocDestroy( & destroy_count );
 
-      ASSERT_EQ( rec->use_count() , 0 );
+      ASSERT_EQ( rec->use_count(), 0 );
 
-      // Start tracking, increments the use count from 0 to 1
-      Tracker track ;
+      // Start tracking; this increments the use count from 0 to 1.
+      Tracker track;
 
       track.assign_allocated_record_to_uninitialized( rec );
 
-      ASSERT_EQ( rec->use_count() , 1 );
-      ASSERT_EQ( track.use_count() , 1 );
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
+
+      // Verify that a scoped tracker raises the use count to 2 and restores it to 1 on destruction.
+      for ( size_t i = 0; i < N; ++i ) {
+        ASSERT_EQ( rec->use_count(), 1 );
 
-      // Verify construction / destruction increment
-      for ( size_t i = 0 ; i < N ; ++i ) {
-        ASSERT_EQ( rec->use_count() , 1 );
         {
-          Tracker local_tracker ;
+          Tracker local_tracker;
           local_tracker.assign_allocated_record_to_uninitialized( rec );
-          ASSERT_EQ( rec->use_count() , 2 );
-          ASSERT_EQ( local_tracker.use_count() , 2 );
+          ASSERT_EQ( rec->use_count(), 2 );
+          ASSERT_EQ( local_tracker.use_count(), 2 );
         }
-        ASSERT_EQ( rec->use_count() , 1 );
-        ASSERT_EQ( track.use_count() , 1 );
+
+        ASSERT_EQ( rec->use_count(), 1 );
+        ASSERT_EQ( track.use_count(), 1 );
       }
 
-      Kokkos::parallel_for( range , [=]( size_t i ){
-        Tracker local_tracker ;
+      Kokkos::parallel_for( range, [=] ( size_t i ) {
+        Tracker local_tracker;
         local_tracker.assign_allocated_record_to_uninitialized( rec );
-        ASSERT_GT( rec->use_count() , 1 );
+        ASSERT_GT( rec->use_count(), 1 );
       });
 
-      ASSERT_EQ( rec->use_count() , 1 );
-      ASSERT_EQ( track.use_count() , 1 );
+      ASSERT_EQ( rec->use_count(), 1 );
+      ASSERT_EQ( track.use_count(), 1 );
 
       // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object.
     }
 
-    ASSERT_EQ( destroy_count , 1 );
+    ASSERT_EQ( destroy_count, 1 );
   }
 
 #endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
 
 }
 
-
-}
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.cpp b/lib/kokkos/core/unit_test/TestSynchronic.cpp
deleted file mode 100644
index dc1abbd8b3d6a0532408956a5a7bffff1ec2f3f6..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestSynchronic.cpp
+++ /dev/null
@@ -1,449 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-//#undef _WIN32_WINNT
-//#define _WIN32_WINNT 0x0602
-
-#if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || \
-	defined(__APPLE__) || defined(__ARM_ARCH_8A) || defined(_CRAYC)
-
-// Skip for now
-
-#else
-
-#include <gtest/gtest.h>
-
-#ifdef USEOMP
-#include <omp.h>
-#endif
-
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-#include <string>
-#include <vector>
-#include <map>
-#include <cstring>
-#include <ctime>
-
-//#include <details/config>
-//#undef __SYNCHRONIC_COMPATIBLE
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <impl/Kokkos_Synchronic_n3998.hpp>
-
-#include "TestSynchronic.hpp"
-
-// Uncomment to allow test to dump output
-//#define VERBOSE_TEST
-
-namespace Test {
-
-unsigned next_table[] =
-    {
-        0, 1, 2, 3,         //0-3
-        4, 4, 6, 6,         //4-7
-        8, 8, 8, 8,         //8-11
-        12, 12, 12, 12,     //12-15
-        16, 16, 16, 16,     //16-19
-        16, 16, 16, 16,     //20-23
-        24, 24, 24, 24,     //24-27
-        24, 24, 24, 24,     //28-31
-        32, 32, 32, 32,     //32-35
-        32, 32, 32, 32,     //36-39
-        40, 40, 40, 40,     //40-43
-        40, 40, 40, 40,     //44-47
-        48, 48, 48, 48,     //48-51
-        48, 48, 48, 48,     //52-55
-        56, 56, 56, 56,     //56-59
-        56, 56, 56, 56,     //60-63
-    };
-
-//change this if you want to allow oversubscription of the system, by default only the range {1-(system size)} is tested
-#define FOR_GAUNTLET(x) for(unsigned x = (std::min)(std::thread::hardware_concurrency()*8,unsigned(sizeof(next_table)/sizeof(unsigned))); x; x = next_table[x-1])
-
-//set this to override the benchmark of barriers to use OMP barriers instead of n3998 std::barrier
-//#define USEOMP
-
-#if defined(__SYNCHRONIC_COMPATIBLE)
-    #define PREFIX "futex-"
-#else
-    #define PREFIX "backoff-"
-#endif
-
-//this test uses a custom Mersenne twister to eliminate implementation variation
-MersenneTwister mt;
-
-int dummya = 1, dummyb =1;
-
-int dummy1 = 1;
-std::atomic<int> dummy2(1);
-std::atomic<int> dummy3(1);
-
-double time_item(int const count = (int)1E8)  {
-
-    clock_t const start = clock();
-
-    for(int i = 0;i < count; ++i)
-        mt.integer();
-
-    clock_t const end = clock();
-    double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC);
-
-    return elapsed_seconds / count;
-}
-double time_nil(int const count = (int)1E08)  {
-
-    clock_t const start = clock();
-
-    dummy3 = count;
-    for(int i = 0;i < (int)1E6; ++i) {
-        if(dummy1) {
-            // Do some work while holding the lock
-            int workunits = dummy3;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f);
-            for (int j = 1; j < workunits; j++)
-                dummy1 &= j;       // Do one work unit
-            dummy2.fetch_add(dummy1,std::memory_order_relaxed);
-        }
-    }
-
-    clock_t const end = clock();
-    double elapsed_seconds = (end - start) / double(CLOCKS_PER_SEC);
-
-    return elapsed_seconds / count;
-}
-
-
-template <class mutex_type>
-void testmutex_inner(mutex_type& m, std::atomic<int>& t,std::atomic<int>& wc,std::atomic<int>& wnc, int const num_iterations,
-                     int const num_items_critical, int const num_items_noncritical, MersenneTwister& mtc, MersenneTwister& mtnc, bool skip) {
-
-    for(int k = 0; k < num_iterations; ++k) {
-
-        if(num_items_noncritical) {
-            // Do some work without holding the lock
-            int workunits = num_items_noncritical;//(int) (mtnc.poissonInterval((float)num_items_noncritical) + 0.5f);
-            for (int i = 1; i < workunits; i++)
-                mtnc.integer();       // Do one work unit
-            wnc.fetch_add(workunits,std::memory_order_relaxed);
-        }
-
-        t.fetch_add(1,std::memory_order_relaxed);
-
-        if(!skip) {
-            std::unique_lock<mutex_type> l(m);
-            if(num_items_critical) {
-                // Do some work while holding the lock
-                int workunits = num_items_critical;//(int) (mtc.poissonInterval((float)num_items_critical) + 0.5f);
-                for (int i = 1; i < workunits; i++)
-                    mtc.integer();       // Do one work unit
-                wc.fetch_add(workunits,std::memory_order_relaxed);
-            }
-        }
-    }
-}
-template <class mutex_type>
-void testmutex_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double critical_fraction, double critical_duration) {
-
-    std::ostringstream truename;
-    truename << name << " (f=" << critical_fraction << ",d=" << critical_duration << ")";
-
-    std::vector<double>& data = results[truename.str()];
-
-    double const workItemTime = time_item() ,
-                 nilTime = time_nil();
-
-    int const num_items_critical = (critical_duration <= 0 ? 0 : (std::max)( int(critical_duration / workItemTime + 0.5), int(100 * nilTime / workItemTime + 0.5))),
-              num_items_noncritical = (num_items_critical <= 0 ? 0 : int( ( 1 - critical_fraction ) * num_items_critical / critical_fraction + 0.5 ));
-
-    FOR_GAUNTLET(num_threads) {
-
-        //Kokkos::Impl::portable_sleep(std::chrono::microseconds(2000000));
-
-        int const num_iterations = (num_items_critical + num_items_noncritical != 0) ?
-#ifdef __SYNCHRONIC_JUST_YIELD
-                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
-#else
-                                        int( 1 / ( 8 * workItemTime ) / (num_items_critical + num_items_noncritical) / num_threads + 0.5 ) :
-#endif
-#ifdef WIN32
-                                        int( 1 / workItemTime / (20 * num_threads * num_threads) );
-#else
-                                        int( 1 / workItemTime / (200 * num_threads * num_threads) );
-#endif
-
-#ifdef VERBOSE_TEST
-        std::cerr << "running " << truename.str() << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\n" << std::flush;
-#endif
-
-
-        std::atomic<int> t[2], wc[2], wnc[2];
-
-        clock_t start[2], end[2];
-        for(int pass = 0; pass < 2; ++pass) {
-
-            t[pass] = 0;
-            wc[pass] = 0;
-            wnc[pass] = 0;
-
-            srand(num_threads);
-            std::vector<MersenneTwister> randomsnc(num_threads),
-                                         randomsc(num_threads);
-
-            mutex_type m;
-
-            start[pass] = clock();
-#ifdef USEOMP
-            omp_set_num_threads(num_threads);
-            std::atomic<int> _j(0);
-            #pragma omp parallel
-            {
-                int const j = _j.fetch_add(1,std::memory_order_relaxed);
-                testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
-                num_threads = omp_get_num_threads();
-            }
-#else
-            std::vector<std::thread*> threads(num_threads);
-            for(unsigned j = 0; j < num_threads; ++j)
-                threads[j] = new std::thread([&,j](){
-                        testmutex_inner(m, t[pass], wc[pass], wnc[pass], num_iterations, num_items_critical, num_items_noncritical, randomsc[j], randomsnc[j], pass==0);
-                    }
-                );
-            for(unsigned j = 0; j < num_threads; ++j) {
-                threads[j]->join();
-                delete threads[j];
-            }
-#endif
-            end[pass] = clock();
-        }
-        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
-        if(wnc[0] != wnc[1]) throw std::string("mismatched work item counts");
-
-        double elapsed_seconds_0 = (end[0] - start[0]) / double(CLOCKS_PER_SEC),
-               elapsed_seconds_1 = (end[1] - start[1]) / double(CLOCKS_PER_SEC);
-        double time = (elapsed_seconds_1 - elapsed_seconds_0 - wc[1]*workItemTime) / num_iterations;
-
-        data.push_back(time);
-#ifdef VERBOSE_TEST
-        std::cerr << truename.str() << " : " << num_threads << "," << elapsed_seconds_1 / num_iterations << " - " << elapsed_seconds_0 / num_iterations << " - " << wc[1]*workItemTime/num_iterations << " = " << time << "                                                 \n";
-#endif
-    }
-}
-
-template <class barrier_type>
-void testbarrier_inner(barrier_type& b, int const num_threads, int const j, std::atomic<int>& t,std::atomic<int>& w,
-                       int const num_iterations_odd, int const num_iterations_even,
-                       int const num_items_noncritical, MersenneTwister& arg_mt, bool skip) {
-
-    for(int k = 0; k < (std::max)(num_iterations_even,num_iterations_odd); ++k) {
-
-        if(k >= (~j & 0x1 ? num_iterations_odd : num_iterations_even )) {
-            if(!skip)
-                b.arrive_and_drop();
-            break;
-        }
-
-        if(num_items_noncritical) {
-            // Do some work without holding the lock
-            int workunits = (int) (arg_mt.poissonInterval((float)num_items_noncritical) + 0.5f);
-            for (int i = 1; i < workunits; i++)
-                arg_mt.integer();       // Do one work unit
-            w.fetch_add(workunits,std::memory_order_relaxed);
-        }
-
-        t.fetch_add(1,std::memory_order_relaxed);
-
-        if(!skip) {
-            int const thiscount = (std::min)(k+1,num_iterations_odd)*((num_threads>>1)+(num_threads&1)) + (std::min)(k+1,num_iterations_even)*(num_threads>>1);
-            if(t.load(std::memory_order_relaxed) > thiscount) {
-                std::cerr << "FAILURE: some threads have run ahead of the barrier (" << t.load(std::memory_order_relaxed) << ">" <<  thiscount << ").\n";
-                EXPECT_TRUE(false);
-            }
-#ifdef USEOMP
-            #pragma omp barrier
-#else
-            b.arrive_and_wait();
-#endif
-            if(t.load(std::memory_order_relaxed) < thiscount) {
-                std::cerr << "FAILURE: some threads have fallen behind the barrier (" << t.load(std::memory_order_relaxed) << "<" << thiscount << ").\n";
-                EXPECT_TRUE(false);
-            }
-        }
-    }
-}
-template <class barrier_type>
-void testbarrier_outer(std::map<std::string,std::vector<double>>& results, std::string const& name, double barrier_frequency, double phase_duration, bool randomIterations = false) {
-
-    std::vector<double>& data = results[name];
-
-    double const workItemTime = time_item();
-    int const num_items_noncritical = int( phase_duration / workItemTime + 0.5 );
-
-    FOR_GAUNTLET(num_threads) {
-
-        int const num_iterations = int( barrier_frequency );
-#ifdef VERBOSE_TEST
-        std::cerr << "running " << name << " #" << num_threads << ", " << num_iterations << " * " << num_items_noncritical << "\r" << std::flush;
-#endif
-
-        srand(num_threads);
-
-        MersenneTwister local_mt;
-        int const num_iterations_odd = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations,
-                  num_iterations_even = randomIterations ? int(local_mt.poissonInterval((float)num_iterations)+0.5f) : num_iterations;
-
-        std::atomic<int> t[2], w[2];
-        std::chrono::time_point<std::chrono::high_resolution_clock> start[2], end[2];
-        for(int pass = 0; pass < 2; ++pass) {
-
-            t[pass] = 0;
-            w[pass] = 0;
-
-            srand(num_threads);
-            std::vector<MersenneTwister> randoms(num_threads);
-
-            barrier_type b(num_threads);
-
-            start[pass] = std::chrono::high_resolution_clock::now();
-#ifdef USEOMP
-            omp_set_num_threads(num_threads);
-            std::atomic<int> _j(0);
-            #pragma omp parallel
-            {
-                int const j = _j.fetch_add(1,std::memory_order_relaxed);
-                testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
-                num_threads = omp_get_num_threads();
-            }
-#else
-            std::vector<std::thread*> threads(num_threads);
-            for(unsigned j = 0; j < num_threads; ++j)
-                threads[j] = new std::thread([&,j](){
-                    testbarrier_inner(b, num_threads, j, t[pass], w[pass], num_iterations_odd, num_iterations_even, num_items_noncritical, randoms[j], pass==0);
-                });
-            for(unsigned j = 0; j < num_threads; ++j) {
-                threads[j]->join();
-                delete threads[j];
-            }
-#endif
-            end[pass] = std::chrono::high_resolution_clock::now();
-        }
-
-        if(t[0] != t[1]) throw std::string("mismatched iteration counts");
-        if(w[0] != w[1]) throw std::string("mismatched work item counts");
-
-        int const phases = (std::max)(num_iterations_odd, num_iterations_even);
-
-        std::chrono::duration<double> elapsed_seconds_0 = end[0]-start[0],
-                                      elapsed_seconds_1 = end[1]-start[1];
-        double const time = (elapsed_seconds_1.count() - elapsed_seconds_0.count()) / phases;
-
-        data.push_back(time);
-#ifdef VERBOSE_TEST
-        std::cerr << name << " : " << num_threads << "," << elapsed_seconds_1.count() / phases << " - " << elapsed_seconds_0.count() / phases << " = " << time << "                                                 \n";
-#endif
-    }
-}
-
-template <class... T>
-struct mutex_tester;
-template <class F>
-struct mutex_tester<F> {
-    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
-        testmutex_outer<F>(results, *name, critical_fraction, critical_duration);
-    }
-};
-template <class F, class... T>
-struct mutex_tester<F,T...> {
-    static void run(std::map<std::string,std::vector<double>>& results, std::string const name[], double critical_fraction, double critical_duration) {
-        mutex_tester<F>::run(results, name, critical_fraction, critical_duration);
-        mutex_tester<T...>::run(results, ++name, critical_fraction, critical_duration);
-    }
-};
-
-TEST( synchronic, main )
-{
-    //warm up
-    time_item();
-
-    //measure up
-#ifdef VERBOSE_TEST
-    std::cerr << "measuring work item speed...\r";
-    std::cerr << "work item speed is " << time_item() << " per item, nil is " << time_nil() << "\n";
-#endif
-    try {
-
-      std::pair<double,double> testpoints[] = { {1, 0}, /*{1E-1, 10E-3}, {5E-1, 2E-6},  {3E-1, 50E-9},*/ };
-        for(auto x : testpoints ) {
-
-            std::map<std::string,std::vector<double>> results;
-
-            //testbarrier_outer<std::barrier>(results, PREFIX"bar 1khz 100us", 1E3, x.second);
-
-            std::string const names[] = {
-                PREFIX"tkt", PREFIX"mcs", PREFIX"ttas", PREFIX"std"
-#ifdef WIN32
-                ,PREFIX"srw"
-#endif
-            };
-
-            //run -->
-
-            mutex_tester<
-                ticket_mutex, mcs_mutex, ttas_mutex, std::mutex
-#ifdef WIN32
-                ,srw_mutex
-#endif
-            >::run(results, names, x.first, x.second);
-
-            //<-- run
-
-#ifdef VERBOSE_TEST
-            std::cout << "threads";
-            for(auto & i : results)
-                std::cout << ",\"" << i.first << '\"';
-            std::cout << std::endl;
-            int j = 0;
-            FOR_GAUNTLET(num_threads) {
-                std::cout << num_threads;
-                for(auto & i : results)
-                    std::cout << ',' << i.second[j];
-                std::cout << std::endl;
-                ++j;
-            }
-#endif
-        }
-    }
-    catch(std::string & e) {
-        std::cerr << "EXCEPTION : " << e << std::endl;
-        EXPECT_TRUE( false );
-    }
-}
-
-} // namespace Test
-
-#endif
diff --git a/lib/kokkos/core/unit_test/TestSynchronic.hpp b/lib/kokkos/core/unit_test/TestSynchronic.hpp
deleted file mode 100644
index f4341b97815b8d70956dfb85cf0d41a4f07bab4d..0000000000000000000000000000000000000000
--- a/lib/kokkos/core/unit_test/TestSynchronic.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
-
-Copyright (c) 2014, NVIDIA Corporation
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
-INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
-OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
-OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#ifndef TEST_SYNCHRONIC_HPP
-#define TEST_SYNCHRONIC_HPP
-
-#include <impl/Kokkos_Synchronic.hpp>
-#include <mutex>
-#include <cmath>
-
-namespace Test {
-
-template <bool truly>
-struct dumb_mutex {
-
-    dumb_mutex () : locked(0) {
-    }
-
-    void lock() {
-        while(1) {
-            bool state = false;
-            if (locked.compare_exchange_weak(state,true,std::memory_order_acquire)) {
-                break;
-            }
-            while (locked.load(std::memory_order_relaxed)) {
-              if (!truly) {
-                Kokkos::Impl::portable_yield();
-              }
-            }
-        }
-    }
-
-    void unlock() {
-        locked.store(false,std::memory_order_release);
-    }
-
-private :
-    std::atomic<bool> locked;
-};
-
-#ifdef WIN32
-#include <winsock2.h>
-#include <windows.h>
-#include <synchapi.h>
-struct srw_mutex {
-
-    srw_mutex () {
-        InitializeSRWLock(&_lock);
-    }
-
-    void lock() {
-        AcquireSRWLockExclusive(&_lock);
-    }
-    void unlock() {
-        ReleaseSRWLockExclusive(&_lock);
-    }
-
-private :
-    SRWLOCK _lock;
-};
-#endif
-
-struct ttas_mutex {
-
-    ttas_mutex() : locked(false) {
-    }
-
-	ttas_mutex(const ttas_mutex&) = delete;
-	ttas_mutex& operator=(const ttas_mutex&) = delete;
-
-    void lock() {
-        for(int i = 0;; ++i) {
-            bool state = false;
-            if(locked.compare_exchange_weak(state,true,std::memory_order_relaxed,Kokkos::Impl::notify_none))
-                break;
-            locked.expect_update(true);
-        }
-        std::atomic_thread_fence(std::memory_order_acquire);
-    }
-    void unlock() {
-        locked.store(false,std::memory_order_release);
-    }
-
-private :
-    Kokkos::Impl::synchronic<bool> locked;
-};
-
-struct ticket_mutex {
-
-    ticket_mutex() : active(0), queue(0) {
-    }
-
-	ticket_mutex(const ticket_mutex&) = delete;
-	ticket_mutex& operator=(const ticket_mutex&) = delete;
-
-    void lock() {
-        int const me = queue.fetch_add(1, std::memory_order_relaxed);
-        while(me != active.load_when_equal(me, std::memory_order_acquire))
-            ;
-    }
-
-    void unlock() {
-        active.fetch_add(1,std::memory_order_release);
-    }
-private :
-    Kokkos::Impl::synchronic<int> active;
-    std::atomic<int> queue;
-};
-
-struct mcs_mutex {
-
-    mcs_mutex() : head(nullptr) {
-    }
-
-	mcs_mutex(const mcs_mutex&) = delete;
-	mcs_mutex& operator=(const mcs_mutex&) = delete;
-
-    struct unique_lock {
-
-        unique_lock(mcs_mutex & arg_m) : m(arg_m), next(nullptr), ready(false) {
-
-            unique_lock * const h = m.head.exchange(this,std::memory_order_acquire);
-            if(__builtin_expect(h != nullptr,0)) {
-                h->next.store(this,std::memory_order_seq_cst,Kokkos::Impl::notify_one);
-                while(!ready.load_when_not_equal(false,std::memory_order_acquire))
-                    ;
-            }
-        }
-
-	    unique_lock(const unique_lock&) = delete;
-	    unique_lock& operator=(const unique_lock&) = delete;
-
-        ~unique_lock() {
-            unique_lock * h = this;
-            if(__builtin_expect(!m.head.compare_exchange_strong(h,nullptr,std::memory_order_release, std::memory_order_relaxed),0)) {
-                unique_lock * n = next.load(std::memory_order_relaxed);
-                while(!n)
-                    n = next.load_when_not_equal(n,std::memory_order_relaxed);
-                n->ready.store(true,std::memory_order_release,Kokkos::Impl::notify_one);
-            }
-        }
-
-    private:
-        mcs_mutex & m;
-        Kokkos::Impl::synchronic<unique_lock*> next;
-        Kokkos::Impl::synchronic<bool> ready;
-    };
-
-private :
-    std::atomic<unique_lock*> head;
-};
-
-}
-
-namespace std {
-template<>
-struct unique_lock<Test::mcs_mutex> : Test::mcs_mutex::unique_lock {
-  unique_lock(Test::mcs_mutex & arg_m) : Test::mcs_mutex::unique_lock(arg_m) {
-  }
-  unique_lock(const unique_lock&) = delete;
-  unique_lock& operator=(const unique_lock&) = delete;
-};
-
-}
-
-/* #include <cmath> */
-#include <stdlib.h>
-
-namespace Test {
-
-//-------------------------------------
-//  MersenneTwister
-//-------------------------------------
-#define MT_IA  397
-#define MT_LEN 624
-
-class MersenneTwister
-{
-    volatile unsigned long m_buffer[MT_LEN][64/sizeof(unsigned long)];
-    volatile int m_index;
-
-public:
-    MersenneTwister() {
-        for (int i = 0; i < MT_LEN; i++)
-            m_buffer[i][0] = rand();
-        m_index = 0;
-        for (int i = 0; i < MT_LEN * 100; i++)
-            integer();
-    }
-    unsigned long integer() {
-        // Indices
-        int i = m_index;
-        int i2 = m_index + 1; if (i2 >= MT_LEN) i2 = 0; // wrap-around
-        int j = m_index + MT_IA; if (j >= MT_LEN) j -= MT_LEN; // wrap-around
-
-        // Twist
-        unsigned long s = (m_buffer[i][0] & 0x80000000) | (m_buffer[i2][0] & 0x7fffffff);
-        unsigned long r = m_buffer[j][0] ^ (s >> 1) ^ ((s & 1) * 0x9908B0DF);
-        m_buffer[m_index][0] = r;
-        m_index = i2;
-
-        // Swizzle
-        r ^= (r >> 11);
-        r ^= (r << 7) & 0x9d2c5680UL;
-        r ^= (r << 15) & 0xefc60000UL;
-        r ^= (r >> 18);
-        return r;
-    }
-    float poissonInterval(float ooLambda) {
-        return -logf(1.0f - integer() * 2.3283e-10f) * ooLambda;
-    }
-};
-
-} // namespace Test
-
-#endif //TEST_HPP
diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
index 1134553980f8a63351f85a86b33537a35d52644c..57e47d4baa0d177dca9379cf43a05742af2519d1 100644
--- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
+++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,12 +36,11 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
-
 #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
 #define KOKKOS_UNITTEST_TASKSCHEDULER_HPP
 
@@ -51,9 +50,6 @@
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
-
 namespace TestTaskScheduler {
 
 namespace {
@@ -61,14 +57,14 @@ namespace {
 inline
 long eval_fib( long n )
 {
-  constexpr long mask = 0x03 ;
+  constexpr long mask = 0x03;
 
-  long fib[4] = { 0 , 1 , 1 , 2 };
+  long fib[4] = { 0, 1, 1, 2 };
 
-  for ( long i = 2 ; i <= n ; ++i ) {
+  for ( long i = 2; i <= n; ++i ) {
     fib[ i & mask ] = fib[ ( i - 1 ) & mask ] + fib[ ( i - 2 ) & mask ];
   }
-  
+
   return fib[ n & mask ];
 }
 
@@ -77,100 +73,93 @@ long eval_fib( long n )
 template< typename Space >
 struct TestFib
 {
-  typedef Kokkos::TaskScheduler<Space>  policy_type ;
-  typedef Kokkos::Future<long,Space> future_type ;
-  typedef long value_type ;
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< long, Space >   future_type;
+  typedef long                            value_type;
 
-  policy_type policy ;
-  future_type fib_m1 ;
-  future_type fib_m2 ;
-  const value_type n ;
+  sched_type  sched;
+  future_type fib_m1;
+  future_type fib_m2;
+  const value_type n;
 
   KOKKOS_INLINE_FUNCTION
-  TestFib( const policy_type & arg_policy , const value_type arg_n )
-    : policy(arg_policy)
-    , fib_m1() , fib_m2()
-    , n( arg_n )
-    {}
+  TestFib( const sched_type & arg_sched, const value_type arg_n )
+    : sched( arg_sched ), fib_m1(), fib_m2(), n( arg_n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & , value_type & result )
-    {
+  void operator()( typename sched_type::member_type &, value_type & result )
+  {
 #if 0
-      printf( "\nTestFib(%ld) %d %d\n"
-             , n
-             , int( ! fib_m1.is_null() )
-             , int( ! fib_m2.is_null() )
-             );
+    printf( "\nTestFib(%ld) %d %d\n", n, int( !fib_m1.is_null() ), int( !fib_m2.is_null() ) );
 #endif
 
-      if ( n < 2 ) {
-        result = n ;
-      }
-      else if ( ! fib_m2.is_null() && ! fib_m1.is_null() ) {
-        result = fib_m1.get() + fib_m2.get();
-      }
-      else {
-
-        // Spawn new children and respawn myself to sum their results:
-        // Spawn lower value at higher priority as it has a shorter
-        // path to completion.
-
-        fib_m2 = policy.task_spawn( TestFib(policy,n-2)
-                                  , Kokkos::TaskSingle
-                                  , Kokkos::TaskHighPriority );
+    if ( n < 2 ) {
+      result = n;
+    }
+    else if ( !fib_m2.is_null() && !fib_m1.is_null() ) {
+      result = fib_m1.get() + fib_m2.get();
+    }
+    else {
+      // Spawn new children and respawn myself to sum their results.
+      // Spawn lower value at higher priority as it has a shorter
+      // path to completion.
 
-        fib_m1 = policy.task_spawn( TestFib(policy,n-1)
-                                  , Kokkos::TaskSingle );
+      fib_m2 = Kokkos::task_spawn( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
+                                 , TestFib( sched, n - 2 ) );
 
-        Kokkos::Future<Space> dep[] = { fib_m1 , fib_m2 };
+      fib_m1 = Kokkos::task_spawn( Kokkos::TaskSingle( sched )
+                                 , TestFib( sched, n - 1 ) );
 
-        Kokkos::Future<Space> fib_all = policy.when_all( 2 , dep );
+      Kokkos::Future< Space > dep[] = { fib_m1, fib_m2 };
+      Kokkos::Future< Space > fib_all = Kokkos::when_all( dep, 2 );
 
-        if ( ! fib_m2.is_null() && ! fib_m1.is_null() && ! fib_all.is_null() ) {
-          // High priority to retire this branch
-          policy.respawn( this , Kokkos::TaskHighPriority , fib_all );
-        }
-        else {
+      if ( !fib_m2.is_null() && !fib_m1.is_null() && !fib_all.is_null() ) {
+        // High priority to retire this branch.
+        Kokkos::respawn( this, fib_all, Kokkos::TaskPriority::High );
+      }
+      else {
 #if 1
-      printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
-             , n
-             , policy.allocation_capacity()
-             , policy.allocated_task_count_max()
-             , policy.allocated_task_count_accum()
-             );
+        printf( "TestFib(%ld) insufficient memory alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+               , n
+               , sched.allocation_capacity()
+               , sched.allocated_task_count_max()
+               , sched.allocated_task_count_accum()
+               );
 #endif
-          Kokkos::abort("TestFib insufficient memory");
 
-        }
+        Kokkos::abort( "TestFib insufficient memory" );
+
       }
     }
+  }
 
-  static void run( int i , size_t MemoryCapacity = 16000 )
-    {
-      typedef typename policy_type::memory_space memory_space ;
+  static void run( int i, size_t MemoryCapacity = 16000 )
+  {
+    typedef typename sched_type::memory_space memory_space;
 
-      enum { Log2_SuperBlockSize = 12 };
+    enum { Log2_SuperBlockSize = 12 };
 
-      policy_type root_policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+    sched_type root_sched( memory_space(), MemoryCapacity, Log2_SuperBlockSize );
 
-      future_type f = root_policy.host_spawn( TestFib(root_policy,i) , Kokkos::TaskSingle );
-      Kokkos::wait( root_policy );
-      ASSERT_EQ( eval_fib(i) , f.get() );
+    future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched )
+                                      , TestFib( root_sched, i ) );
+
+    Kokkos::wait( root_sched );
+
+    ASSERT_EQ( eval_fib( i ), f.get() );
 
 #if 0
-      fprintf( stdout , "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
-             , i
-             , int(root_policy.template spawn_allocation_size<TestFib>())
-             , int(root_policy.when_all_allocation_size(2))
-             , root_policy.allocation_capacity()
-             , root_policy.allocated_task_count_max()
-             , root_policy.allocated_task_count_accum()
-             );
-      fflush( stdout );
+    fprintf( stdout, "\nTestFib::run(%d) spawn_size(%d) when_all_size(%d) alloc_capacity(%d) task_max(%d) task_accum(%ld)\n"
+           , i
+           , int(root_sched.template spawn_allocation_size<TestFib>())
+           , int(root_sched.when_all_allocation_size(2))
+           , root_sched.allocation_capacity()
+           , root_sched.allocated_task_count_max()
+           , root_sched.allocated_task_count_accum()
+           );
+    fflush( stdout );
 #endif
-    }
-
+  }
 };
 
 } // namespace TestTaskScheduler
@@ -181,73 +170,71 @@ namespace TestTaskScheduler {
 
 template< class Space >
 struct TestTaskDependence {
+  typedef Kokkos::TaskScheduler< Space >  sched_type;
+  typedef Kokkos::Future< Space >         future_type;
+  typedef Kokkos::View< long, Space >     accum_type;
+  typedef void                            value_type;
 
-  typedef Kokkos::TaskScheduler<Space>  policy_type ;
-  typedef Kokkos::Future<Space>      future_type ;
-  typedef Kokkos::View<long,Space>   accum_type ;
-  typedef void value_type ;
-
-  policy_type  m_policy ;
-  accum_type   m_accum ;
-  long         m_count ;
+  sched_type  m_sched;
+  accum_type  m_accum;
+  long        m_count;
 
   KOKKOS_INLINE_FUNCTION
   TestTaskDependence( long n
-                    , const policy_type & arg_policy
-                    , const accum_type  & arg_accum )
-    : m_policy( arg_policy )
+                    , const sched_type & arg_sched
+                    , const accum_type & arg_accum )
+    : m_sched( arg_sched )
     , m_accum( arg_accum )
-    , m_count( n )
-    {}
+    , m_count( n ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & )
-    {
-       enum { CHUNK = 8 };
-       const int n = CHUNK < m_count ? CHUNK : m_count ;
+  void operator()( typename sched_type::member_type & )
+  {
+    enum { CHUNK = 8 };
+    const int n = CHUNK < m_count ? CHUNK : m_count;
 
-       if ( 1 < m_count ) {
-         future_type f[ CHUNK ] ;
+    if ( 1 < m_count ) {
+      future_type f[ CHUNK ];
 
-         const int inc = ( m_count + n - 1 ) / n ;
+      const int inc = ( m_count + n - 1 ) / n;
 
-         for ( int i = 0 ; i < n ; ++i ) {
-           long begin = i * inc ;
-           long count = begin + inc < m_count ? inc : m_count - begin ;
-           f[i] = m_policy.task_spawn( TestTaskDependence(count,m_policy,m_accum) , Kokkos::TaskSingle );
-         }
+      for ( int i = 0; i < n; ++i ) {
+        long begin = i * inc;
+        long count = begin + inc < m_count ? inc : m_count - begin;
+        f[i] = Kokkos::task_spawn( Kokkos::TaskSingle( m_sched )
+                                 , TestTaskDependence( count, m_sched, m_accum ) );
+      }
 
-         m_count = 0 ;
+      m_count = 0;
 
-         m_policy.respawn( this , m_policy.when_all( n , f ) );
-       }
-       else if ( 1 == m_count ) {
-         Kokkos::atomic_increment( & m_accum() );
-       }
+      Kokkos::respawn( this, Kokkos::when_all( f, n ) );
+    }
+    else if ( 1 == m_count ) {
+      Kokkos::atomic_increment( & m_accum() );
     }
+  }
 
   static void run( int n )
-    {
-      typedef typename policy_type::memory_space memory_space ;
+  {
+    typedef typename sched_type::memory_space memory_space;
 
-      // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool
-      enum { MemoryCapacity = 16000 };
-      enum { Log2_SuperBlockSize = 12 };
-      policy_type policy( memory_space() , MemoryCapacity , Log2_SuperBlockSize );
+    // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
+    enum { MemoryCapacity = 16000 };
+    enum { Log2_SuperBlockSize = 12 };
+    sched_type sched( memory_space(), MemoryCapacity, Log2_SuperBlockSize );
 
-      accum_type accum("accum");
+    accum_type accum( "accum" );
 
-      typename accum_type::HostMirror host_accum =
-        Kokkos::create_mirror_view( accum );
+    typename accum_type::HostMirror host_accum = Kokkos::create_mirror_view( accum );
 
-      policy.host_spawn( TestTaskDependence(n,policy,accum) , Kokkos::TaskSingle );
+    Kokkos::host_spawn( Kokkos::TaskSingle( sched ), TestTaskDependence( n, sched, accum ) );
 
-      Kokkos::wait( policy );
+    Kokkos::wait( sched );
 
-      Kokkos::deep_copy( host_accum , accum );
+    Kokkos::deep_copy( host_accum, accum );
 
-      ASSERT_EQ( host_accum() , n );
-    }
+    ASSERT_EQ( host_accum(), n );
+  }
 };
 
 } // namespace TestTaskScheduler
@@ -258,294 +245,317 @@ namespace TestTaskScheduler {
 
 template< class ExecSpace >
 struct TestTaskTeam {
-
   //enum { SPAN = 8 };
   enum { SPAN = 33 };
   //enum { SPAN = 1 };
 
-  typedef void value_type ;
-  typedef Kokkos::TaskScheduler<ExecSpace>  policy_type ;
-  typedef Kokkos::Future<ExecSpace>      future_type ;
-  typedef Kokkos::View<long*,ExecSpace>  view_type ;
+  typedef void                                value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >  sched_type;
+  typedef Kokkos::Future< ExecSpace >         future_type;
+  typedef Kokkos::View< long*, ExecSpace >    view_type;
 
-  policy_type  policy ;
-  future_type  future ;
+  sched_type   sched;
+  future_type  future;
 
-  view_type  parfor_result ;
-  view_type  parreduce_check ;
-  view_type  parscan_result ;
-  view_type  parscan_check ;
-  const long nvalue ;
+  view_type   parfor_result;
+  view_type   parreduce_check;
+  view_type   parscan_result;
+  view_type   parscan_check;
+  const long  nvalue;
 
   KOKKOS_INLINE_FUNCTION
-  TestTaskTeam( const policy_type & arg_policy
-              , const view_type   & arg_parfor_result
-              , const view_type   & arg_parreduce_check
-              , const view_type   & arg_parscan_result
-              , const view_type   & arg_parscan_check
-              , const long          arg_nvalue )
-    : policy(arg_policy)
+  TestTaskTeam( const sched_type & arg_sched
+              , const view_type  & arg_parfor_result
+              , const view_type  & arg_parreduce_check
+              , const view_type  & arg_parscan_result
+              , const view_type  & arg_parscan_check
+              , const long         arg_nvalue )
+    : sched( arg_sched )
     , future()
     , parfor_result( arg_parfor_result )
     , parreduce_check( arg_parreduce_check )
     , parscan_result( arg_parscan_result )
     , parscan_check( arg_parscan_check )
-    , nvalue( arg_nvalue )
-    {}
+    , nvalue( arg_nvalue ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type & member )
-    {
-      const long end   = nvalue + 1 ;
-      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
-
-      if ( 0 < begin && future.is_null() ) {
-        if ( member.team_rank() == 0 ) {
-          future = policy.task_spawn
-            ( TestTaskTeam( policy ,
-                            parfor_result ,
-                            parreduce_check,
-                            parscan_result,
-                            parscan_check,
-                            begin - 1 )
-            , Kokkos::TaskTeam );
-
-          assert( ! future.is_null() );
-
-          policy.respawn( this , future );
-        }
-        return ;
-      }
+  void operator()( typename sched_type::member_type & member )
+  {
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
 
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parfor_result[i] = i ; }
-                          );
-
-      // test parallel_reduce without join
-    
-      long tot = 0;
-      long expected = (begin+end-1)*(end-begin)*0.5;
-      
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &res) { res += parfor_result[i]; }
-                          , tot);
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parreduce_check[i] = expected-tot ; }
-                          );
-
-      // test parallel_reduce with join
-
-      tot = 0;
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &res) { res += parfor_result[i]; }
-                          , [&]( long& val1, const long& val2) { val1 += val2; }
-                          , tot);
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { parreduce_check[i] += expected-tot ; }
-                          );
-
-      // test parallel_scan
-
-      // Exclusive scan
-      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &val , const bool final ) {
-                              if ( final ) { parscan_result[i] = val; }
-                              val += i;
-                            }
-                          );
+    if ( 0 < begin && future.is_null() ) {
       if ( member.team_rank() == 0 ) {
-        for ( long i = begin ; i < end ; ++i ) {
-          parscan_check[i] = (i*(i-1)-begin*(begin-1))*0.5-parscan_result[i];
-        }
+        future = Kokkos::task_spawn( Kokkos::TaskTeam( sched )
+                                   , TestTaskTeam( sched
+                                                 , parfor_result
+                                                 , parreduce_check
+                                                 , parscan_result
+                                                 , parscan_check
+                                                 , begin - 1 )
+                                   );
+
+        assert( !future.is_null() );
+
+        Kokkos::respawn( this, future );
       }
 
-      // Inclusive scan
-      Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i, long &val , const bool final ) {
-                              val += i;
-                              if ( final ) { parscan_result[i] = val; }
-                            }
-                          );
-      if ( member.team_rank() == 0 ) {
-        for ( long i = begin ; i < end ; ++i ) {
-          parscan_check[i] += (i*(i+1)-begin*(begin-1))*0.5-parscan_result[i];
-        }
+      return;
+    }
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parfor_result[i] = i; }
+                        );
+
+    // Test parallel_reduce without join.
+
+    long tot = 0;
+    long expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+                           , tot
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] = expected - tot; }
+                        );
+
+    // Test parallel_reduce with join.
+
+    tot = 0;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, begin, end )
+                           , [&] ( int i, long & res ) { res += parfor_result[i]; }
+#if 0
+                           , Kokkos::Sum( tot )
+#else
+                           , [] ( long & dst, const long & src ) { dst += src; }
+                           , tot
+#endif
+                           );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { parreduce_check[i] += expected - tot; }
+                        );
+
+    // Test parallel_scan.
+
+    // Exclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
+    {
+      if ( final ) { parscan_result[i] = val; }
+
+      val += i;
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] = ( i * ( i - 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
       }
-      // ThreadVectorRange check
-      /*
-      long result = 0;
-      expected = (begin+end-1)*(end-begin)*0.5;
-      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , 0 , 1 )
-                             , [&] ( const int i , long & outerUpdate ) {
-                                 long sum_j = 0.0;
-                                 Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , end - begin )
-                                                        , [&] ( const int j , long &innerUpdate ) {
-                                                            innerUpdate += begin+j;
-                                                          } , sum_j );
-                                 outerUpdate += sum_j ;
-                               } , result );
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) {
-                              parreduce_check[i] += result-expected ;
-                            }
-                          );
-      */
     }
 
-  static void run( long n )
+    // Don't overwrite 'parscan_result' until it has been tested.
+    member.team_barrier();
+
+    // Inclusive scan.
+    Kokkos::parallel_scan<long>( Kokkos::TeamThreadRange( member, begin, end )
+                               , [&] ( int i, long & val, const bool final )
     {
-      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
-      // const unsigned memory_capacity = 100000 ; // fails with SPAN=1 for serial and OMP
-      const unsigned memory_capacity = 400000 ;
-
-      policy_type root_policy( typename policy_type::memory_space()
-                        , memory_capacity );
-
-      view_type   root_parfor_result("parfor_result",n+1);
-      view_type   root_parreduce_check("parreduce_check",n+1);
-      view_type   root_parscan_result("parscan_result",n+1);
-      view_type   root_parscan_check("parscan_check",n+1);
-
-      typename view_type::HostMirror
-        host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
-      typename view_type::HostMirror
-        host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
-      typename view_type::HostMirror
-        host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
-      typename view_type::HostMirror
-        host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
-
-      future_type f = root_policy.host_spawn(
-                        TestTaskTeam( root_policy ,
-                                      root_parfor_result ,
-                                      root_parreduce_check ,
-                                      root_parscan_result,
-                                      root_parscan_check,
-                                      n ) ,
-                        Kokkos::TaskTeam );
-
-      Kokkos::wait( root_policy );
-
-      Kokkos::deep_copy( host_parfor_result , root_parfor_result );
-      Kokkos::deep_copy( host_parreduce_check , root_parreduce_check );
-      Kokkos::deep_copy( host_parscan_result , root_parscan_result );
-      Kokkos::deep_copy( host_parscan_check , root_parscan_check );
-
-      for ( long i = 0 ; i <= n ; ++i ) {
-        const long answer = i ;
-        if ( host_parfor_result(i) != answer ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
-                    << host_parfor_result(i) << " != " << answer << std::endl ;
-        }
-        if ( host_parreduce_check(i) != 0 ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
-                    << host_parreduce_check(i) << " != 0" << std::endl ;
-        }
-        if ( host_parscan_check(i) != 0 ) {
-          std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
-                    << host_parscan_check(i) << " != 0" << std::endl ;
-        }
+      val += i;
+
+      if ( final ) { parscan_result[i] = val; }
+    });
+
+    // Wait for 'parscan_result' before testing it.
+    member.team_barrier();
+
+    if ( member.team_rank() == 0 ) {
+      for ( long i = begin; i < end; ++i ) {
+        parscan_check[i] += ( i * ( i + 1 ) - begin * ( begin - 1 ) ) * 0.5 - parscan_result[i];
       }
     }
+
+    // ThreadVectorRange check.
+/*
+    long result = 0;
+    expected = ( begin + end - 1 ) * ( end - begin ) * 0.5;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member, 0, 1 )
+                           , [&] ( const int i, long & outerUpdate )
+    {
+      long sum_j = 0.0;
+
+      Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member, end - begin )
+                             , [&] ( const int j, long & innerUpdate )
+      {
+        innerUpdate += begin + j;
+      }, sum_j );
+
+      outerUpdate += sum_j;
+    }, result );
+
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i )
+    {
+      parreduce_check[i] += result - expected;
+    });
+*/
+  }
+
+  static void run( long n )
+  {
+    //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
+    //const unsigned memory_capacity = 100000; // Fails with SPAN=1 for serial and OMP.
+    const unsigned memory_capacity = 400000;
+
+    sched_type root_sched( typename sched_type::memory_space(), memory_capacity );
+
+    view_type root_parfor_result( "parfor_result", n + 1 );
+    view_type root_parreduce_check( "parreduce_check", n + 1 );
+    view_type root_parscan_result( "parscan_result", n + 1 );
+    view_type root_parscan_check( "parscan_check", n + 1 );
+
+    typename view_type::HostMirror
+      host_parfor_result = Kokkos::create_mirror_view( root_parfor_result );
+    typename view_type::HostMirror
+      host_parreduce_check = Kokkos::create_mirror_view( root_parreduce_check );
+    typename view_type::HostMirror
+      host_parscan_result = Kokkos::create_mirror_view( root_parscan_result );
+    typename view_type::HostMirror
+      host_parscan_check = Kokkos::create_mirror_view( root_parscan_check );
+
+    future_type f = Kokkos::host_spawn( Kokkos::TaskTeam( root_sched )
+                                      , TestTaskTeam( root_sched
+                                                    , root_parfor_result
+                                                    , root_parreduce_check
+                                                    , root_parscan_result
+                                                    , root_parscan_check
+                                                    , n )
+                                      );
+
+    Kokkos::wait( root_sched );
+
+    Kokkos::deep_copy( host_parfor_result, root_parfor_result );
+    Kokkos::deep_copy( host_parreduce_check, root_parreduce_check );
+    Kokkos::deep_copy( host_parscan_result, root_parscan_result );
+    Kokkos::deep_copy( host_parscan_check, root_parscan_check );
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i;
+
+      if ( host_parfor_result( i ) != answer ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_for result(" << i << ") = "
+                  << host_parfor_result( i ) << " != " << answer << std::endl;
+      }
+
+      if ( host_parreduce_check( i ) != 0 ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_reduce check(" << i << ") = "
+                  << host_parreduce_check( i ) << " != 0" << std::endl;
+      }
+
+      if ( host_parscan_check( i ) != 0 ) {
+        std::cerr << "TestTaskTeam::run ERROR parallel_scan check(" << i << ") = "
+                  << host_parscan_check( i ) << " != 0" << std::endl;
+      }
+    }
+  }
 };
 
 template< class ExecSpace >
 struct TestTaskTeamValue {
-
   enum { SPAN = 8 };
 
-  typedef long value_type ;
-  typedef Kokkos::TaskScheduler<ExecSpace>         policy_type ;
-  typedef Kokkos::Future<value_type,ExecSpace>  future_type ;
-  typedef Kokkos::View<long*,ExecSpace>         view_type ;
+  typedef long                                     value_type;
+  typedef Kokkos::TaskScheduler< ExecSpace >       sched_type;
+  typedef Kokkos::Future< value_type, ExecSpace >  future_type;
+  typedef Kokkos::View< long*, ExecSpace >         view_type;
 
-  policy_type  policy ;
-  future_type  future ;
+  sched_type   sched;
+  future_type  future;
 
-  view_type  result ;
-  const long nvalue ;
+  view_type   result;
+  const long  nvalue;
 
   KOKKOS_INLINE_FUNCTION
-  TestTaskTeamValue( const policy_type & arg_policy
-                   , const view_type   & arg_result
-                   , const long          arg_nvalue )
-    : policy(arg_policy)
+  TestTaskTeamValue( const sched_type & arg_sched
+                   , const view_type  & arg_result
+                   , const long         arg_nvalue )
+    : sched( arg_sched )
     , future()
     , result( arg_result )
-    , nvalue( arg_nvalue )
-    {}
+    , nvalue( arg_nvalue ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( typename policy_type::member_type const & member
+  void operator()( typename sched_type::member_type const & member
                  , value_type & final )
-    {
-      const long end   = nvalue + 1 ;
-      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+  {
+    const long end   = nvalue + 1;
+    const long begin = 0 < end - SPAN ? end - SPAN : 0;
 
-      if ( 0 < begin && future.is_null() ) {
-        if ( member.team_rank() == 0 ) {
-
-          future = policy.task_spawn
-            ( TestTaskTeamValue( policy , result , begin - 1 )
-            , Kokkos::TaskTeam );
+    if ( 0 < begin && future.is_null() ) {
+      if ( member.team_rank() == 0 ) {
+        future = sched.task_spawn( TestTaskTeamValue( sched, result, begin - 1 )
+                                 , Kokkos::TaskTeam );
 
-          assert( ! future.is_null() );
+        assert( !future.is_null() );
 
-          policy.respawn( this , future );
-        }
-        return ;
+        sched.respawn( this , future );
       }
 
-      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
-                          , [&]( int i ) { result[i] = i + 1 ; }
-                          );
+      return;
+    }
 
-      if ( member.team_rank() == 0 ) {
-        final = result[nvalue] ;
-      }
+    Kokkos::parallel_for( Kokkos::TeamThreadRange( member, begin, end )
+                        , [&] ( int i ) { result[i] = i + 1; }
+                        );
 
-      Kokkos::memory_fence();
+    if ( member.team_rank() == 0 ) {
+      final = result[nvalue];
     }
 
+    Kokkos::memory_fence();
+  }
+
   static void run( long n )
-    {
-      // const unsigned memory_capacity = 10000 ; // causes memory pool infinite loop
-      const unsigned memory_capacity = 100000 ;
+  {
+    //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
+    const unsigned memory_capacity = 100000;
 
-      policy_type root_policy( typename policy_type::memory_space()
-                             , memory_capacity );
+    sched_type root_sched( typename sched_type::memory_space()
+                          , memory_capacity );
 
-      view_type   root_result("result",n+1);
+    view_type root_result( "result", n + 1 );
 
-      typename view_type::HostMirror
-        host_result = Kokkos::create_mirror_view( root_result );
+    typename view_type::HostMirror host_result = Kokkos::create_mirror_view( root_result );
 
-      future_type fv = root_policy.host_spawn
-        ( TestTaskTeamValue( root_policy, root_result, n ) , Kokkos::TaskTeam );
+    future_type fv = root_sched.host_spawn( TestTaskTeamValue( root_sched, root_result, n )
+                                          , Kokkos::TaskTeam );
 
-      Kokkos::wait( root_policy );
+    Kokkos::wait( root_sched );
 
-      Kokkos::deep_copy( host_result , root_result );
+    Kokkos::deep_copy( host_result, root_result );
 
-      if ( fv.get() != n + 1 ) {
-        std::cerr << "TestTaskTeamValue ERROR future = "
-                  << fv.get() << " != " << n + 1 << std::endl ;
-      }
-      for ( long i = 0 ; i <= n ; ++i ) {
-        const long answer = i + 1 ;
-        if ( host_result(i) != answer ) {
-          std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
-                    << host_result(i) << " != " << answer << std::endl ;
-        }
+    if ( fv.get() != n + 1 ) {
+      std::cerr << "TestTaskTeamValue ERROR future = "
+                << fv.get() << " != " << n + 1 << std::endl;
+    }
+
+    for ( long i = 0; i <= n; ++i ) {
+      const long answer = i + 1;
+
+      if ( host_result( i ) != answer ) {
+        std::cerr << "TestTaskTeamValue ERROR result(" << i << ") = "
+                  << host_result( i ) << " != " << answer << std::endl;
       }
     }
+  }
 };
-} // namespace TestTaskScheduler
-
-//----------------------------------------------------------------------------
-//----------------------------------------------------------------------------
 
-#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
-#endif /* #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP */
+} // namespace TestTaskScheduler
 
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
 
+#endif // #ifndef KOKKOS_UNITTEST_TASKSCHEDULER_HPP
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index bcf4d3a173686ad8b1d14abc45ee957bb8650389..11a523921db9995c18d38ac5e18661244acd0ecb 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -48,177 +48,169 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
+
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestTeamPolicy {
+  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member;
+  typedef Kokkos::View< int**, ExecSpace > view_type;
 
-  typedef typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type team_member ;
-  typedef Kokkos::View<int**,ExecSpace> view_type ;
-
-  view_type m_flags ;
+  view_type m_flags;
 
   TestTeamPolicy( const size_t league_size )
-    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags")
-             , Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this )
-             , league_size )
-    {}
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ),
+               Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( *this ),
+               league_size ) {}
 
   struct VerifyInitTag {};
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const team_member & member ) const
-    {
-      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
 
-      m_flags( member.team_rank() , member.league_rank() ) = tid ;
-    }
+    m_flags( member.team_rank(), member.league_rank() ) = tid;
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const VerifyInitTag & , const team_member & member ) const
-    {
-      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+  void operator()( const VerifyInitTag &, const team_member & member ) const
+  {
+    const int tid = member.team_rank() + member.team_size() * member.league_rank();
 
-      if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) {
-        printf("TestTeamPolicy member(%d,%d) error %d != %d\n"
-              , member.league_rank() , member.team_rank()
-              , tid , m_flags( member.team_rank() , member.league_rank() ) );
-      }
+    if ( tid != m_flags( member.team_rank(), member.league_rank() ) ) {
+      printf( "TestTeamPolicy member(%d,%d) error %d != %d\n",
+               member.league_rank(), member.team_rank(),
+               tid, m_flags( member.team_rank(), member.league_rank() ) );
     }
+  }
 
-  // included for test_small_league_size
-  TestTeamPolicy()
-    : m_flags()
-  {}
+  // Included for test_small_league_size.
+  TestTeamPolicy() : m_flags() {}
+
+  // Included for test_small_league_size.
+  struct NoOpTag {};
 
-  // included for test_small_league_size
-  struct NoOpTag {} ;
   KOKKOS_INLINE_FUNCTION
-  void operator()( const NoOpTag & , const team_member & member ) const
-    {}
+  void operator()( const NoOpTag &, const team_member & member ) const {}
 
 
   static void test_small_league_size() {
-
     int bs = 8; // batch size (number of elements per batch)
     int ns = 16; // total number of "problems" to process
 
-    // calculate total scratch memory space size
+    // Calculate total scratch memory space size.
     const int level = 0;
     int mem_size = 960;
-    const int num_teams = ns/bs;
-    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO());
+    const int num_teams = ns / bs;
+    const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy( num_teams, Kokkos::AUTO() );
 
-    Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0))
-                         , TestTeamPolicy()
-                         );
+    Kokkos::parallel_for( policy.set_scratch_size( level, Kokkos::PerTeam( mem_size ), Kokkos::PerThread( 0 ) ),
+                          TestTeamPolicy() );
   }
 
   static void test_for( const size_t league_size )
-    {
-      TestTeamPolicy functor( league_size );
+  {
+    TestTeamPolicy functor( league_size );
 
-      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
 
-      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor );
-      Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size, team_size ), functor );
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType,  ExecSpace, VerifyInitTag >( league_size, team_size ), functor );
 
-      test_small_league_size();
-    }
+    test_small_league_size();
+  }
 
   struct ReduceTag {};
 
-  typedef long value_type ;
+  typedef long value_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const team_member & member , value_type & update ) const
-    {
-      update += member.team_rank() + member.team_size() * member.league_rank();
-    }
+  void operator()( const team_member & member, value_type & update ) const
+  {
+    update += member.team_rank() + member.team_size() * member.league_rank();
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const ReduceTag & , const team_member & member , value_type & update ) const
-    {
-      update += 1 + member.team_rank() + member.team_size() * member.league_rank();
-    }
+  void operator()( const ReduceTag &, const team_member & member, value_type & update ) const
+  {
+    update += 1 + member.team_rank() + member.team_size() * member.league_rank();
+  }
 
   static void test_reduce( const size_t league_size )
-    {
-      TestTeamPolicy functor( league_size );
+  {
+    TestTeamPolicy functor( league_size );
 
-      const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
-      const long N = team_size * league_size ;
+    const int team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( functor );
+    const long N = team_size * league_size;
 
-      long total = 0 ;
+    long total = 0;
 
-      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace >( league_size , team_size ) , functor , total );
-      ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) );
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size, team_size ), functor, total );
+    ASSERT_EQ( size_t( ( N - 1 ) * ( N ) ) / 2, size_t( total ) );
 
-      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType,  ExecSpace , ReduceTag >( league_size , team_size ) , functor , total );
-      ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) );
-    }
+    Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace, ReduceTag >( league_size, team_size ), functor, total );
+    ASSERT_EQ( ( size_t( N ) * size_t( N + 1 ) ) / 2, size_t( total ) );
+  }
 };
 
-}
-}
+} // namespace
+
+} // namespace Test
 
 /*--------------------------------------------------------------------------*/
 
 namespace Test {
 
-template< typename ScalarType , class DeviceType, class ScheduleType >
+template< typename ScalarType, class DeviceType, class ScheduleType >
 class ReduceTeamFunctor
 {
 public:
-  typedef DeviceType execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
-  typedef typename execution_space::size_type        size_type ;
+  typedef DeviceType                                           execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
+  typedef typename execution_space::size_type                  size_type;
 
   struct value_type {
-    ScalarType value[3] ;
+    ScalarType value[3];
   };
 
-  const size_type nwork ;
+  const size_type nwork;
 
   ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
 
-  ReduceTeamFunctor( const ReduceTeamFunctor & rhs )
-    : nwork( rhs.nwork ) {}
+  ReduceTeamFunctor( const ReduceTeamFunctor & rhs ) : nwork( rhs.nwork ) {}
 
   KOKKOS_INLINE_FUNCTION
   void init( value_type & dst ) const
   {
-    dst.value[0] = 0 ;
-    dst.value[1] = 0 ;
-    dst.value[2] = 0 ;
+    dst.value[0] = 0;
+    dst.value[1] = 0;
+    dst.value[2] = 0;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join( volatile value_type & dst ,
-             const volatile value_type & src ) const
+  void join( volatile value_type & dst, const volatile value_type & src ) const
   {
-    dst.value[0] += src.value[0] ;
-    dst.value[1] += src.value[1] ;
-    dst.value[2] += src.value[2] ;
+    dst.value[0] += src.value[0];
+    dst.value[1] += src.value[1];
+    dst.value[2] += src.value[2];
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type ind , value_type & dst ) const
+  void operator()( const typename policy_type::member_type ind, value_type & dst ) const
   {
     const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
     const int thread_size = ind.team_size() * ind.league_size();
-    const int chunk = ( nwork + thread_size - 1 ) / thread_size ;
+    const int chunk = ( nwork + thread_size - 1 ) / thread_size;
 
-    size_type iwork = chunk * thread_rank ;
-    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork ;
+    size_type iwork = chunk * thread_rank;
+    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
 
-    for ( ; iwork < iwork_end ; ++iwork ) {
-      dst.value[0] += 1 ;
-      dst.value[1] += iwork + 1 ;
-      dst.value[2] += nwork - iwork ;
+    for ( ; iwork < iwork_end; ++iwork ) {
+      dst.value[0] += 1;
+      dst.value[1] += iwork + 1;
+      dst.value[2] += nwork - iwork;
     }
   }
 };
@@ -227,58 +219,53 @@ public:
 
 namespace {
 
-template< typename ScalarType , class DeviceType, class ScheduleType >
+template< typename ScalarType, class DeviceType, class ScheduleType >
 class TestReduceTeam
 {
 public:
-  typedef DeviceType    execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
-  typedef typename execution_space::size_type    size_type ;
-
-  //------------------------------------
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef typename execution_space::size_type                   size_type;
 
-  TestReduceTeam( const size_type & nwork )
-  {
-    run_test(nwork);
-  }
+  TestReduceTeam( const size_type & nwork ) { run_test( nwork ); }
 
   void run_test( const size_type & nwork )
   {
-    typedef Test::ReduceTeamFunctor< ScalarType , execution_space , ScheduleType> functor_type ;
-    typedef typename functor_type::value_type value_type ;
-    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ;
+    typedef Test::ReduceTeamFunctor< ScalarType, execution_space, ScheduleType> functor_type;
+    typedef typename functor_type::value_type value_type;
+    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
 
     enum { Count = 3 };
     enum { Repeat = 100 };
 
     value_type result[ Repeat ];
 
-    const unsigned long nw   = nwork ;
-    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
-                                      : (nw/2) * ( nw + 1 );
+    const unsigned long nw   = nwork;
+    const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
+                                      : ( nw / 2 ) * ( nw + 1 );
 
-    const unsigned team_size   = policy_type::team_size_recommended( functor_type(nwork) );
-    const unsigned league_size = ( nwork + team_size - 1 ) / team_size ;
+    const unsigned team_size   = policy_type::team_size_recommended( functor_type( nwork ) );
+    const unsigned league_size = ( nwork + team_size - 1 ) / team_size;
 
-    policy_type team_exec( league_size , team_size );
+    policy_type team_exec( league_size, team_size );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+    for ( unsigned i = 0; i < Repeat; ++i ) {
       result_type tmp( & result[i] );
-      Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp );
+      Kokkos::parallel_reduce( team_exec, functor_type( nwork ), tmp );
     }
 
     execution_space::fence();
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      for ( unsigned j = 0 ; j < Count ; ++j ) {
-        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
-        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      for ( unsigned j = 0; j < Count; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum;
+        ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
       }
     }
   }
 };
 
-}
+} // namespace
 
 /*--------------------------------------------------------------------------*/
 
@@ -288,53 +275,51 @@ template< class DeviceType, class ScheduleType >
 class ScanTeamFunctor
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef DeviceType                                            execution_space;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef long int                                              value_type;
 
-  typedef long int    value_type ;
-  Kokkos::View< value_type , execution_space > accum ;
-  Kokkos::View< value_type , execution_space > total ;
+  Kokkos::View< value_type, execution_space > accum;
+  Kokkos::View< value_type, execution_space > total;
 
-  ScanTeamFunctor() : accum("accum"), total("total") {}
+  ScanTeamFunctor() : accum( "accum" ), total( "total" ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init( value_type & error ) const { error = 0 ; }
+  void init( value_type & error ) const { error = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  void join( value_type volatile & error ,
-             value_type volatile const & input ) const
-    { if ( input ) error = 1 ; }
+  void join( value_type volatile & error, value_type volatile const & input ) const
+  { if ( input ) error = 1; }
 
   struct JoinMax {
-    typedef long int value_type ;
+    typedef long int value_type;
+
     KOKKOS_INLINE_FUNCTION
-    void join( value_type volatile & dst
-             , value_type volatile const & input ) const
-      { if ( dst < input ) dst = input ; }
+    void join( value_type volatile & dst, value_type volatile const & input ) const
+    { if ( dst < input ) dst = input; }
   };
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type ind , value_type & error ) const
+  void operator()( const typename policy_type::member_type ind, value_type & error ) const
   {
     if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
       const long int thread_count = ind.league_size() * ind.team_size();
-      total() = ( thread_count * ( thread_count + 1 ) ) / 2 ;
+      total() = ( thread_count * ( thread_count + 1 ) ) / 2;
     }
 
     // Team max:
-    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );
+    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ), JoinMax() );
 
     if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
-      printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
-            , ind.league_rank(), ind.team_rank()
-            , ind.league_size(), ind.team_size()
-            , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n",
+               ind.league_rank(), ind.team_rank(),
+               ind.league_size(), ind.team_size(),
+               (long int) ( ind.league_rank() + ( ind.team_size() - 1 ) ), m );
     }
 
     // Scan:
     const long int answer =
-      ( ind.league_rank() + 1 ) * ind.team_rank() +
-      ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;
+      ( ind.league_rank() + 1 ) * ind.team_rank() + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2;
 
     const long int result =
       ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
@@ -343,16 +328,17 @@ public:
       ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
 
     if ( answer != result || answer != result2 ) {
-      printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
-             ind.league_rank(), ind.team_rank(),
-             ind.league_size(), ind.team_size(),
-             answer,result,result2);
-      error = 1 ;
+      printf( "ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
+              ind.league_rank(), ind.team_rank(),
+              ind.league_size(), ind.team_size(),
+              answer, result, result2 );
+
+      error = 1;
     }
 
     const long int thread_rank = ind.team_rank() +
                                  ind.team_size() * ind.league_rank();
-    ind.team_scan( 1 + thread_rank , accum.ptr_on_device() );
+    ind.team_scan( 1 + thread_rank, accum.ptr_on_device() );
   }
 };
 
@@ -360,47 +346,45 @@ template< class DeviceType, class ScheduleType >
 class TestScanTeam
 {
 public:
-  typedef DeviceType  execution_space ;
-  typedef long int    value_type ;
-
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space > policy_type ;
-  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType> functor_type ;
+  typedef DeviceType                                            execution_space;
+  typedef long int                                              value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
+  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType>       functor_type;
 
-  //------------------------------------
-
-  TestScanTeam( const size_t nteam )
-  {
-    run_test(nteam);
-  }
+  TestScanTeam( const size_t nteam ) { run_test( nteam ); }
 
   void run_test( const size_t nteam )
   {
-    typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
-    const unsigned REPEAT = 100000 ;
+    typedef Kokkos::View< long int, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
+
+    const unsigned REPEAT = 100000;
     unsigned Repeat;
-    if ( nteam == 0 )
-    {
+
+    if ( nteam == 0 ) {
       Repeat = 1;
-    } else {
-      Repeat = ( REPEAT + nteam - 1 ) / nteam ; //error here
     }
+    else {
+      Repeat = ( REPEAT + nteam - 1 ) / nteam; // Error here.
+    }
+
+    functor_type functor;
 
-    functor_type functor ;
+    policy_type team_exec( nteam, policy_type::team_size_max( functor ) );
 
-    policy_type team_exec( nteam , policy_type::team_size_max( functor ) );
+    for ( unsigned i = 0; i < Repeat; ++i ) {
+      long int accum = 0;
+      long int total = 0;
+      long int error = 0;
+      Kokkos::deep_copy( functor.accum, total );
 
-    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
-      long int accum = 0 ;
-      long int total = 0 ;
-      long int error = 0 ;
-      Kokkos::deep_copy( functor.accum , total );
-      Kokkos::parallel_reduce( team_exec , functor , result_type( & error ) );
+      Kokkos::parallel_reduce( team_exec, functor, result_type( & error ) );
       DeviceType::fence();
-      Kokkos::deep_copy( accum , functor.accum );
-      Kokkos::deep_copy( total , functor.total );
 
-      ASSERT_EQ( error , 0 );
-      ASSERT_EQ( total , accum );
+      Kokkos::deep_copy( accum, functor.accum );
+      Kokkos::deep_copy( total, functor.total );
+
+      ASSERT_EQ( error, 0 );
+      ASSERT_EQ( total, accum );
     }
 
     execution_space::fence();
@@ -416,18 +400,18 @@ namespace Test {
 template< class ExecSpace, class ScheduleType >
 struct SharedTeamFunctor {
 
-  typedef ExecSpace  execution_space ;
-  typedef int        value_type ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef ExecSpace                                             execution_space;
+  typedef int                                                   value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type;
 
   enum { SHARED_COUNT = 1000 };
 
-  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+  typedef typename ExecSpace::scratch_memory_space  shmem_space;
 
-  // tbd: MemoryUnmanaged should be the default for shared memory space
-  typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
-  // Tell how much shared memory will be required by this functor:
+  // Tell how much shared memory will be required by this functor.
   inline
   unsigned team_shmem_size( int team_size ) const
   {
@@ -436,19 +420,26 @@ struct SharedTeamFunctor {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
   {
-    const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
-    const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
-
-    if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
-        (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
-      printf ("Failed to allocate shared memory of size %lu\n",
-              static_cast<unsigned long> (SHARED_COUNT));
-      ++update; // failure to allocate is an error
+    const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+    const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
+
+    if ( ( shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0 ) ||
+         ( shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0 ) )
+    {
+      printf ("member( %d/%d , %d/%d ) Failed to allocate shared memory of size %lu\n"
+             , ind.league_rank()
+             , ind.league_size()
+             , ind.team_rank()
+             , ind.team_size()
+             , static_cast<unsigned long>( SHARED_COUNT )
+             );
+
+      ++update; // Failure to allocate is an error.
     }
     else {
-      for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
+      for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
         shared_A[i] = i + ind.league_rank();
         shared_B[i] = 2 * i + ind.league_rank();
       }
@@ -456,12 +447,13 @@ struct SharedTeamFunctor {
       ind.team_barrier();
 
       if ( ind.team_rank() + 1 == ind.team_size() ) {
-        for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+        for ( int i = 0; i < SHARED_COUNT; ++i ) {
           if ( shared_A[i] != i + ind.league_rank() ) {
-            ++update ;
+            ++update;
           }
+
           if ( shared_B[i] != 2 * i + ind.league_rank() ) {
-            ++update ;
+            ++update;
           }
         }
       }
@@ -469,78 +461,79 @@ struct SharedTeamFunctor {
   }
 };
 
-}
+} // namespace Test
 
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestSharedTeam {
-
-  TestSharedTeam()
-  { run(); }
+  TestSharedTeam() { run(); }
 
   void run()
   {
-    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
 
-    const size_t team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( Functor() );
+    const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
 
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size );
+    Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size, team_size );
 
-    typename Functor::value_type error_count = 0 ;
+    typename Functor::value_type error_count = 0;
 
-    Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) );
+    Kokkos::parallel_reduce( team_exec, Functor(), result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
-}
+
+} // namespace
 
 namespace Test {
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
 template< class MemorySpace, class ExecSpace, class ScheduleType >
 struct TestLambdaSharedTeam {
-
-  TestLambdaSharedTeam()
-  { run(); }
+  TestLambdaSharedTeam() { run(); }
 
   void run()
   {
-    typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    //typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
-    typedef Kokkos::View< typename Functor::value_type , MemorySpace, Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::SharedTeamFunctor< ExecSpace, ScheduleType > Functor;
+    //typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
+    typedef Kokkos::View< typename Functor::value_type, MemorySpace, Kokkos::MemoryUnmanaged > result_type;
 
-    typedef typename ExecSpace::scratch_memory_space shmem_space ;
+    typedef typename ExecSpace::scratch_memory_space shmem_space;
 
-    // tbd: MemoryUnmanaged should be the default for shared memory space
-    typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+    // TBD: MemoryUnmanaged should be the default for shared memory space.
+    typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
     const int SHARED_COUNT = 1000;
     int team_size = 1;
+
 #ifdef KOKKOS_ENABLE_CUDA
-    if(std::is_same<ExecSpace,Kokkos::Cuda>::value)
-      team_size = 128;
+    if ( std::is_same< ExecSpace, Kokkos::Cuda >::value ) team_size = 128;
 #endif
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size);
-    team_exec = team_exec.set_scratch_size(0,Kokkos::PerTeam(SHARED_COUNT*2*sizeof(int)));
 
-    typename Functor::value_type error_count = 0 ;
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size, team_size );
+    team_exec = team_exec.set_scratch_size( 0, Kokkos::PerTeam( SHARED_COUNT * 2 * sizeof( int ) ) );
+
+    typename Functor::value_type error_count = 0;
 
-    Kokkos::parallel_reduce( team_exec , KOKKOS_LAMBDA
-        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind , int & update ) {
+    Kokkos::parallel_reduce( team_exec, KOKKOS_LAMBDA
+        ( const typename Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::member_type & ind, int & update )
+    {
+      const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
+      const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
 
-      const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
-      const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
+      if ( ( shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0 ) ||
+           ( shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0 ) )
+      {
+        printf( "Failed to allocate shared memory of size %lu\n",
+                static_cast<unsigned long>( SHARED_COUNT ) );
 
-      if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
-          (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
-        printf ("Failed to allocate shared memory of size %lu\n",
-                static_cast<unsigned long> (SHARED_COUNT));
-        ++update; // failure to allocate is an error
-      } else {
-        for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
+        ++update; // Failure to allocate is an error.
+      }
+      else {
+        for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
           shared_A[i] = i + ind.league_rank();
           shared_B[i] = 2 * i + ind.league_rank();
         }
@@ -548,196 +541,213 @@ struct TestLambdaSharedTeam {
         ind.team_barrier();
 
         if ( ind.team_rank() + 1 == ind.team_size() ) {
-          for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+          for ( int i = 0; i < SHARED_COUNT; ++i ) {
             if ( shared_A[i] != i + ind.league_rank() ) {
-              ++update ;
+              ++update;
             }
+
             if ( shared_B[i] != 2 * i + ind.league_rank() ) {
-              ++update ;
+              ++update;
             }
           }
         }
       }
     }, result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
 #endif
-}
+
+} // namespace Test
 
 namespace Test {
 
 template< class ExecSpace, class ScheduleType >
 struct ScratchTeamFunctor {
 
-  typedef ExecSpace  execution_space ;
-  typedef int        value_type ;
-  typedef Kokkos::TeamPolicy< ScheduleType,  execution_space >  policy_type ;
+  typedef ExecSpace                                            execution_space;
+  typedef int                                                  value_type;
+  typedef Kokkos::TeamPolicy< ScheduleType, execution_space >  policy_type;
 
   enum { SHARED_TEAM_COUNT = 100 };
   enum { SHARED_THREAD_COUNT = 10 };
 
-  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+  typedef typename ExecSpace::scratch_memory_space shmem_space;
 
-  // tbd: MemoryUnmanaged should be the default for shared memory space
-  typedef Kokkos::View<size_t*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+  // TBD: MemoryUnmanaged should be the default for shared memory space.
+  typedef Kokkos::View< size_t*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
   {
-    const shared_int_array_type scratch_ptr( ind.team_scratch(1) , 3*ind.team_size() );
-    const shared_int_array_type scratch_A( ind.team_scratch(1) , SHARED_TEAM_COUNT );
-    const shared_int_array_type scratch_B( ind.thread_scratch(1) , SHARED_THREAD_COUNT );
-
-    if ((scratch_ptr.ptr_on_device () == NULL ) ||
-        (scratch_A.  ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0) ||
-        (scratch_B.  ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0)) {
-      printf ("Failed to allocate shared memory of size %lu\n",
-              static_cast<unsigned long> (SHARED_TEAM_COUNT));
-      ++update; // failure to allocate is an error
+    const shared_int_array_type scratch_ptr( ind.team_scratch( 1 ), 3 * ind.team_size() );
+    const shared_int_array_type scratch_A( ind.team_scratch( 1 ), SHARED_TEAM_COUNT );
+    const shared_int_array_type scratch_B( ind.thread_scratch( 1 ), SHARED_THREAD_COUNT );
+
+    if ( ( scratch_ptr.ptr_on_device () == NULL ) ||
+         ( scratch_A.  ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0 ) ||
+         ( scratch_B.  ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0 ) )
+    {
+      printf( "Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long>( SHARED_TEAM_COUNT ) );
+
+      ++update; // Failure to allocate is an error.
     }
     else {
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(ind,0,(int)SHARED_TEAM_COUNT),[&] (const int &i) {
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( ind, 0, (int) SHARED_TEAM_COUNT ), [&] ( const int & i ) {
         scratch_A[i] = i + ind.league_rank();
       });
-      for(int i=0; i<SHARED_THREAD_COUNT; i++)
-        scratch_B[i] = 10000*ind.league_rank() + 100*ind.team_rank() + i;
+
+      for ( int i = 0; i < SHARED_THREAD_COUNT; i++ ) {
+        scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
+      }
 
       scratch_ptr[ind.team_rank()] = (size_t) scratch_A.ptr_on_device();
       scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.ptr_on_device();
 
       ind.team_barrier();
 
-      for( int i = 0; i<SHARED_TEAM_COUNT; i++) {
-        if(scratch_A[i] != size_t(i + ind.league_rank()))
-          ++update;
+      for ( int i = 0; i < SHARED_TEAM_COUNT; i++ ) {
+        if ( scratch_A[i] != size_t( i + ind.league_rank() ) ) ++update;
       }
-      for( int i = 0; i < ind.team_size(); i++) {
-        if(scratch_ptr[0]!=scratch_ptr[i]) ++update;
+
+      for ( int i = 0; i < ind.team_size(); i++ ) {
+        if ( scratch_ptr[0] != scratch_ptr[i] ) ++update;
       }
-      if(scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
-         SHARED_THREAD_COUNT*sizeof(size_t))
+
+      if ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] < SHARED_THREAD_COUNT * sizeof( size_t ) ) {
         ++update;
-      for( int i = 1; i < ind.team_size(); i++) {
-        if((scratch_ptr[i+ind.team_size()] - scratch_ptr[i-1+ind.team_size()]) !=
-           (scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()])) ++update;
+      }
 
+      for ( int i = 1; i < ind.team_size(); i++ ) {
+        if ( ( scratch_ptr[i + ind.team_size()] - scratch_ptr[i - 1 + ind.team_size()] ) !=
+             ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] ) )
+        {
+          ++update;
+        }
       }
     }
   }
 };
 
-}
+} // namespace Test
 
 namespace {
 
 template< class ExecSpace, class ScheduleType >
 struct TestScratchTeam {
-
-  TestScratchTeam()
-  { run(); }
+  TestScratchTeam() { run(); }
 
   void run()
   {
-    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor ;
-    typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+    typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor;
+    typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged >  result_type;
 
     const size_t team_size = Kokkos::TeamPolicy< ScheduleType,  ExecSpace >::team_size_max( Functor() );
 
-    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size , team_size );
+    Kokkos::TeamPolicy< ScheduleType,  ExecSpace > team_exec( 8192 / team_size, team_size );
+
+    typename Functor::value_type error_count = 0;
+
+    int team_scratch_size   = Functor::shared_int_array_type::shmem_size( Functor::SHARED_TEAM_COUNT ) +
+                              Functor::shared_int_array_type::shmem_size( 3 * team_size );
 
-    typename Functor::value_type error_count = 0 ;
+    int thread_scratch_size = Functor::shared_int_array_type::shmem_size( Functor::SHARED_THREAD_COUNT );
 
-    int team_scratch_size   = Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
-                              Functor::shared_int_array_type::shmem_size(3*team_size);
-    int thread_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_THREAD_COUNT);
-    Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size),
-                                                          Kokkos::PerThread(thread_scratch_size)) ,
-                             Functor() , result_type( & error_count ) );
+    Kokkos::parallel_reduce( team_exec.set_scratch_size( 0, Kokkos::PerTeam( team_scratch_size ),
+                                                         Kokkos::PerThread( thread_scratch_size ) ),
+                             Functor(), result_type( & error_count ) );
 
-    ASSERT_EQ( error_count , 0 );
+    ASSERT_EQ( error_count, 0 );
   }
 };
-}
+
+} // namespace
 
 namespace Test {
-template< class ExecSpace>
+
+template< class ExecSpace >
 KOKKOS_INLINE_FUNCTION
-int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16);
-
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000);
-
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000);
-  Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000);
+int test_team_mulit_level_scratch_loop_body( const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team ) {
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team1( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread1( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team2( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread2( team.thread_scratch( 0 ), 16 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team1( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread1( team.thread_scratch( 1 ), 16000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team2( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread2( team.thread_scratch( 1 ), 16000 );
+
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team3( team.team_scratch( 0 ), 128 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread3( team.thread_scratch( 0 ), 16 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team3( team.team_scratch( 1 ), 128000 );
+  Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread3( team.thread_scratch( 1 ), 16000 );
 
   // The explicit types for 0 and 128 are here to test TeamThreadRange accepting different
   // types for begin and end.
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,int(0),unsigned(128)), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, int( 0 ), unsigned( 128 ) ), [&] ( const int & i )
   {
-    a_team1(i) = 1000000 + i;
-    a_team2(i) = 2000000 + i;
-    a_team3(i) = 3000000 + i;
+    a_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    a_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    a_team3( i ) = 3000000 + i + team.league_rank() * 100000;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
   {
-    a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
-    a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
-    a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+    a_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    a_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
   });
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
   {
-    b_team1(i) = 1000000 + i;
-    b_team2(i) = 2000000 + i;
-    b_team3(i) = 3000000 + i;
+    b_team1( i ) = 1000000 + i + team.league_rank() * 100000;
+    b_team2( i ) = 2000000 + i + team.league_rank() * 100000;
+    b_team3( i ) = 3000000 + i + team.league_rank() * 100000;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
   {
-    b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
-    b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
-    b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
+    b_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
+    b_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
   });
 
   team.team_barrier();
+
   int error = 0;
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128 ), [&] ( const int & i )
   {
-    if(a_team1(i) != 1000000 + i) error++;
-    if(a_team2(i) != 2000000 + i) error++;
-    if(a_team3(i) != 3000000 + i) error++;
+    if ( a_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( a_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
   {
-    if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
-    if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
-    if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+    if ( a_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( a_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
   });
 
-  Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
+  Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
   {
-    if(b_team1(i) != 1000000 + i) error++;
-    if(b_team2(i) != 2000000 + i) error++;
-    if(b_team3(i) != 3000000 + i) error++;
+    if ( b_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
+    if ( b_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
   });
   team.team_barrier();
-  Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
+
+  Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
   {
-    if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
-    if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
-    if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
+    if ( b_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
+    if ( b_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
   });
 
   return error;
@@ -748,93 +758,107 @@ struct TagFor {};
 
 template< class ExecSpace, class ScheduleType >
 struct ClassNoShmemSizeFunction {
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
-    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
   }
 
   void run() {
-    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
     errors = d_errors;
 
-    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
 
-    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
     {
-    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this);
-    Kokkos::fence();
-    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
-    Kokkos::deep_copy(h_errors,d_errors);
-    ASSERT_EQ(h_errors(),0);
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
     }
 
     {
-    int error = 0;
-    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this,error);
-    Kokkos::fence();
-    ASSERT_EQ(error,0);
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
     }
   };
 };
 
 template< class ExecSpace, class ScheduleType >
 struct ClassWithShmemSizeFunction {
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
+  typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
+
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator()( const TagFor &, const member_type & team ) const {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
-    error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  void operator() ( const TagReduce &, const member_type & team, int & error ) const {
+    error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
   }
 
   void run() {
-    Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
+    Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
     errors = d_errors;
 
-    const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-    const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+    const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+    const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
+
     {
-    Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this);
-    Kokkos::fence();
-    typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors);
-    Kokkos::deep_copy(h_errors,d_errors);
-    ASSERT_EQ(h_errors(),0);
+      Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_for( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                     Kokkos::PerThread( per_thread1 ) ),
+                            *this );
+      Kokkos::fence();
+
+      typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
+      Kokkos::deep_copy( h_errors, d_errors );
+      ASSERT_EQ( h_errors(), 0 );
     }
 
     {
-    int error = 0;
-    Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
-    Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-      *this,error);
-    Kokkos::fence();
-    ASSERT_EQ(error,0);
+      int error = 0;
+      Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
+
+      Kokkos::parallel_reduce( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
+                                                        Kokkos::PerThread( per_thread1 ) ),
+                               *this, error );
+      Kokkos::fence();
+
+      ASSERT_EQ( error, 0 );
     }
   };
 
-  unsigned team_shmem_size(int team_size) const {
-    const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-    const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+  unsigned team_shmem_size( int team_size ) const { // Returns per_team0 plus team_size copies of per_thread0.
+    const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 ); // Three double views of extent 128, counted once.
+    const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 ); // Three double views of extent 16, counted once per team member.
     return per_team0 + team_size * per_thread0;
   }
 };
@@ -842,67 +866,68 @@ struct ClassWithShmemSizeFunction {
 template< class ExecSpace, class ScheduleType >
 void test_team_mulit_level_scratch_test_lambda() {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-  Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
-  Kokkos::View<int,ExecSpace> d_errors("Errors");
+  Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors; // Atomic-trait alias of d_errors; the kernel accumulates into it.
+  Kokkos::View< int, ExecSpace > d_errors( "Errors" );
   errors = d_errors;
 
-  const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
-  const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
+  const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
+  const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
+
+  const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
+  const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
 
-  const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
-  const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
+  Kokkos::TeamPolicy< ExecSpace, ScheduleType > policy( 10, 8, 16 );
 
-  Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16);
-  Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
-    int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
+  Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                        KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type & team )
+  {
+    int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
     errors() += error;
   });
   Kokkos::fence();
-  typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(errors);
-  Kokkos::deep_copy(h_errors,d_errors);
-  ASSERT_EQ(h_errors(),0);
+
+  typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors ); // Mirror the same view that deep_copy reads from.
+  Kokkos::deep_copy( h_errors, d_errors );
+  ASSERT_EQ( h_errors(), 0 );
 
   int error = 0;
-  Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
-    KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) {
-      count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
-  },error);
-  ASSERT_EQ(error,0);
+  Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
+                           KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type & team, int & count )
+  {
+    count += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
+  }, error );
+  ASSERT_EQ( error, 0 );
   Kokkos::fence();
 #endif
 }
 
-
-}
+} // namespace Test
 
 namespace {
+
 template< class ExecSpace, class ScheduleType >
 struct TestMultiLevelScratchTeam {
-
-  TestMultiLevelScratchTeam()
-  { run(); }
+  TestMultiLevelScratchTeam() { run(); } // Constructing the object runs every variant below.
 
   void run()
   {
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
-    Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
+    Test::test_team_mulit_level_scratch_test_lambda< ExecSpace, ScheduleType >(); // Lambda-based variant, only when lambda dispatch is enabled.
 #endif
-    Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
+    Test::ClassNoShmemSizeFunction< ExecSpace, ScheduleType > c1; // Functor that sets scratch levels 0 and 1 on the policy.
     c1.run();
 
-    Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
+    Test::ClassWithShmemSizeFunction< ExecSpace, ScheduleType > c2; // Functor that defines team_shmem_size() and sets only level 1 on the policy.
     c2.run();
-
   }
 };
-}
+
+} // namespace
 
 namespace Test {
 
 template< class ExecSpace >
 struct TestShmemSize {
-
   TestShmemSize() { run(); }
 
   void run()
@@ -915,9 +940,8 @@ struct TestShmemSize {
 
     size_t size = view_type::shmem_size( d1, d2, d3 );
 
-    ASSERT_EQ( size, d1 * d2 * d3 * sizeof(long) );
+    ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
   }
 };
-}
 
-/*--------------------------------------------------------------------------*/
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp
index d9b06c29e49d0362226168861b0d5e818d1d82f9..8d16ac66db8abbf1b5afc3f12aaff7afe0159307 100644
--- a/lib/kokkos/core/unit_test/TestTeamVector.hpp
+++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,36 +50,47 @@
 namespace TestTeamVector {
 
 struct my_complex {
-  double re,im;
+  double re, im;
   int dummy;
+
   KOKKOS_INLINE_FUNCTION
   my_complex() {
     re = 0.0;
     im = 0.0;
     dummy = 0;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex(const my_complex& src) {
+  my_complex( const my_complex & src ) {
     re = src.re;
     im = src.im;
     dummy = src.dummy;
   }
 
   KOKKOS_INLINE_FUNCTION
-  my_complex(const volatile my_complex& src) {
+  my_complex & operator=( const my_complex & src ) {
     re = src.re;
     im = src.im;
     dummy = src.dummy;
+    return *this ;
   }
 
   KOKKOS_INLINE_FUNCTION
-  my_complex(const double& val) {
+  my_complex( const volatile my_complex & src ) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex( const double & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator += (const my_complex& src) {
+  my_complex & operator+=( const my_complex & src ) {
     re += src.re;
     im += src.im;
     dummy += src.dummy;
@@ -87,252 +98,278 @@ struct my_complex {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile my_complex& src) volatile {
+  void operator+=( const volatile my_complex & src ) volatile {
     re += src.re;
     im += src.im;
     dummy += src.dummy;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator *= (const my_complex& src) {
-    double re_tmp = re*src.re - im*src.im;
+  my_complex & operator*=( const my_complex & src ) {
+    double re_tmp = re * src.re - im * src.im;
     double im_tmp = re * src.im + im * src.re;
     re = re_tmp;
     im = im_tmp;
     dummy *= src.dummy;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile my_complex& src) volatile {
-    double re_tmp = re*src.re - im*src.im;
+  void operator*=( const volatile my_complex & src ) volatile {
+    double re_tmp = re * src.re - im * src.im;
     double im_tmp = re * src.im + im * src.re;
     re = re_tmp;
     im = im_tmp;
     dummy *= src.dummy;
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const my_complex& src) {
-    return (re == src.re) && (im == src.im) && ( dummy == src.dummy );
+  bool operator==( const my_complex & src ) const { // True only when re, im and dummy all match; const so const objects can be compared.
+    return ( re == src.re ) && ( im == src.im ) && ( dummy == src.dummy );
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const my_complex& src) {
-      return (re != src.re) || (im != src.im) || ( dummy != src.dummy );
+  bool operator!=( const my_complex & src ) const { // True when any of re, im or dummy differs.
+    return ( re != src.re ) || ( im != src.im ) || ( dummy != src.dummy );
   }
+
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const double& val) {
-    return (re != val) ||
-           (im != 0) || (dummy != 0);
+  bool operator!=( const double & val ) const { // Compares against the value ( val, 0, 0 ); const-qualified like the overload above so the two never become ambiguous.
+    return ( re != val ) || ( im != 0 ) || ( dummy != 0 );
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator= (const int& val) {
+  my_complex & operator=( const int & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
-  my_complex& operator= (const double& val) {
+  my_complex & operator=( const double & val ) {
     re = val;
     im = 0.0;
     dummy = 0;
     return *this;
   }
+
   KOKKOS_INLINE_FUNCTION
   operator double() {
     return re;
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
     typedef typename shared_int::size_type size_type;
 
-    const size_type shmemSize = team.team_size () * 13;
-    shared_int values = shared_int (team.team_shmem (), shmemSize);
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
 
-    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
-      printf ("FAILED to allocate shared memory of size %u\n",
-              static_cast<unsigned int> (shmemSize));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
     }
     else {
+      // Initialize shared memory.
+      values( team.team_rank() ) = 0;
 
-      // Initialize shared memory
-      values(team.team_rank ()) = 0;
-
-      // Accumulate value into per thread shared memory
-      // This is non blocking
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i)
+      // Accumulate value into per thread shared memory.
+      // This is non blocking.
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
       {
-        values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+        values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
       });
-      // Wait for all memory to be written
-      team.team_barrier ();
-      // One thread per team executes the comparison
-      Kokkos::single(Kokkos::PerTeam(team),[&]()
+
+      // Wait for all memory to be written.
+      team.team_barrier();
+
+      // One thread per team executes the comparison.
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
       {
-            Scalar test = 0;
-            Scalar value = 0;
-            for (int i = 0; i < 131; ++i) {
-              test += i - team.league_rank () + team.league_size () + team.team_size ();
-            }
-            for (int i = 0; i < team.team_size (); ++i) {
-              value += values(i);
-            }
-            if (test != value) {
-              printf ("FAILED team_parallel_for %i %i %f %f\n",
-                      team.league_rank (), team.team_rank (),
-                      static_cast<double> (test), static_cast<double> (value));
-              flag() = 1;
-            }
+        Scalar test = 0;
+        Scalar value = 0;
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
+        }
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
+        }
+
+        if ( test != value ) {
+          printf ( "FAILED team_parallel_for %i %i %f %f\n",
+                   team.league_rank(), team.team_rank(),
+                   static_cast<double>( test ), static_cast<double>( value ) );
+          flag() = 1;
+        }
       });
     }
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_reduce {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag; // Set to 1 when a team's reduction result is wrong.
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; } // 13 Scalars per team member plus 8 bytes.
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = Scalar();
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val)
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
     {
-      val += i - team.league_rank () + team.league_size () + team.team_size ();
-    },value);
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
-        {
-         Scalar test = 0;
-         for (int i = 0; i < 131; ++i) {
-           test += i - team.league_rank () + team.league_size () + team.team_size ();
-         }
-         if (test != value) {
-           if(team.league_rank() == 0)
-           printf ("FAILED team_parallel_reduce %i %i %f %f %lu\n",
-             team.league_rank (), team.team_rank (),
-             static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
-              flag() = 1;
-         }
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
+    {
+      Scalar test = 0; // Serial recomputation of the same 131-term sum, used as the reference.
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) { // Only league rank 0 prints; every failing team still sets the flag.
+          printf( "FAILED team_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), static_cast<unsigned long>( sizeof( Scalar ) ) ); // %lu expects unsigned long; size_t is not that type on every platform.
+        }
+
+        flag() = 1;
+      }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_reduce_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_reduce_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
 
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
-      , [&] (int i, Scalar& val)
-      {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-      }
-      , [&] (volatile Scalar& val, const volatile Scalar& src)
-        {val+=src;}
-      , value
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      [] ( volatile Scalar & val, const volatile Scalar & src ) { val += src; },
+      value
     );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
-         Scalar test = 0;
-         for (int i = 0; i < 131; ++i) {
-           test += i - team.league_rank () + team.league_size () + team.team_size ();
-         }
-         if (test != value) {
-           printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
-             team.league_rank (), team.team_rank (),
-             static_cast<double> (test), static_cast<double> (value));
-              flag() = 1;
-         }
+      Scalar test = 0;
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
+      }
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
+        flag() = 1;
+      }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_vector_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
     typedef typename shared_int::size_type size_type;
 
-    const size_type shmemSize = team.team_size () * 13;
-    shared_int values = shared_int (team.team_shmem (), shmemSize);
+    const size_type shmemSize = team.team_size() * 13;
+    shared_int values = shared_int( team.team_shmem(), shmemSize );
 
-    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
-      printf ("FAILED to allocate shared memory of size %u\n",
-              static_cast<unsigned int> (shmemSize));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < shmemSize ) {
+      printf( "FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int>( shmemSize ) );
     }
     else {
-      Kokkos::single(Kokkos::PerThread(team),[&] ()
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
       {
-        values(team.team_rank ()) = 0;
+        values( team.team_rank() ) = 0;
       });
 
-      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i)
+      Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i )
       {
-        Kokkos::single(Kokkos::PerThread(team),[&] ()
+        Kokkos::single( Kokkos::PerThread( team ), [&] ()
         {
-          values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+          values( team.team_rank() ) += i - team.league_rank() + team.league_size() + team.team_size();
         });
       });
 
-      team.team_barrier ();
-      Kokkos::single(Kokkos::PerTeam(team),[&]()
+      team.team_barrier();
+
+      Kokkos::single( Kokkos::PerTeam( team ), [&] ()
       {
         Scalar test = 0;
         Scalar value = 0;
-        for (int i = 0; i < 131; ++i) {
-          test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+        for ( int i = 0; i < 131; ++i ) {
+          test += i - team.league_rank() + team.league_size() + team.team_size();
         }
-        for (int i = 0; i < team.team_size (); ++i) {
-          value += values(i);
+
+        for ( int i = 0; i < team.team_size(); ++i ) {
+          value += values( i );
         }
-        if (test != value) {
-          printf ("FAILED team_vector_parallel_for %i %i %f %f\n",
-                  team.league_rank (), team.team_rank (),
-                  static_cast<double> (test), static_cast<double> (value));
+
+        if ( test != value ) {
+          printf( "FAILED team_vector_parallel_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
           flag() = 1;
         }
       });
@@ -340,164 +377,176 @@ struct functor_team_vector_for {
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_reduce {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_team_vector_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = Scalar();
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val)
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
     {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-    },value);
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    }, value );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
       Scalar test = 0;
-      for (int i = 0; i < 131; ++i) {
-        test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+      for ( int i = 0; i < 131; ++i ) {
+        test += i - team.league_rank() + team.league_size() + team.team_size();
       }
-      if (test != value) {
-        if(team.league_rank() == 0)
-        printf ("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
-          team.league_rank (), team.team_rank (),
-          static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
-           flag() = 1;
+
+      if ( test != value ) {
+        if ( team.league_rank() == 0 ) {
+          printf( "FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ), sizeof( Scalar ) );
+        }
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_team_vector_reduce_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_team_vector_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  functor_team_vector_reduce_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
-    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
-      , [&] (int i, Scalar& val)
-      {
-        val += i - team.league_rank () + team.league_size () + team.team_size ();
-      }
-      , [&] (volatile Scalar& val, const volatile Scalar& src)
-        {val+=src;}
-      , value
+
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, 131 ), [&] ( int i, Scalar & val )
+    {
+      val += i - team.league_rank() + team.league_size() + team.team_size();
+    },
+      [] ( volatile Scalar & val, const volatile Scalar & src ) { val += src; },
+      value
     );
 
-    team.team_barrier ();
-    Kokkos::single(Kokkos::PerTeam(team),[&]()
+    team.team_barrier();
+
+    Kokkos::single( Kokkos::PerTeam( team ), [&] ()
     {
       Scalar test = 0;
-      for (int i = 0; i < 131; ++i) {
-         test += i - team.league_rank () + team.league_size () + team.team_size ();
+
+      for ( int i = 0; i < 131; ++i ) {
+         test += i - team.league_rank() + team.league_size() + team.team_size();
       }
-      if (test != value) {
-        printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
-          team.league_rank (), team.team_rank (),
-          static_cast<double> (test), static_cast<double> (value));
+
+      if ( test != value ) {
+        printf( "FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(),
+                static_cast<double>( test ), static_cast<double>( value ) );
+
         flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_single {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_single(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_single( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-
-    // Warning: this test case intentionally violates permissable semantics
+  void operator()( typename policy_type::member_type team ) const {
+    // Warning: this test case intentionally violates permissible semantics.
     // It is not valid to get references to members of the enclosing region
     // inside a parallel_for and write to it.
     Scalar value = 0;
 
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13),[&] (int i)
+    Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
     {
-      value = i; // This write is violating Kokkos semantics for nested parallelism
+      value = i; // This write is violating Kokkos semantics for nested parallelism.
     });
 
-    Kokkos::single(Kokkos::PerThread(team),[&] (Scalar& val)
+    Kokkos::single( Kokkos::PerThread( team ), [&] ( Scalar & val )
     {
       val = 1;
-    },value);
+    }, value );
 
     Scalar value2 = 0;
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13), [&] (int i, Scalar& val)
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
     {
       val += value;
-    },value2);
+    }, value2 );
+
+    if ( value2 != ( value * 13 ) ) {
+      printf( "FAILED vector_single broadcast %i %i %f %f\n",
+              team.league_rank(), team.team_rank(), (double) value2, (double) value );
 
-    if(value2!=(value*13)) {
-      printf("FAILED vector_single broadcast %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) value2,(double) value);
-      flag()=1;
+      flag() = 1;
     }
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_for {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_for( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
-  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+  unsigned team_shmem_size( int team_size ) const { return team_size * 13 * sizeof( Scalar ) + 8; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space;
+    typedef Kokkos::View< Scalar*, shmem_space, Kokkos::MemoryUnmanaged > shared_int;
 
-    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
-    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
-    shared_int values = shared_int(team.team_shmem(),team.team_size()*13);
+    shared_int values = shared_int( team.team_shmem(), team.team_size() * 13 );
 
-    if (values.ptr_on_device () == NULL ||
-        values.dimension_0() < (unsigned) team.team_size() * 13) {
-      printf ("FAILED to allocate memory of size %i\n",
-              static_cast<int> (team.team_size () * 13));
+    if ( values.ptr_on_device() == NULL || values.dimension_0() < (unsigned) team.team_size() * 13 ) {
+      printf( "FAILED to allocate memory of size %i\n", static_cast<int>( team.team_size() * 13 ) );
       flag() = 1;
     }
     else {
-      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13), [&] (int i)
+      Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i )
       {
-        values(13*team.team_rank() + i) = i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+        values( 13 * team.team_rank() + i ) =
+          i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
       });
 
-      Kokkos::single(Kokkos::PerThread(team),[&] ()
+      Kokkos::single( Kokkos::PerThread( team ), [&] ()
       {
         Scalar test = 0;
         Scalar value = 0;
-        for (int i = 0; i < 13; ++i) {
+
+        for ( int i = 0; i < 13; ++i ) {
           test += i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
-          value += values(13*team.team_rank() + i);
+          value += values( 13 * team.team_rank() + i );
         }
-        if (test != value) {
-          printf ("FAILED vector_par_for %i %i %f %f\n",
-                  team.league_rank (), team.team_rank (),
-                  static_cast<double> (test), static_cast<double> (value));
+
+        if ( test != value ) {
+          printf( "FAILED vector_par_for %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(),
+                  static_cast<double>( test ), static_cast<double>( value ) );
+
           flag() = 1;
         }
       });
@@ -505,169 +554,192 @@ struct functor_vec_for {
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_red {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_red(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
     Scalar value = 0;
 
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val)
+    // When no reducer is given the default is summation.
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
     {
       val += i;
-    }, value);
+    }, value );
 
-    Kokkos::single(Kokkos::PerThread(team),[&] ()
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
     {
       Scalar test = 0;
-      for(int i = 0; i < 13; i++) {
-        test+=i;
-      }
-      if(test!=value) {
-        printf("FAILED vector_par_reduce %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
-        flag()=1;
+
+      for ( int i = 0; i < 13; i++ ) test += i;
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_red_join {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_red_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+
+  functor_vec_red_join( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
+  void operator()( typename policy_type::member_type team ) const {
+    // Must initialize to the identity value for the reduce operation
+    // for this test:
+    //   ( identity, operation ) = ( 1 , *= )
     Scalar value = 1;
 
-    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13)
-      , [&] (int i, Scalar& val)
-      { val *= i; }
-      , [&] (Scalar& val, const Scalar& src)
-      {val*=src;}
-      , value
+    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val )
+    {
+      val *= ( i % 5 + 1 );
+    },
+      [&] ( Scalar & val, const Scalar & src ) { val *= src; },
+      value
     );
 
-    Kokkos::single(Kokkos::PerThread(team),[&] ()
+    Kokkos::single( Kokkos::PerThread( team ), [&] ()
     {
       Scalar test = 1;
-      for(int i = 0; i < 13; i++) {
-        test*=i;
-      }
-      if(test!=value) {
-        printf("FAILED vector_par_reduce_join %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
-        flag()=1;
+
+      for ( int i = 0; i < 13; i++ ) test *= ( i % 5 + 1 );
+
+      if ( test != value ) {
+        printf( "FAILED vector_par_reduce_join %i %i %f %f\n",
+                team.league_rank(), team.team_rank(), (double) test, (double) value );
+
+        flag() = 1;
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_vec_scan {
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_vec_scan(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_vec_scan( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team) const {
-    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val, bool final)
+  void operator()( typename policy_type::member_type team ) const {
+    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team, 13 ), [&] ( int i, Scalar & val, bool final )
     {
       val += i;
-      if(final) {
+
+      if ( final ) {
         Scalar test = 0;
-        for(int k = 0; k <= i; k++) {
-          test+=k;
-        }
-        if(test!=val) {
-          printf("FAILED vector_par_scan %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) val);
-          flag()=1;
+        for ( int k = 0; k <= i; k++ ) test += k;
+
+        if ( test != val ) {
+          printf( "FAILED vector_par_scan %i %i %f %f\n",
+                  team.league_rank(), team.team_rank(), (double) test, (double) val );
+
+          flag() = 1;
         }
       }
     });
   }
 };
 
-template<typename Scalar, class ExecutionSpace>
+template< typename Scalar, class ExecutionSpace >
 struct functor_reduce {
   typedef double value_type;
-  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef Kokkos::TeamPolicy< ExecutionSpace > policy_type;
   typedef ExecutionSpace execution_space;
 
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
-  functor_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag;
+  functor_reduce( Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > flag_ ) : flag( flag_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (typename policy_type::member_type team, double& sum) const {
+  void operator()( typename policy_type::member_type team, double & sum ) const {
     sum += team.league_rank() * 100 + team.thread_rank();
   }
 };
 
-template<typename Scalar,class ExecutionSpace>
-bool test_scalar(int nteams, int team_size, int test) {
-  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> d_flag("flag");
-  typename Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace>::HostMirror h_flag("h_flag");
-  h_flag() = 0 ;
-  Kokkos::deep_copy(d_flag,h_flag);
-  
-  if(test==0)
-  Kokkos::parallel_for( std::string("A") , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_red<Scalar, ExecutionSpace>(d_flag));
-  if(test==1)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_red_join<Scalar, ExecutionSpace>(d_flag));
-  if(test==2)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_scan<Scalar, ExecutionSpace>(d_flag));
-  if(test==3)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==4)
-  Kokkos::parallel_for( "B" , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_vec_single<Scalar, ExecutionSpace>(d_flag));
-  if(test==5)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==6)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_reduce<Scalar, ExecutionSpace>(d_flag));
-  if(test==7)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
-      functor_team_reduce_join<Scalar, ExecutionSpace>(d_flag));
-  if(test==8)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_for<Scalar, ExecutionSpace>(d_flag));
-  if(test==9)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag));
-  if(test==10)
-  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
-      functor_team_vector_reduce_join<Scalar, ExecutionSpace>(d_flag));
-  
-  Kokkos::deep_copy(h_flag,d_flag);
-
-  return (h_flag() == 0);
+template< typename Scalar, class ExecutionSpace >
+bool test_scalar( int nteams, int team_size, int test ) {
+  Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace > d_flag( "flag" );
+  typename Kokkos::View< int, Kokkos::LayoutLeft, ExecutionSpace >::HostMirror h_flag( "h_flag" );
+  h_flag() = 0;
+  Kokkos::deep_copy( d_flag, h_flag );
+
+  if ( test == 0 ) {
+    Kokkos::parallel_for( std::string( "A" ), Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 1 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_red_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 2 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_scan< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 3 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 4 ) {
+    Kokkos::parallel_for( "B", Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_vec_single< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 5 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 6 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 7 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size ),
+                          functor_team_reduce_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 8 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_for< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 9 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce< Scalar, ExecutionSpace >( d_flag ) );
+  }
+  else if ( test == 10 ) {
+    Kokkos::parallel_for( Kokkos::TeamPolicy< ExecutionSpace >( nteams, team_size, 8 ),
+                          functor_team_vector_reduce_join< Scalar, ExecutionSpace >( d_flag ) );
+  }
+
+  Kokkos::deep_copy( h_flag, d_flag );
+
+  return ( h_flag() == 0 );
 }
 
-template<class ExecutionSpace>
-bool Test(int test) {
+template< class ExecutionSpace >
+bool Test( int test ) {
   bool passed = true;
-  passed = passed && test_scalar<int, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<long long int, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<float, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<double, ExecutionSpace>(317,33,test);
-  passed = passed && test_scalar<my_complex, ExecutionSpace>(317,33,test);
-  return passed;
-}
+  passed = passed && test_scalar< int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< long long int, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< float, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< double, ExecutionSpace >( 317, 33, test );
+  passed = passed && test_scalar< my_complex, ExecutionSpace >( 317, 33, test );
 
+  return passed;
 }
 
+} // namespace TestTeamVector
diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
index 203c9526792f8a5bbef9dbcb0582ce2d8d3a80e2..7bcf3f8a32691ee8a27bac5ed997ed68c6c39082 100644
--- a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
+++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -47,152 +47,162 @@
 
 namespace {
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumPlain {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
+
   type view;
-  SumPlain(type view_):view(view_) {}
+
+  SumPlain( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, Scalar& val) {
+  void operator() ( int i, Scalar & val ) {
     val += Scalar();
   }
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueType {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumInitJoinFinalValueType(type view_):view(view_) {}
+
+  type view;
+
+  SumInitJoinFinalValueType( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(value_type& val) const {
+  void init( value_type & val ) const {
     val = value_type();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, volatile value_type& src) const {
+  void join( volatile value_type & val, volatile value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueType2 {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumInitJoinFinalValueType2(type view_):view(view_) {}
+
+  type view;
+
+  SumInitJoinFinalValueType2( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(volatile value_type& val) const {
+  void init( volatile value_type & val ) const {
     val = value_type();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const volatile value_type& src) const {
+  void join( volatile value_type & val, const volatile value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumInitJoinFinalValueTypeArray {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type[];
+
+  type view;
   int n;
-  SumInitJoinFinalValueTypeArray(type view_, int n_):view(view_),n(n_) {}
+
+  SumInitJoinFinalValueTypeArray( type view_, int n_ ) : view( view_ ), n( n_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(value_type val) const {
-    for(int k=0;k<n;k++)
+  void init( value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
       val[k] = 0;
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type val, const volatile value_type src) const {
-    for(int k=0;k<n;k++)
+  void join( volatile value_type val, const volatile value_type src ) const {
+    for ( int k = 0; k < n; k++ ) {
       val[k] += src[k];
+    }
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type val) const {
-    for(int k=0;k<n;k++)
-      val[k] += k*i;
+  void operator()( int i, value_type val ) const {
+    for ( int k = 0; k < n; k++ ) {
+      val[k] += k * i;
+    }
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 struct SumWrongInitJoinFinalValueType {
   typedef ExecutionSpace execution_space;
-  typedef typename Kokkos::View<Scalar*,execution_space> type;
-  type view;
+  typedef typename Kokkos::View< Scalar*, execution_space > type;
   typedef Scalar value_type;
-  SumWrongInitJoinFinalValueType(type view_):view(view_) {}
+
+  type view;
+
+  SumWrongInitJoinFinalValueType( type view_ ) : view( view_ ) {}
 
   KOKKOS_INLINE_FUNCTION
-  void init(double& val) const {
+  void init( double & val ) const {
     val = double();
   }
 
   KOKKOS_INLINE_FUNCTION
-  void join(volatile value_type& val, const value_type& src) const {
+  void join( volatile value_type & val, const value_type & src ) const {
     val += src;
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator() (int i, value_type& val) const {
+  void operator()( int i, value_type & val ) const {
     val += value_type();
   }
-
 };
 
-template<class Scalar, class ExecutionSpace>
+template< class Scalar, class ExecutionSpace >
 void TestTemplateMetaFunctions() {
-  typedef typename Kokkos::View<Scalar*,ExecutionSpace> type;
-  type a("A",100);
+  typedef typename Kokkos::View< Scalar*, ExecutionSpace > type;
+  type a( "A", 100 );
 /*
-  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
-  ASSERT_EQ(sum_plain_has_init_arg,0);
-  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
-  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg2,1);
-  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,0);
-
-  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueTypeArray<Scalar,ExecutionSpace>, Scalar[] >::value;
-  //ASSERT_EQ(sum_initjoinfinalvaluetypearray_has_init_arg,1);
-
-  //printf("Values Init: %i %i %i\n",sum_plain_has_init_arg,sum_initjoinfinalvaluetype_has_init_arg,sum_wronginitjoinfinalvaluetype_has_init_arg);
-
-  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_plain_has_join_arg,0);
-  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
-  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
-  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
-  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,0);
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit< SumPlain<Scalar, ExecutionSpace>, Scalar & >::value;
+  ASSERT_EQ( sum_plain_has_init_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_init_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_init_arg, 0 );
+
+  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit< SumInitJoinFinalValueTypeArray<Scalar, ExecutionSpace>, Scalar[] >::value;
+  //ASSERT_EQ( sum_initjoinfinalvaluetypearray_has_init_arg, 1 );
+
+  //printf( "Values Init: %i %i %i\n", sum_plain_has_init_arg, sum_initjoinfinalvaluetype_has_init_arg, sum_wronginitjoinfinalvaluetype_has_init_arg );
+
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumPlain<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_plain_has_join_arg, 0 );
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg, 1 );
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin< SumInitJoinFinalValueType2<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_initjoinfinalvaluetype_has_join_arg2, 1 );
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin< SumWrongInitJoinFinalValueType<Scalar, ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ( sum_wronginitjoinfinalvaluetype_has_join_arg, 0 );
+
+  //printf( "Values Join: %i %i %i\n", sum_plain_has_join_arg, sum_initjoinfinalvaluetype_has_join_arg, sum_wronginitjoinfinalvaluetype_has_join_arg );
 */
-  //printf("Values Join: %i %i %i\n",sum_plain_has_join_arg,sum_initjoinfinalvaluetype_has_join_arg,sum_wronginitjoinfinalvaluetype_has_join_arg);
 }
 
-}
+} // namespace
diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp
index 842131debb69b54ad08fd0eb90836510be50d7ca..7d096c24c38ee82a6930ed192858e538e345dc29 100644
--- a/lib/kokkos/core/unit_test/TestTile.hpp
+++ b/lib/kokkos/core/unit_test/TestTile.hpp
@@ -1,12 +1,12 @@
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -35,7 +35,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 
@@ -47,108 +47,96 @@
 
 namespace TestTile {
 
-template < typename Device , typename TileLayout>
+template < typename Device, typename TileLayout >
 struct ReduceTileErrors
 {
-  typedef Device execution_space ;
-
-  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device>  array_type;
-  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft , Device >  tile_type ;
-
-  array_type m_array ;
-
+  typedef Device execution_space;
+  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device >  array_type;
+  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft, Device >  tile_type;
   typedef ptrdiff_t value_type;
 
-  ReduceTileErrors( array_type a )
-    : m_array(a)
-  {}
+  array_type m_array;
 
+  ReduceTileErrors( array_type a ) : m_array( a ) {}
 
   KOKKOS_INLINE_FUNCTION
-  static void init( value_type & errors )
-  {
-    errors = 0;
-  }
+  static void init( value_type & errors ) { errors = 0; }
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & errors ,
+  static void join( volatile value_type & errors,
                     const volatile value_type & src_errors )
   {
     errors += src_errors;
   }
 
-  // Initialize
+  // Initialize.
   KOKKOS_INLINE_FUNCTION
   void operator()( size_t iwork ) const
   {
     const size_t i = iwork % m_array.dimension_0();
     const size_t j = iwork / m_array.dimension_0();
-    if ( j < m_array.dimension_1() ) {
-      m_array(i,j) = & m_array(i,j) - & m_array(0,0);
 
-// printf("m_array(%d,%d) = %d\n",int(i),int(j),int(m_array(i,j)));
+    if ( j < m_array.dimension_1() ) {
+      m_array( i, j ) = &m_array( i, j ) - &m_array( 0, 0 );
 
+      //printf( "m_array(%d, %d) = %d\n", int( i ), int( j ), int( m_array( i, j ) ) );
     }
   }
 
   // Verify:
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t iwork , value_type & errors ) const
+  void operator()( size_t iwork, value_type & errors ) const
   {
-    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0 ;
-    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1 ;
+    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0;
+    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1;
 
-    const size_t itile = iwork % tile_dim0 ;
-    const size_t jtile = iwork / tile_dim0 ;
+    const size_t itile = iwork % tile_dim0;
+    const size_t jtile = iwork / tile_dim0;
 
     if ( jtile < tile_dim1 ) {
+      tile_type tile = Kokkos::Experimental::tile_subview( m_array, itile, jtile );
 
-      tile_type tile = Kokkos::Experimental::tile_subview( m_array , itile , jtile );
-
-      if ( tile(0,0) != ptrdiff_t(( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
-        ++errors ;
+      if ( tile( 0, 0 ) != ptrdiff_t( ( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
+        ++errors;
       }
       else {
+        for ( size_t j = 0; j < size_t( TileLayout::N1 ); ++j ) {
+          for ( size_t i = 0; i < size_t( TileLayout::N0 ); ++i ) {
+            const size_t iglobal = i + itile * TileLayout::N0;
+            const size_t jglobal = j + jtile * TileLayout::N1;
 
-        for ( size_t j = 0 ; j < size_t(TileLayout::N1) ; ++j ) {
-        for ( size_t i = 0 ; i < size_t(TileLayout::N0) ; ++i ) {
-          const size_t iglobal = i + itile * TileLayout::N0 ;
-          const size_t jglobal = j + jtile * TileLayout::N1 ;
-
-          if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
-            if ( tile(i,j) != ptrdiff_t( tile(0,0) + i + j * TileLayout::N0 ) ) ++errors ;
-
-// printf("tile(%d,%d)(%d,%d) = %d\n",int(itile),int(jtile),int(i),int(j),int(tile(i,j)));
+            if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
+              if ( tile( i, j ) != ptrdiff_t( tile( 0, 0 ) + i + j * TileLayout::N0 ) ) ++errors;
 
+              //printf( "tile(%d, %d)(%d, %d) = %d\n", int( itile ), int( jtile ), int( i ), int( j ), int( tile( i, j ) ) );
+            }
           }
         }
-        }
       }
     }
   }
 };
 
-template< class Space , unsigned N0 , unsigned N1 >
-void test( const size_t dim0 , const size_t dim1 )
+template< class Space, unsigned N0, unsigned N1 >
+void test( const size_t dim0, const size_t dim1 )
 {
-  typedef Kokkos::LayoutTileLeft<N0,N1>  array_layout ;
-  typedef ReduceTileErrors< Space , array_layout > functor_type ;
+  typedef Kokkos::LayoutTileLeft< N0, N1 >  array_layout;
+  typedef ReduceTileErrors< Space, array_layout > functor_type;
 
-  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0 ;
-  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1 ;
-  
-  typename functor_type::array_type array("",dim0,dim1);
+  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0;
+  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1;
 
-  Kokkos::parallel_for( Kokkos::RangePolicy<Space,size_t>(0,dim0*dim1) , functor_type( array ) );
+  typename functor_type::array_type array( "", dim0, dim1 );
 
-  ptrdiff_t error = 0 ;
+  Kokkos::parallel_for( Kokkos::RangePolicy< Space, size_t >( 0, dim0 * dim1 ), functor_type( array ) );
 
-  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space,size_t>(0,tile_dim0*tile_dim1) , functor_type( array ) , error );
+  ptrdiff_t error = 0;
 
-  EXPECT_EQ( error , ptrdiff_t(0) );
+  Kokkos::parallel_reduce( Kokkos::RangePolicy< Space, size_t >( 0, tile_dim0 * tile_dim1 ), functor_type( array ), error );
+
+  EXPECT_EQ( error, ptrdiff_t( 0 ) );
 }
 
-} /* namespace TestTile */
+} // namespace TestTile
 
 #endif //TEST_TILE_HPP
-
diff --git a/lib/kokkos/core/unit_test/TestUtilities.hpp b/lib/kokkos/core/unit_test/TestUtilities.hpp
index 947be03e399bee3c23f4c4f333c34c0e6a9d4d08..be4a93b8942cdfd69e97f68b9ea109a2be10de19 100644
--- a/lib/kokkos/core/unit_test/TestUtilities.hpp
+++ b/lib/kokkos/core/unit_test/TestUtilities.hpp
@@ -49,258 +49,253 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 inline
 void test_utilities()
 {
   using namespace Kokkos::Impl;
+
   {
-    using i = integer_sequence<int>;
-    using j = make_integer_sequence<int,0>;
+    using i = integer_sequence< int >;
+    using j = make_integer_sequence< int, 0 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 0u, "Error: integer_sequence.size()" );
   }
 
-
   {
-    using i = integer_sequence<int,0>;
-    using j = make_integer_sequence<int,1>;
+    using i = integer_sequence< int, 0 >;
+    using j = make_integer_sequence< int, 1 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 1u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
   }
 
-
   {
-    using i = integer_sequence<int,0,1>;
-    using j = make_integer_sequence<int,2>;
+    using i = integer_sequence< int, 0, 1 >;
+    using j = make_integer_sequence< int, 2 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 2u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2>;
-    using j = make_integer_sequence<int,3>;
+    using i = integer_sequence< int, 0, 1, 2 >;
+    using j = make_integer_sequence< int, 3 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 3u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3>;
-    using j = make_integer_sequence<int,4>;
+    using i = integer_sequence< int, 0, 1, 2, 3 >;
+    using j = make_integer_sequence< int, 4 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 4u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
 
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4>;
-    using j = make_integer_sequence<int,5>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4 >;
+    using j = make_integer_sequence< int, 5 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 5u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5>;
-    using j = make_integer_sequence<int,6>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5 >;
+    using j = make_integer_sequence< int, 6 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 6u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6>;
-    using j = make_integer_sequence<int,7>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6 >;
+    using j = make_integer_sequence< int, 7 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 7u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7>;
-    using j = make_integer_sequence<int,8>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7 >;
+    using j = make_integer_sequence< int, 8 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 8u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7,8>;
-    using j = make_integer_sequence<int,9>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8 >;
+    using j = make_integer_sequence< int, 9 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 9u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<8, i>::value == 8, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = integer_sequence<int,0,1,2,3,4,5,6,7,8,9>;
-    using j = make_integer_sequence<int,10>;
+    using i = integer_sequence< int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 >;
+    using j = make_integer_sequence< int, 10 >;
 
-    static_assert( std::is_same<i,j>::value, "Error: make_integer_sequence" );
+    static_assert( std::is_same< i, j >::value, "Error: make_integer_sequence" );
     static_assert( i::size() == 10u, "Error: integer_sequence.size()" );
 
-    static_assert( integer_sequence_at<0, i>::value == 0, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<1, i>::value == 1, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<2, i>::value == 2, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<3, i>::value == 3, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<4, i>::value == 4, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<5, i>::value == 5, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<6, i>::value == 6, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<7, i>::value == 7, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<8, i>::value == 8, "Error: integer_sequence_at" );
-    static_assert( integer_sequence_at<9, i>::value == 9, "Error: integer_sequence_at" );
-
-    static_assert( at(0, i{}) == 0, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(1, i{}) == 1, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(2, i{}) == 2, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(3, i{}) == 3, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(4, i{}) == 4, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(5, i{}) == 5, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(6, i{}) == 6, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(7, i{}) == 7, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(8, i{}) == 8, "Error: at(unsigned, integer_sequence)" );
-    static_assert( at(9, i{}) == 9, "Error: at(unsigned, integer_sequence)" );
+    static_assert( integer_sequence_at< 0, i >::value == 0, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 1, i >::value == 1, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 2, i >::value == 2, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 3, i >::value == 3, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 4, i >::value == 4, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 5, i >::value == 5, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 6, i >::value == 6, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 7, i >::value == 7, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 8, i >::value == 8, "Error: integer_sequence_at" );
+    static_assert( integer_sequence_at< 9, i >::value == 9, "Error: integer_sequence_at" );
+
+    static_assert( at( 0, i{} ) == 0, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 1, i{} ) == 1, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 2, i{} ) == 2, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 3, i{} ) == 3, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 4, i{} ) == 4, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 5, i{} ) == 5, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 6, i{} ) == 6, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 7, i{} ) == 7, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 8, i{} ) == 8, "Error: at(unsigned, integer_sequence)" );
+    static_assert( at( 9, i{} ) == 9, "Error: at(unsigned, integer_sequence)" );
   }
 
   {
-    using i = make_integer_sequence<int, 5>;
-    using r = reverse_integer_sequence<i>;
-    using gr = integer_sequence<int, 4, 3, 2, 1, 0>;
+    using i = make_integer_sequence< int, 5 >;
+    using r = reverse_integer_sequence< i >;
+    using gr = integer_sequence< int, 4, 3, 2, 1, 0 >;
 
-    static_assert( std::is_same<r,gr>::value, "Error: reverse_integer_sequence" );
+    static_assert( std::is_same< r, gr >::value, "Error: reverse_integer_sequence" );
   }
 
   {
-    using s = make_integer_sequence<int,10>;
-    using e = exclusive_scan_integer_sequence<s>;
-    using i = inclusive_scan_integer_sequence<s>;
+    using s = make_integer_sequence< int, 10 >;
+    using e = exclusive_scan_integer_sequence< s >;
+    using i = inclusive_scan_integer_sequence< s >;
 
-    using ge = integer_sequence<int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36>;
-    using gi = integer_sequence<int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45>;
+    using ge = integer_sequence< int, 0, 0, 1, 3, 6, 10, 15, 21, 28, 36 >;
+    using gi = integer_sequence< int, 0, 1, 3, 6, 10, 15, 21, 28, 36, 45 >;
 
-    static_assert( e::value == 45, "Error: scan value");
-    static_assert( i::value == 45, "Error: scan value");
+    static_assert( e::value == 45, "Error: scan value" );
+    static_assert( i::value == 45, "Error: scan value" );
 
-    static_assert( std::is_same< e::type, ge >::value, "Error: exclusive_scan");
-    static_assert( std::is_same< i::type, gi >::value, "Error: inclusive_scan");
+    static_assert( std::is_same< e::type, ge >::value, "Error: exclusive_scan" );
+    static_assert( std::is_same< i::type, gi >::value, "Error: inclusive_scan" );
   }
-
-
 }
 
 } // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
index a96f31cc12f227a66097c595e1f0fb44dd17a8c4..cbf86dc58c78fb44442d08497874a667f3923efb 100644
--- a/lib/kokkos/core/unit_test/TestViewAPI.hpp
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,103 +48,92 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
-
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< class T , class ... P >
-size_t allocation_count( const Kokkos::View<T,P...> & view )
+template< class T, class ... P >
+size_t allocation_count( const Kokkos::View< T, P... > & view )
 {
   const size_t card  = view.size();
   const size_t alloc = view.span();
 
-  const int memory_span = Kokkos::View<int*>::required_allocation_size(100);
+  const int memory_span = Kokkos::View< int* >::required_allocation_size( 100 );
 
-  return (card <= alloc && memory_span == 400) ? alloc : 0 ;
+  return ( card <= alloc && memory_span == 400 ) ? alloc : 0;
 }
 
 /*--------------------------------------------------------------------------*/
 
-template< typename T, class DeviceType>
+template< typename T, class DeviceType >
 struct TestViewOperator
 {
-  typedef typename DeviceType::execution_space  execution_space ;
+  typedef typename DeviceType::execution_space  execution_space;
 
-  static const unsigned N = 100 ;
-  static const unsigned D = 3 ;
+  static const unsigned N = 100;
+  static const unsigned D = 3;
 
-  typedef Kokkos::View< T*[D] , execution_space > view_type ;
+  typedef Kokkos::View< T*[D], execution_space > view_type;
 
-  const view_type v1 ;
-  const view_type v2 ;
+  const view_type v1;
+  const view_type v2;
 
   TestViewOperator()
-    : v1( "v1" , N )
-    , v2( "v2" , N )
+    : v1( "v1", N )
+    , v2( "v2", N )
     {}
 
   static void testit()
   {
-    Kokkos::parallel_for( N , TestViewOperator() );
+    Kokkos::parallel_for( N, TestViewOperator() );
   }
 
   KOKKOS_INLINE_FUNCTION
   void operator()( const unsigned i ) const
   {
-    const unsigned X = 0 ;
-    const unsigned Y = 1 ;
-    const unsigned Z = 2 ;
+    const unsigned X = 0;
+    const unsigned Y = 1;
+    const unsigned Z = 2;
 
-    v2(i,X) = v1(i,X);
-    v2(i,Y) = v1(i,Y);
-    v2(i,Z) = v1(i,Z);
+    v2( i, X ) = v1( i, X );
+    v2( i, Y ) = v1( i, Y );
+    v2( i, Z ) = v1( i, Z );
   }
 };
 
 /*--------------------------------------------------------------------------*/
 
-template< class DataType ,
-          class DeviceType ,
+template< class DataType,
+          class DeviceType,
           unsigned Rank = Kokkos::ViewTraits< DataType >::rank >
-struct TestViewOperator_LeftAndRight ;
+struct TestViewOperator_LeftAndRight;
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 8 >
 {
-  typedef typename DeviceType::execution_space    execution_space ;
-  typedef typename DeviceType::memory_space       memory_space ;
-  typedef typename execution_space::size_type     size_type ;
+  typedef typename DeviceType::execution_space    execution_space;
+  typedef typename DeviceType::memory_space       memory_space;
+  typedef typename execution_space::size_type     size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -157,93 +146,89 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i7 = 0 ; i7 < unsigned(left.dimension_7()) ; ++i7 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i7 = 0; i7 < unsigned( left.dimension_7() ); ++i7 )
+    for ( unsigned i6 = 0; i6 < unsigned( left.dimension_6() ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) -
                      & left(  0,  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
-      if ( & left(i0,i1,i2,i3,i4,i5,i6,i7) !=
-           & left_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
-        update |= 4 ;
+      if ( & left( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & left_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 4;
       }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
-    for ( unsigned i7 = 0 ; i7 < unsigned(right.dimension_7()) ; ++i7 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.dimension_6() ); ++i6 )
+    for ( unsigned i7 = 0; i7 < unsigned( right.dimension_7() ); ++i7 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) -
                      & right(  0,  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
-      if ( & right(i0,i1,i2,i3,i4,i5,i6,i7) !=
-           & right_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
-        update |= 8 ;
+      if ( & right( i0, i1, i2, i3, i4, i5, i6, i7 ) !=
+           & right_stride( i0, i1, i2, i3, i4, i5, i6, i7 ) ) {
+        update |= 8;
       }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 7 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -254,81 +239,77 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i6 = 0 ; i6 < unsigned(left.dimension_6()) ; ++i6 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i6 = 0; i6 < unsigned( left.dimension_6() ); ++i6 )
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
                      & left(  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
-    for ( unsigned i6 = 0 ; i6 < unsigned(right.dimension_6()) ; ++i6 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
+    for ( unsigned i6 = 0; i6 < unsigned( right.dimension_6() ); ++i6 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
                      & right(  0,  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 6 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -339,84 +320,78 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i5 = 0 ; i5 < unsigned(left.dimension_5()) ; ++i5 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i5 = 0; i5 < unsigned( left.dimension_5() ); ++i5 )
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4, i5 ) -
                      & left(  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
-    for ( unsigned i5 = 0 ; i5 < unsigned(right.dimension_5()) ; ++i5 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
+    for ( unsigned i5 = 0; i5 < unsigned( right.dimension_5() ); ++i5 )
     {
       const long j = & right( i0, i1, i2, i3, i4, i5 ) -
                      & right(  0,  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 5 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -429,83 +404,79 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
-
-    offset = -1 ;
-    for ( unsigned i4 = 0 ; i4 < unsigned(left.dimension_4()) ; ++i4 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    long offset = -1;
+
+    for ( unsigned i4 = 0; i4 < unsigned( left.dimension_4() ); ++i4 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3, i4 ) -
                      & left(  0,  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
       if ( & left( i0, i1, i2, i3, i4 ) !=
-           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4; }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
-    for ( unsigned i4 = 0 ; i4 < unsigned(right.dimension_4()) ; ++i4 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
+    for ( unsigned i4 = 0; i4 < unsigned( right.dimension_4() ); ++i4 )
     {
       const long j = & right( i0, i1, i2, i3, i4 ) -
                      & right(  0,  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
       if ( & right( i0, i1, i2, i3, i4 ) !=
-           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 4 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -516,84 +487,78 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i3 = 0 ; i3 < unsigned(left.dimension_3()) ; ++i3 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i3 = 0; i3 < unsigned( left.dimension_3() ); ++i3 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2, i3 ) -
                      & left(  0,  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
-    for ( unsigned i3 = 0 ; i3 < unsigned(right.dimension_3()) ; ++i3 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
+    for ( unsigned i3 = 0; i3 < unsigned( right.dimension_3() ); ++i3 )
     {
       const long j = & right( i0, i1, i2, i3 ) -
                      & right(  0,  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 3 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
-    : left(  std::string("left") )
-    , right( std::string("right") )
+    : left(  std::string( "left" ) )
+    , right( std::string( "right" ) )
     , left_stride( left )
     , right_stride( right )
     , left_alloc( allocation_count( left ) )
@@ -602,85 +567,81 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1, i2 ) -
                      & left(  0,  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
 
-      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+      if ( & left( i0, i1, i2 ) != & left_stride( i0, i1, i2 ) ) { update |= 4; }
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(right.dimension_2()) ; ++i2 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( right.dimension_2() ); ++i2 )
     {
       const long j = & right( i0, i1, i2 ) -
                      & right(  0,  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
 
-      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+      if ( & right( i0, i1, i2 ) != & right_stride( i0, i1, i2 ) ) { update |= 8; }
     }
 
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i2 = 0 ; i2 < unsigned(left.dimension_2()) ; ++i2 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i2 = 0; i2 < unsigned( left.dimension_2() ); ++i2 )
     {
-      if ( & left(i0,i1,i2)  != & left(i0,i1,i2,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0,i1,i2) != & right(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left( i0, i1, i2 )  != & left( i0, i1, i2, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1, i2 ) != & right( i0, i1, i2, 0, 0, 0, 0, 0 ) ) { update |= 3; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 2 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
 
-  left_view    left ;
-  right_view   right ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -691,83 +652,77 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    long offset ;
+    long offset = -1;
 
-    offset = -1 ;
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
       const long j = & left( i0, i1 ) -
                      & left(  0,  0 );
-      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
-      offset = j ;
+      if ( j <= offset || left_alloc <= j ) { update |= 1; }
+      offset = j;
     }
 
-    offset = -1 ;
-    for ( unsigned i0 = 0 ; i0 < unsigned(right.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(right.dimension_1()) ; ++i1 )
+    offset = -1;
+
+    for ( unsigned i0 = 0; i0 < unsigned( right.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( right.dimension_1() ); ++i1 )
     {
       const long j = & right( i0, i1 ) -
                      & right(  0,  0 );
-      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
-      offset = j ;
+      if ( j <= offset || right_alloc <= j ) { update |= 2; }
+      offset = j;
     }
 
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
-    for ( unsigned i1 = 0 ; i1 < unsigned(left.dimension_1()) ; ++i1 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
+    for ( unsigned i1 = 0; i1 < unsigned( left.dimension_1() ); ++i1 )
     {
-      if ( & left(i0,i1)  != & left(i0,i1,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0,i1) != & right(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left( i0, i1 )  != & left( i0, i1, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0, i1 ) != & right( i0, i1, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
     }
   }
 };
 
-template< class DataType , class DeviceType >
-struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+template< class DataType, class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType, DeviceType, 1 >
 {
-  typedef typename DeviceType::execution_space  execution_space ;
-  typedef typename DeviceType::memory_space     memory_space ;
-  typedef typename execution_space::size_type   size_type ;
+  typedef typename DeviceType::execution_space  execution_space;
+  typedef typename DeviceType::memory_space     memory_space;
+  typedef typename execution_space::size_type   size_type;
 
-  typedef int value_type ;
+  typedef int value_type;
 
   KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & update ,
+  static void join( volatile value_type & update,
                     const volatile value_type & input )
-    { update |= input ; }
+  { update |= input; }
 
   KOKKOS_INLINE_FUNCTION
   static void init( value_type & update )
-    { update = 0 ; }
-
+  { update = 0; }
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+  typedef Kokkos::View< DataType, Kokkos::LayoutLeft, execution_space > left_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutRight, execution_space > right_view;
+  typedef Kokkos::View< DataType, Kokkos::LayoutStride, execution_space > stride_view;
 
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
-
-  typedef Kokkos::
-    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
-
-  left_view    left ;
-  right_view   right ;
-  stride_view  left_stride ;
-  stride_view  right_stride ;
-  long         left_alloc ;
-  long         right_alloc ;
+  left_view    left;
+  right_view   right;
+  stride_view  left_stride;
+  stride_view  right_stride;
+  long         left_alloc;
+  long         right_alloc;
 
   TestViewOperator_LeftAndRight()
     : left(  "left" )
@@ -780,78 +735,75 @@ struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
 
   static void testit()
   {
-    TestViewOperator_LeftAndRight driver ;
+    TestViewOperator_LeftAndRight driver;
 
-    int error_flag = 0 ;
+    int error_flag = 0;
 
-    Kokkos::parallel_reduce( 1 , driver , error_flag );
+    Kokkos::parallel_reduce( 1, driver, error_flag );
 
-    ASSERT_EQ( error_flag , 0 );
+    ASSERT_EQ( error_flag, 0 );
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const size_type , value_type & update ) const
+  void operator()( const size_type, value_type & update ) const
   {
-    for ( unsigned i0 = 0 ; i0 < unsigned(left.dimension_0()) ; ++i0 )
+    for ( unsigned i0 = 0; i0 < unsigned( left.dimension_0() ); ++i0 )
     {
-      if ( & left(i0)  != & left(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
-      if ( & right(i0) != & right(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
-      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
-      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
+      if ( & left( i0 )  != & left( i0, 0, 0, 0, 0, 0, 0, 0 ) )  { update |= 3; }
+      if ( & right( i0 ) != & right( i0, 0, 0, 0, 0, 0, 0, 0 ) ) { update |= 3; }
+      if ( & left( i0 )  != & left_stride( i0 ) ) { update |= 4; }
+      if ( & right( i0 ) != & right_stride( i0 ) ) { update |= 8; }
     }
   }
 };
 
-template<class Layout, class DeviceType>
-struct TestViewMirror {
-
-  template<class MemoryTraits>
+template< class Layout, class DeviceType >
+struct TestViewMirror
+{
+  template< class MemoryTraits >
   void static test_mirror() {
-    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
-    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
-    auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
-    auto a_d = Kokkos::create_mirror(DeviceType(),a_h);
-
-    int equal_ptr_h_h2  = (a_h.data() ==a_h2.data())?1:0;
-    int equal_ptr_h_d   = (a_h.data() ==a_d. data())?1:0;
-    int equal_ptr_h2_d  = (a_h2.data()==a_d. data())?1:0;
-
-    ASSERT_EQ(equal_ptr_h_h2,0);
-    ASSERT_EQ(equal_ptr_h_d ,0);
-    ASSERT_EQ(equal_ptr_h2_d,0);
-    
-
-    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
-    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
-  }
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror( DeviceType(), a_h );
 
+    int equal_ptr_h_h2 = ( a_h.data()  == a_h2.data() ) ? 1 : 0;
+    int equal_ptr_h_d  = ( a_h.data()  ==  a_d.data() ) ? 1 : 0;
+    int equal_ptr_h2_d = ( a_h2.data() ==  a_d.data() ) ? 1 : 0;
 
-  template<class MemoryTraits>
-  void static test_mirror_view() {
-    Kokkos::View<double*, Layout, Kokkos::HostSpace> a_org("A",1000);
-    Kokkos::View<double*, Layout, Kokkos::HostSpace, MemoryTraits> a_h = a_org;
-    auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
-    auto a_d = Kokkos::create_mirror_view(DeviceType(),a_h);
-
-    int equal_ptr_h_h2  = a_h.data() ==a_h2.data()?1:0;
-    int equal_ptr_h_d   = a_h.data() ==a_d. data()?1:0;
-    int equal_ptr_h2_d  = a_h2.data()==a_d. data()?1:0;
-
-    int is_same_memspace = std::is_same<Kokkos::HostSpace,typename DeviceType::memory_space>::value?1:0; 
-    ASSERT_EQ(equal_ptr_h_h2,1);
-    ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
-    ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
+    ASSERT_EQ( equal_ptr_h_h2, 0 );
+    ASSERT_EQ( equal_ptr_h_d, 0 );
+    ASSERT_EQ( equal_ptr_h2_d, 0 );
 
+    ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
+    ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
+  }
 
-    ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
-    ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
-  } 
+  template< class MemoryTraits >
+  void static test_mirror_view() {
+    Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 1000 );
+    Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
+    auto a_h2 = Kokkos::create_mirror_view( Kokkos::HostSpace(), a_h );
+    auto a_d = Kokkos::create_mirror_view( DeviceType(), a_h );
+
+    int equal_ptr_h_h2 = a_h.data()  == a_h2.data() ? 1 : 0;
+    int equal_ptr_h_d  = a_h.data()  ==  a_d.data() ? 1 : 0;
+    int equal_ptr_h2_d = a_h2.data() ==  a_d.data() ? 1 : 0;
+
+    int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
+    ASSERT_EQ( equal_ptr_h_h2, 1 );
+    ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
+    ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
+
+    ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
+    ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
+  }
 
   void static testit() {
-    test_mirror<Kokkos::MemoryTraits<0>>();
-    test_mirror<Kokkos::MemoryTraits<Kokkos::Unmanaged>>();
-    test_mirror_view<Kokkos::MemoryTraits<0>>();
-    test_mirror_view<Kokkos::MemoryTraits<Kokkos::Unmanaged>>();
+    test_mirror< Kokkos::MemoryTraits<0> >();
+    test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
+    test_mirror_view< Kokkos::MemoryTraits<0> >();
+    test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
   }
 };
 
@@ -861,23 +813,21 @@ template< typename T, class DeviceType >
 class TestViewAPI
 {
 public:
-  typedef DeviceType        device ;
+  typedef DeviceType device;
 
-  enum { N0 = 1000 ,
-         N1 = 3 ,
-         N2 = 5 ,
+  enum { N0 = 1000,
+         N1 = 3,
+         N2 = 5,
          N3 = 7 };
 
-  typedef Kokkos::View< T , device > dView0 ;
-  typedef Kokkos::View< T* , device > dView1 ;
-  typedef Kokkos::View< T*[N1] , device > dView2 ;
-  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
-  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
-  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
-
-  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
-
-  typedef typename dView0::host_mirror_space host ;
+  typedef Kokkos::View< T, device > dView0;
+  typedef Kokkos::View< T*, device > dView1;
+  typedef Kokkos::View< T*[N1], device > dView2;
+  typedef Kokkos::View< T*[N1][N2], device > dView3;
+  typedef Kokkos::View< T*[N1][N2][N3], device > dView4;
+  typedef Kokkos::View< const T*[N1][N2][N3], device > const_dView4;
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged;
+  typedef typename dView0::host_mirror_space host;
 
   TestViewAPI()
   {
@@ -889,41 +839,38 @@ public:
     run_test_subview_strided();
     run_test_vector();
 
-    TestViewOperator< T , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4][2] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3][4] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2][3] , device >::testit();
-    TestViewOperator_LeftAndRight< int[2] , device >::testit();
-    TestViewMirror<Kokkos::LayoutLeft, device >::testit(); 
-    TestViewMirror<Kokkos::LayoutRight, device >::testit(); 
-
+    TestViewOperator< T, device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4], device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3], device >::testit();
+    TestViewOperator_LeftAndRight< int[2], device >::testit();
+    TestViewMirror< Kokkos::LayoutLeft, device >::testit();
+    TestViewMirror< Kokkos::LayoutRight, device >::testit();
   }
 
   static void run_test_mirror()
   {
-    typedef Kokkos::View< int , host > view_type ;
-    typedef typename view_type::HostMirror mirror_type ;
+    typedef Kokkos::View< int, host > view_type;
+    typedef typename view_type::HostMirror mirror_type;
 
-    static_assert( std::is_same< typename view_type::memory_space
-                               , typename mirror_type::memory_space
-                               >::value , "" );
+    static_assert( std::is_same< typename view_type::memory_space, typename mirror_type::memory_space >::value, "" );
 
-    view_type a("a");
-    mirror_type am = Kokkos::create_mirror_view(a);
-    mirror_type ax = Kokkos::create_mirror(a);
-    ASSERT_EQ( & a() , & am() );
+    view_type a( "a" );
+    mirror_type am = Kokkos::create_mirror_view( a );
+    mirror_type ax = Kokkos::create_mirror( a );
+    ASSERT_EQ( & a(), & am() );
   }
 
   static void run_test_scalar()
   {
-    typedef typename dView0::HostMirror  hView0 ;
+    typedef typename dView0::HostMirror  hView0;
 
-    dView0 dx , dy ;
-    hView0 hx , hy ;
+    dView0 dx, dy;
+    hView0 hx, hy;
 
     dx = dView0( "dx" );
     dy = dView0( "dy" );
@@ -931,11 +878,11 @@ public:
     hx = Kokkos::create_mirror( dx );
     hy = Kokkos::create_mirror( dy );
 
-    hx() = 1 ;
+    hx() = 1;
 
-    Kokkos::deep_copy( dx , hx );
-    Kokkos::deep_copy( dy , dx );
-    Kokkos::deep_copy( hy , dy );
+    Kokkos::deep_copy( dx, hx );
+    Kokkos::deep_copy( dy, dx );
+    Kokkos::deep_copy( hy, dy );
 
     ASSERT_EQ( hx(), hy() );
   }
@@ -948,11 +895,11 @@ public:
     // usual "(void)" marker to avoid compiler warnings for unused
     // variables.
 
-    typedef typename dView0::HostMirror  hView0 ;
-    typedef typename dView1::HostMirror  hView1 ;
-    typedef typename dView2::HostMirror  hView2 ;
-    typedef typename dView3::HostMirror  hView3 ;
-    typedef typename dView4::HostMirror  hView4 ;
+    typedef typename dView0::HostMirror  hView0;
+    typedef typename dView1::HostMirror  hView1;
+    typedef typename dView2::HostMirror  hView2;
+    typedef typename dView3::HostMirror  hView3;
+    typedef typename dView4::HostMirror  hView4;
 
     {
       hView0 thing;
@@ -975,8 +922,8 @@ public:
       (void) thing;
     }
 
-    dView4 dx , dy , dz ;
-    hView4 hx , hy , hz ;
+    dView4 dx, dy, dz;
+    hView4 hx, hy, hz;
 
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
@@ -984,220 +931,239 @@ public:
     ASSERT_TRUE( hx.ptr_on_device() == 0 );
     ASSERT_TRUE( hy.ptr_on_device() == 0 );
     ASSERT_TRUE( hz.ptr_on_device() == 0 );
-    ASSERT_EQ( dx.dimension_0() , 0u );
-    ASSERT_EQ( dy.dimension_0() , 0u );
-    ASSERT_EQ( dz.dimension_0() , 0u );
-    ASSERT_EQ( hx.dimension_0() , 0u );
-    ASSERT_EQ( hy.dimension_0() , 0u );
-    ASSERT_EQ( hz.dimension_0() , 0u );
-    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dz.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( hz.dimension_1() , unsigned(N1) );
-
-    dx = dView4( "dx" , N0 );
-    dy = dView4( "dy" , N0 );
-
-    ASSERT_EQ( dx.use_count() , size_t(1) );
+    ASSERT_EQ( dx.dimension_0(), 0u );
+    ASSERT_EQ( dy.dimension_0(), 0u );
+    ASSERT_EQ( dz.dimension_0(), 0u );
+    ASSERT_EQ( hx.dimension_0(), 0u );
+    ASSERT_EQ( hy.dimension_0(), 0u );
+    ASSERT_EQ( hz.dimension_0(), 0u );
+    ASSERT_EQ( dx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dz.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( hz.dimension_1(), unsigned( N1 ) );
+
+    dx = dView4( "dx", N0 );
+    dy = dView4( "dy", N0 );
+
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
 
     dView4_unmanaged unmanaged_dx = dx;
-    ASSERT_EQ( dx.use_count() , size_t(1) );
+    ASSERT_EQ( dx.use_count(), size_t( 1 ) );
 
-    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(dx.ptr_on_device(),
-                                                              dx.dimension_0(),
-                                                              dx.dimension_1(),
-                                                              dx.dimension_2(),
-                                                              dx.dimension_3());
+    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged( dx.ptr_on_device(),
+                                                               dx.dimension_0(),
+                                                               dx.dimension_1(),
+                                                               dx.dimension_2(),
+                                                               dx.dimension_3() );
 
     {
-      // Destruction of this view should be harmless
-      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
-                                                dx.dimension_0() ,
-                                                dx.dimension_1() ,
-                                                dx.dimension_2() ,
+      // Destruction of this view should be harmless.
+      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device(),
+                                                dx.dimension_0(),
+                                                dx.dimension_1(),
+                                                dx.dimension_2(),
                                                 dx.dimension_3() );
     }
 
-    const_dView4 const_dx = dx ;
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+    const_dView4 const_dx = dx;
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
     {
       const_dView4 const_dx2;
       const_dx2 = const_dx;
-      ASSERT_EQ( dx.use_count() , size_t(3) );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
 
       const_dx2 = dy;
-      ASSERT_EQ( dx.use_count() , size_t(2) );
+      ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
-      const_dView4 const_dx3(dx);
-      ASSERT_EQ( dx.use_count() , size_t(3) );
-      
-      dView4_unmanaged dx4_unmanaged(dx);
-      ASSERT_EQ( dx.use_count() , size_t(3) );
-    }
+      const_dView4 const_dx3( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
 
-    ASSERT_EQ( dx.use_count() , size_t(2) );
+      dView4_unmanaged dx4_unmanaged( dx );
+      ASSERT_EQ( dx.use_count(), size_t( 3 ) );
+    }
 
+    ASSERT_EQ( dx.use_count(), size_t( 2 ) );
 
     ASSERT_FALSE( dx.ptr_on_device() == 0 );
     ASSERT_FALSE( const_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( unmanaged_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( unmanaged_from_ptr_dx.ptr_on_device() == 0 );
     ASSERT_FALSE( dy.ptr_on_device() == 0 );
-    ASSERT_NE( dx , dy );
+    ASSERT_NE( dx, dy );
 
-    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dx.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( dx.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( dx.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( dx.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dx.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( dx.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
-    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_2() , unsigned(N2) );
-    ASSERT_EQ( dy.dimension_3() , unsigned(N3) );
+    ASSERT_EQ( dy.dimension_0(), unsigned( N0 ) );
+    ASSERT_EQ( dy.dimension_1(), unsigned( N1 ) );
+    ASSERT_EQ( dy.dimension_2(), unsigned( N2 ) );
+    ASSERT_EQ( dy.dimension_3(), unsigned( N3 ) );
 
-    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(), unsigned( N0 ) * unsigned( N1 ) * unsigned( N2 ) * unsigned( N3 ) );
 
     hx = Kokkos::create_mirror( dx );
     hy = Kokkos::create_mirror( dy );
 
-    // T v1 = hx() ;    // Generates compile error as intended
-    // T v2 = hx(0,0) ; // Generates compile error as intended
-    // hx(0,0) = v2 ;   // Generates compile error as intended
+    // T v1 = hx();       // Generates compile error as intended.
+    // T v2 = hx( 0, 0 ); // Generates compile error as intended.
+    // hx( 0, 0 ) = v2;   // Generates compile error as intended.
 
     // Testing with asynchronous deep copy with respect to device
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-
-      Kokkos::deep_copy(typename hView4::execution_space(), dx , hx );
-      Kokkos::deep_copy(typename hView4::execution_space(), dy , dx );
-      Kokkos::deep_copy(typename hView4::execution_space(), hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy(typename hView4::execution_space(), dx , T(0) );
-      Kokkos::deep_copy(typename hView4::execution_space(), hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename hView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename hView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename hView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename hView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
 
-    // Testing with asynchronous deep copy with respect to host
+    // Testing with asynchronous deep copy with respect to host.
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-      Kokkos::deep_copy(typename dView4::execution_space(), dx , hx );
-      Kokkos::deep_copy(typename dView4::execution_space(), dy , dx );
-      Kokkos::deep_copy(typename dView4::execution_space(), hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy(typename dView4::execution_space(), dx , T(0) );
-      Kokkos::deep_copy(typename dView4::execution_space(), hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, hx );
+      Kokkos::deep_copy( typename dView4::execution_space(), dy, dx );
+      Kokkos::deep_copy( typename dView4::execution_space(), hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( typename dView4::execution_space(), dx, T( 0 ) );
+      Kokkos::deep_copy( typename dView4::execution_space(), hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
 
-    // Testing with synchronous deep copy
+    // Testing with synchronous deep copy.
     {
-      size_t count = 0 ;
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
-        hx(ip,i1,i2,i3) = ++count ;
-      }}}}
-
-      Kokkos::deep_copy( dx , hx );
-      Kokkos::deep_copy( dy , dx );
-      Kokkos::deep_copy( hy , dy );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
-      }}}}
-
-      Kokkos::deep_copy( dx , T(0) );
-      Kokkos::deep_copy( hx , dx );
-
-      for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
-      for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-      for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-      for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-        { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
-      }}}}
+      size_t count = 0;
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < hx.dimension_1(); ++i1 )
+      for ( size_t i2 = 0; i2 < hx.dimension_2(); ++i2 )
+      for ( size_t i3 = 0; i3 < hx.dimension_3(); ++i3 )
+      {
+        hx( ip, i1, i2, i3 ) = ++count;
+      }
+
+      Kokkos::deep_copy( dx, hx );
+      Kokkos::deep_copy( dy, dx );
+      Kokkos::deep_copy( hy, dy );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), hy( ip, i1, i2, i3 ) );
+      }
+
+      Kokkos::deep_copy( dx, T( 0 ) );
+      Kokkos::deep_copy( hx, dx );
+
+      for ( size_t ip = 0; ip < N0; ++ip )
+      for ( size_t i1 = 0; i1 < N1; ++i1 )
+      for ( size_t i2 = 0; i2 < N2; ++i2 )
+      for ( size_t i3 = 0; i3 < N3; ++i3 )
+      {
+        ASSERT_EQ( hx( ip, i1, i2, i3 ), T( 0 ) );
+      }
     }
-    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
-    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+
+    dz = dx;
+    ASSERT_EQ( dx, dz );
+    ASSERT_NE( dy, dz );
+
+    dz = dy;
+    ASSERT_EQ( dy, dz );
+    ASSERT_NE( dx, dz );
 
     dx = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_FALSE( dy.ptr_on_device() == 0 );
     ASSERT_FALSE( dz.ptr_on_device() == 0 );
+
     dy = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
     ASSERT_FALSE( dz.ptr_on_device() == 0 );
+
     dz = dView4();
     ASSERT_TRUE( dx.ptr_on_device() == 0 );
     ASSERT_TRUE( dy.ptr_on_device() == 0 );
     ASSERT_TRUE( dz.ptr_on_device() == 0 );
   }
 
-  typedef T DataType[2] ;
+  typedef T DataType[2];
 
   static void
   check_auto_conversion_to_const(
-     const Kokkos::View< const DataType , device > & arg_const ,
-     const Kokkos::View< DataType , device > & arg )
+     const Kokkos::View< const DataType, device > & arg_const,
+     const Kokkos::View< DataType, device > & arg )
   {
     ASSERT_TRUE( arg_const == arg );
   }
 
   static void run_test_const()
   {
-    typedef Kokkos::View< DataType , device > typeX ;
-    typedef Kokkos::View< const DataType , device > const_typeX ;
-    typedef Kokkos::View< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typedef Kokkos::View< DataType, device > typeX;
+    typedef Kokkos::View< const DataType, device > const_typeX;
+    typedef Kokkos::View< const DataType, device, Kokkos::MemoryRandomAccess > const_typeR;
+
     typeX x( "X" );
-    const_typeX xc = x ;
-    const_typeR xr = x ;
+    const_typeX xc = x;
+    const_typeR xr = x;
 
     ASSERT_TRUE( xc == x );
     ASSERT_TRUE( x == xc );
@@ -1206,144 +1172,142 @@ public:
     // an lvalue reference due to retrieving through texture cache
     // therefore not allowed to query the underlying pointer.
 #if defined( KOKKOS_ENABLE_CUDA )
-    if ( ! std::is_same< typename device::execution_space , Kokkos::Cuda >::value )
+    if ( !std::is_same< typename device::execution_space, Kokkos::Cuda >::value )
 #endif
     {
       ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
     }
 
-    // typeX xf = xc ; // setting non-const from const must not compile
+    // typeX xf = xc; // Setting non-const from const must not compile.
 
-    check_auto_conversion_to_const( x , x );
+    check_auto_conversion_to_const( x, x );
   }
 
   static void run_test_subview()
   {
-    typedef Kokkos::View< const T , device > sView ;
+    typedef Kokkos::View< const T, device > sView;  // Const-T view type that every assignment below must convert to.
 
     dView0 d0( "d0" );
-    dView1 d1( "d1" , N0 );
-    dView2 d2( "d2" , N0 );
-    dView3 d3( "d3" , N0 );
-    dView4 d4( "d4" , N0 );
-
-    sView s0 = d0 ;
-    sView s1 = Kokkos::subview( d1 , 1 );
-    sView s2 = Kokkos::subview( d2 , 1 , 1 );
-    sView s3 = Kokkos::subview( d3 , 1 , 1 , 1 );
-    sView s4 = Kokkos::subview( d4 , 1 , 1 , 1 , 1 );
+    dView1 d1( "d1", N0 );  // NOTE(review): d1..d4 each receive only N0; presumably the other extents are compile-time - confirm against the dViewN typedefs.
+    dView2 d2( "d2", N0 );
+    dView3 d3( "d3", N0 );
+    dView4 d4( "d4", N0 );
+
+    sView s0 = d0;  // This function has no ASSERT_* calls; it only exercises these assignments to sView.
+    sView s1 = Kokkos::subview( d1, 1 );  // Every subview argument after the view is the integer index 1 (no ranges, no ALL).
+    sView s2 = Kokkos::subview( d2, 1, 1 );
+    sView s3 = Kokkos::subview( d3, 1, 1, 1 );
+    sView s4 = Kokkos::subview( d4, 1, 1, 1, 1 );
   }
 
   static void run_test_subview_strided()
   {
-    typedef Kokkos::View< int **** , Kokkos::LayoutLeft  , host >  view_left_4 ;
-    typedef Kokkos::View< int **** , Kokkos::LayoutRight , host >  view_right_4 ;
-    typedef Kokkos::View< int **   , Kokkos::LayoutLeft  , host >  view_left_2 ;
-    typedef Kokkos::View< int **   , Kokkos::LayoutRight , host >  view_right_2 ;
-
-    typedef Kokkos::View< int * ,  Kokkos::LayoutStride , host >  view_stride_1 ;
-    typedef Kokkos::View< int ** ,  Kokkos::LayoutStride , host >  view_stride_2 ;
-
-    view_left_2  xl2("xl2", 100 , 200 );
-    view_right_2 xr2("xr2", 100 , 200 );
-    view_stride_1  yl1 = Kokkos::subview( xl2 , 0 , Kokkos::ALL() );
-    view_stride_1  yl2 = Kokkos::subview( xl2 , 1 , Kokkos::ALL() );
-    view_stride_1  yr1 = Kokkos::subview( xr2 , 0 , Kokkos::ALL() );
-    view_stride_1  yr2 = Kokkos::subview( xr2 , 1 , Kokkos::ALL() );
-
-    ASSERT_EQ( yl1.dimension_0() , xl2.dimension_1() );
-    ASSERT_EQ( yl2.dimension_0() , xl2.dimension_1() );
-    ASSERT_EQ( yr1.dimension_0() , xr2.dimension_1() );
-    ASSERT_EQ( yr2.dimension_0() , xr2.dimension_1() );
-
-    ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 );
-    ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 );
-    ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 );
-    ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 );
-
-    view_left_4 xl4( "xl4" , 10 , 20 , 30 , 40 );
-    view_right_4 xr4( "xr4" , 10 , 20 , 30 , 40 );
-
-    view_stride_2 yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-    view_stride_2 yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-
-    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
-    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
-    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
-    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
-
-    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
-    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
+    typedef Kokkos::View< int ****, Kokkos::LayoutLeft , host >  view_left_4;  // Parent views: LayoutLeft and LayoutRight, rank 2 and rank 4.
+    typedef Kokkos::View< int ****, Kokkos::LayoutRight, host >  view_right_4;
+    typedef Kokkos::View< int **  , Kokkos::LayoutLeft , host >  view_left_2;
+    typedef Kokkos::View< int **  , Kokkos::LayoutRight, host >  view_right_2;
+
+    typedef Kokkos::View< int * ,  Kokkos::LayoutStride, host >  view_stride_1;  // LayoutStride types that receive the subviews below.
+    typedef Kokkos::View< int **,  Kokkos::LayoutStride, host >  view_stride_2;
+
+    view_left_2  xl2( "xl2", 100, 200 );
+    view_right_2 xr2( "xr2", 100, 200 );
+    view_stride_1 yl1 = Kokkos::subview( xl2, 0, Kokkos::ALL() );  // Fix the first index (0 or 1) and keep all of the second dimension.
+    view_stride_1 yl2 = Kokkos::subview( xl2, 1, Kokkos::ALL() );
+    view_stride_1 yr1 = Kokkos::subview( xr2, 0, Kokkos::ALL() );
+    view_stride_1 yr2 = Kokkos::subview( xr2, 1, Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.dimension_0(), xl2.dimension_1() );  // The subview's only extent must equal the parent's kept (second) extent.
+    ASSERT_EQ( yl2.dimension_0(), xl2.dimension_1() );
+    ASSERT_EQ( yr1.dimension_0(), xr2.dimension_1() );
+    ASSERT_EQ( yr2.dimension_0(), xr2.dimension_1() );
+
+    ASSERT_EQ( & yl1( 0 ) - & xl2( 0, 0 ), 0 );  // Zero pointer difference: subview element 0 is the same address as the parent element.
+    ASSERT_EQ( & yl2( 0 ) - & xl2( 1, 0 ), 0 );
+    ASSERT_EQ( & yr1( 0 ) - & xr2( 0, 0 ), 0 );
+    ASSERT_EQ( & yr2( 0 ) - & xr2( 1, 0 ), 0 );
+
+    view_left_4 xl4( "xl4", 10, 20, 30, 40 );
+    view_right_4 xr4( "xr4", 10, 20, 30, 40 );
+
+    view_stride_2 yl4 = Kokkos::subview( xl4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );  // Fix dimensions 0 and 2 (at 1 and 2); keep dimensions 1 and 3.
+    view_stride_2 yr4 = Kokkos::subview( xr4, 1, Kokkos::ALL(), 2, Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.dimension_0(), xl4.dimension_1() );  // Kept parent dimensions 1 and 3 become subview dimensions 0 and 1.
+    ASSERT_EQ( yl4.dimension_1(), xl4.dimension_3() );
+    ASSERT_EQ( yr4.dimension_0(), xr4.dimension_1() );
+    ASSERT_EQ( yr4.dimension_1(), xr4.dimension_3() );
+
+    ASSERT_EQ( & yl4( 4, 4 ) - & xl4( 1, 4, 2, 4 ), 0 );  // Spot check: subview ( 4, 4 ) and parent ( 1, 4, 2, 4 ) share one address.
+    ASSERT_EQ( & yr4( 4, 4 ) - & xr4( 1, 4, 2, 4 ), 0 );
   }
 
   static void run_test_vector()
   {
-    static const unsigned Length = 1000 , Count = 8 ;
+    static const unsigned Length = 1000, Count = 8;
 
-    typedef Kokkos::View< T* ,  Kokkos::LayoutLeft , host > vector_type ;
-    typedef Kokkos::View< T** , Kokkos::LayoutLeft , host > multivector_type ;
+    typedef Kokkos::View< T*,  Kokkos::LayoutLeft, host > vector_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutLeft, host > multivector_type;
 
-    typedef Kokkos::View< T* ,  Kokkos::LayoutRight , host > vector_right_type ;
-    typedef Kokkos::View< T** , Kokkos::LayoutRight , host > multivector_right_type ;
+    typedef Kokkos::View< T*,  Kokkos::LayoutRight, host > vector_right_type;
+    typedef Kokkos::View< T**, Kokkos::LayoutRight, host > multivector_right_type;
 
-    typedef Kokkos::View< const T* , Kokkos::LayoutRight, host > const_vector_right_type ;
-    typedef Kokkos::View< const T* , Kokkos::LayoutLeft , host > const_vector_type ;
-    typedef Kokkos::View< const T** , Kokkos::LayoutLeft , host > const_multivector_type ;
+    typedef Kokkos::View< const T*,  Kokkos::LayoutRight, host > const_vector_right_type;
+    typedef Kokkos::View< const T*,  Kokkos::LayoutLeft,  host > const_vector_type;
+    typedef Kokkos::View< const T**, Kokkos::LayoutLeft,  host > const_multivector_type;
 
-    multivector_type mv = multivector_type( "mv" , Length , Count );
-    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
+    multivector_type mv = multivector_type( "mv", Length, Count );
+    multivector_right_type mv_right = multivector_right_type( "mv", Length, Count );
 
-    vector_type v1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    vector_type v2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    vector_type v3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+    vector_type v1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_type v2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_type v3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    vector_type rv1 = Kokkos::subview( mv_right , 0 , Kokkos::ALL() );
-    vector_type rv2 = Kokkos::subview( mv_right , 1 , Kokkos::ALL() );
-    vector_type rv3 = Kokkos::subview( mv_right , 2 , Kokkos::ALL() );
+    vector_type rv1 = Kokkos::subview( mv_right, 0, Kokkos::ALL() );
+    vector_type rv2 = Kokkos::subview( mv_right, 1, Kokkos::ALL() );
+    vector_type rv3 = Kokkos::subview( mv_right, 2, Kokkos::ALL() );
 
-    multivector_type mv1 = Kokkos::subview( mv , std::make_pair( 1 , 998 ) ,
-                                                 std::make_pair( 2 , 5 ) );
+    multivector_type mv1 = Kokkos::subview( mv, std::make_pair( 1, 998 ),
+                                                std::make_pair( 2, 5 ) );
 
-    multivector_right_type mvr1 =
-      Kokkos::subview( mv_right ,
-                       std::make_pair( 1 , 998 ) ,
-                       std::make_pair( 2 , 5 ) );
+    multivector_right_type mvr1 = Kokkos::subview( mv_right, std::make_pair( 1, 998 ),
+                                                             std::make_pair( 2, 5 ) );
 
-    const_vector_type cv1 = Kokkos::subview( mv , Kokkos::ALL(), 0 );
-    const_vector_type cv2 = Kokkos::subview( mv , Kokkos::ALL(), 1 );
-    const_vector_type cv3 = Kokkos::subview( mv , Kokkos::ALL(), 2 );
+    const_vector_type cv1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_type cv2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_type cv3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    vector_right_type vr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    vector_right_type vr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    vector_right_type vr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+    vector_right_type vr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    vector_right_type vr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    vector_right_type vr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    const_vector_right_type cvr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
-    const_vector_right_type cvr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
-    const_vector_right_type cvr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+    const_vector_right_type cvr1 = Kokkos::subview( mv, Kokkos::ALL(), 0 );
+    const_vector_right_type cvr2 = Kokkos::subview( mv, Kokkos::ALL(), 1 );
+    const_vector_right_type cvr3 = Kokkos::subview( mv, Kokkos::ALL(), 2 );
 
-    ASSERT_TRUE( & v1[0] == & v1(0) );
-    ASSERT_TRUE( & v1[0] == & mv(0,0) );
-    ASSERT_TRUE( & v2[0] == & mv(0,1) );
-    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+    ASSERT_TRUE( & v1[0] == & v1( 0 ) );
+    ASSERT_TRUE( & v1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & v2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & v3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
-    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
-    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+    ASSERT_TRUE( & cv1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cv2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cv3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
-    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
-    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+    ASSERT_TRUE( & vr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & vr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & vr3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
-    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
-    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+    ASSERT_TRUE( & cvr1[0] == & mv( 0, 0 ) );
+    ASSERT_TRUE( & cvr2[0] == & mv( 0, 1 ) );
+    ASSERT_TRUE( & cvr3[0] == & mv( 0, 2 ) );
 
-    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
-    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
-    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
-    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
-    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
-    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+    ASSERT_TRUE( & mv1( 0, 0 ) == & mv( 1, 2 ) );
+    ASSERT_TRUE( & mv1( 1, 1 ) == & mv( 2, 3 ) );
+    ASSERT_TRUE( & mv1( 3, 2 ) == & mv( 4, 4 ) );
+    ASSERT_TRUE( & mvr1( 0, 0 ) == & mv_right( 1, 2 ) );
+    ASSERT_TRUE( & mvr1( 1, 1 ) == & mv_right( 2, 3 ) );
+    ASSERT_TRUE( & mvr1( 3, 2 ) == & mv_right( 4, 4 ) );
 
     const_vector_type c_cv1( v1 );
     typename vector_type::const_type c_cv2( v2 );
@@ -1356,6 +1320,3 @@ public:
 };
 
 } // namespace Test
-
-/*--------------------------------------------------------------------------*/
-
diff --git a/lib/kokkos/core/unit_test/TestViewMapping.hpp b/lib/kokkos/core/unit_test/TestViewMapping.hpp
index 324f02e94730d99365804684776e48ac64c3a351..71604bed51d93e374c8de9776bb24d2135c95182 100644
--- a/lib/kokkos/core/unit_test/TestViewMapping.hpp
+++ b/lib/kokkos/core/unit_test/TestViewMapping.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -49,1126 +49,1140 @@
 
 #include <Kokkos_Core.hpp>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 template< class Space >
 void test_view_mapping()
 {
-  typedef typename Space::execution_space ExecSpace ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2> dim_s2 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2,3> dim_s2_s3 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<2,3,4> dim_s2_s3_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0> dim_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,3> dim_s0_s3 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,3,4> dim_s0_s3_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0> dim_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,4> dim_s0_s0_s4 ;
-
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0> dim_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0> dim_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0> dim_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0 ;
-  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0_s0 ;
-
-  // Fully static dimensions should not be larger than an int
-  ASSERT_LE( sizeof(dim_0) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2_s3) , sizeof(int) );
-  ASSERT_LE( sizeof(dim_s2_s3_s4) , sizeof(int) );
-
-  // Rank 1 is size_t
-  ASSERT_EQ( sizeof(dim_s0) , sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s3) , sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s3_s4) , sizeof(size_t) );
-
-  // Allow for padding
-  ASSERT_LE( sizeof(dim_s0_s0) , 2 * sizeof(size_t) );
-  ASSERT_LE( sizeof(dim_s0_s0_s4) , 2 * sizeof(size_t) );
-
-  ASSERT_LE( sizeof(dim_s0_s0_s0) , 4 * sizeof(size_t) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0) , 4 * sizeof(unsigned) );
-  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
-  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
-  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
-
-  static_assert( int(dim_0::rank) == int(0) , "" );
-  static_assert( int(dim_0::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_0::ArgN0) == 1 , "" );
-  static_assert( int(dim_0::ArgN1) == 1 , "" );
-  static_assert( int(dim_0::ArgN2) == 1 , "" );
-
-  static_assert( int(dim_s2::rank) == int(1) , "" );
-  static_assert( int(dim_s2::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2::ArgN1) == 1 , "" );
-
-  static_assert( int(dim_s2_s3::rank) == int(2) , "" );
-  static_assert( int(dim_s2_s3::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2_s3::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2_s3::ArgN1) == 3 , "" );
-  static_assert( int(dim_s2_s3::ArgN2) == 1 , "" );
-
-  static_assert( int(dim_s2_s3_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s2_s3_s4::rank_dynamic) == int(0) , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN0) == 2 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN1) == 3 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN2) == 4 , "" );
-  static_assert( int(dim_s2_s3_s4::ArgN3) == 1 , "" );
-
-  static_assert( int(dim_s0::rank) == int(1) , "" );
-  static_assert( int(dim_s0::rank_dynamic) == int(1) , "" );
-
-  static_assert( int(dim_s0_s3::rank) == int(2) , "" );
-  static_assert( int(dim_s0_s3::rank_dynamic) == int(1) , "" );
-  static_assert( int(dim_s0_s3::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s3::ArgN1) == 3 , "" );
-
-  static_assert( int(dim_s0_s3_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s3_s4::rank_dynamic) == int(1) , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN1) == 3 , "" );
-  static_assert( int(dim_s0_s3_s4::ArgN2) == 4 , "" );
-
-  static_assert( int(dim_s0_s0_s4::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s0_s4::rank_dynamic) == int(2) , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN0) == 0 , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN1) == 0 , "" );
-  static_assert( int(dim_s0_s0_s4::ArgN2) == 4 , "" );
-
-  static_assert( int(dim_s0_s0_s0::rank) == int(3) , "" );
-  static_assert( int(dim_s0_s0_s0::rank_dynamic) == int(3) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0::rank) == int(4) , "" );
-  static_assert( int(dim_s0_s0_s0_s0::rank_dynamic) == int(4) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0::rank) == int(5) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0::rank_dynamic) == int(5) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0::rank) == int(6) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(6) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0::rank) == int(7) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(7) , "" );
-
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) == int(8) , "" );
-  static_assert( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) == int(8) , "" );
-
-  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 ); 
+  typedef typename Space::execution_space ExecSpace;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2 > dim_s2;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3 > dim_s2_s3;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 2, 3, 4 > dim_s2_s3_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0 > dim_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3 > dim_s0_s3;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 3, 4 > dim_s0_s3_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0 > dim_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 4 > dim_s0_s0_s4;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 > dim_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0 > dim_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0;
+  typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > dim_s0_s0_s0_s0_s0_s0_s0_s0;
+
+  // Fully static dimensions should not be larger than an int.
+  ASSERT_LE( sizeof( dim_0 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3 ), sizeof( int ) );
+  ASSERT_LE( sizeof( dim_s2_s3_s4 ), sizeof( int ) );
+
+  // Rank 1 is size_t.
+  ASSERT_EQ( sizeof( dim_s0 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3 ), sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s3_s4 ), sizeof( size_t ) );
+
+  // Allow for padding.
+  ASSERT_LE( sizeof( dim_s0_s0 ), 2 * sizeof( size_t ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s4 ), 2 * sizeof( size_t ) );
+
+  ASSERT_LE( sizeof( dim_s0_s0_s0 ), 4 * sizeof( size_t ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0 ), 4 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0 ), 6 * sizeof( unsigned ) );
+  ASSERT_LE( sizeof( dim_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+  ASSERT_EQ( sizeof( dim_s0_s0_s0_s0_s0_s0_s0_s0 ), 8 * sizeof( unsigned ) );
+
+  static_assert( int( dim_0::rank ) == int( 0 ), "" );
+  static_assert( int( dim_0::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_0::ArgN0 ) == 1, "" );
+  static_assert( int( dim_0::ArgN1 ) == 1, "" );
+  static_assert( int( dim_0::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s2::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2::ArgN1 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s2_s3::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3::ArgN2 ) == 1, "" );
+
+  static_assert( int( dim_s2_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s2_s3_s4::rank_dynamic ) == int( 0 ), "" );
+  static_assert( int( dim_s2_s3_s4::ArgN0 ) == 2, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN2 ) == 4, "" );
+  static_assert( int( dim_s2_s3_s4::ArgN3 ) == 1, "" );
+
+  static_assert( int( dim_s0::rank ) == int( 1 ), "" );
+  static_assert( int( dim_s0::rank_dynamic ) == int( 1 ), "" );
+
+  static_assert( int( dim_s0_s3::rank ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s3::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3::ArgN1 ) == 3, "" );
+
+  static_assert( int( dim_s0_s3_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s3_s4::rank_dynamic ) == int( 1 ), "" );
+  static_assert( int( dim_s0_s3_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN1 ) == 3, "" );
+  static_assert( int( dim_s0_s3_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s4::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s4::rank_dynamic ) == int( 2 ), "" );
+  static_assert( int( dim_s0_s0_s4::ArgN0 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN1 ) == 0, "" );
+  static_assert( int( dim_s0_s0_s4::ArgN2 ) == 4, "" );
+
+  static_assert( int( dim_s0_s0_s0::rank ) == int( 3 ), "" );
+  static_assert( int( dim_s0_s0_s0::rank_dynamic ) == int( 3 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0::rank ) == int( 4 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0::rank_dynamic ) == int( 4 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank ) == int( 5 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0::rank_dynamic ) == int( 5 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank ) == int( 6 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 6 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 7 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 7 ), "" );
+
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank ) == int( 8 ), "" );
+  static_assert( int( dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic ) == int( 8 ), "" );
+
+  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0       d2( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0_s0    d3( 2, 3, 4, 5, 6, 7, 8, 9 );
   dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 );
 
-  ASSERT_EQ( d1.N0 , 2 );
-  ASSERT_EQ( d2.N0 , 2 );
-  ASSERT_EQ( d3.N0 , 2 );
-  ASSERT_EQ( d4.N0 , 2 );
+  ASSERT_EQ( d1.N0, 2 );
+  ASSERT_EQ( d2.N0, 2 );
+  ASSERT_EQ( d3.N0, 2 );
+  ASSERT_EQ( d4.N0, 2 );
 
-  ASSERT_EQ( d1.N1 , 1 );
-  ASSERT_EQ( d2.N1 , 3 );
-  ASSERT_EQ( d3.N1 , 3 );
-  ASSERT_EQ( d4.N1 , 3 );
+  ASSERT_EQ( d1.N1, 1 );
+  ASSERT_EQ( d2.N1, 3 );
+  ASSERT_EQ( d3.N1, 3 );
+  ASSERT_EQ( d4.N1, 3 );
 
-  ASSERT_EQ( d1.N2 , 1 );
-  ASSERT_EQ( d2.N2 , 1 );
-  ASSERT_EQ( d3.N2 , 4 );
-  ASSERT_EQ( d4.N2 , 4 );
+  ASSERT_EQ( d1.N2, 1 );
+  ASSERT_EQ( d2.N2, 1 );
+  ASSERT_EQ( d3.N2, 4 );
+  ASSERT_EQ( d4.N2, 4 );
 
-  ASSERT_EQ( d1.N3 , 1 );
-  ASSERT_EQ( d2.N3 , 1 );
-  ASSERT_EQ( d3.N3 , 1 );
-  ASSERT_EQ( d4.N3 , 5 );
+  ASSERT_EQ( d1.N3, 1 );
+  ASSERT_EQ( d2.N3, 1 );
+  ASSERT_EQ( d3.N3, 1 );
+  ASSERT_EQ( d4.N3, 5 );
 
   //----------------------------------------
 
-  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0 , Kokkos::LayoutStride >  stride_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0, Kokkos::LayoutStride > stride_s0_s0_s0;
 
   //----------------------------------------
-  // Static dimension
+  // Static dimension.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutLeft > left_s2_s3_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutLeft > left_s2_s3_s4;
 
-    ASSERT_EQ( sizeof(left_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+    ASSERT_EQ( sizeof( left_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
 
-    left_s2_s3_s4 off3 ;
+    left_s2_s3_s4 off3;
 
-    stride_s0_s0_s0  stride3( off3 );
+    stride_s0_s0_s0 stride3( off3 );
 
-    ASSERT_EQ( off3.stride_0() , 1 );
-    ASSERT_EQ( off3.stride_1() , 2 );
-    ASSERT_EQ( off3.stride_2() , 6 );
-    ASSERT_EQ( off3.span() , 24 );
+    ASSERT_EQ( off3.stride_0(), 1 );
+    ASSERT_EQ( off3.stride_1(), 2 );
+    ASSERT_EQ( off3.stride_2(), 6 );
+    ASSERT_EQ( off3.span(), 24 );
 
-    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( off3.span() , stride3.span() );
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
 
-    int offset = 0 ;
+    int offset = 0;
 
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
-      ASSERT_EQ( off3(i,j,k) , offset );
-      ASSERT_EQ( stride3(i,j,k) , off3(i,j,k) );
-    }}}
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( stride3( i, j, k ), off3( i, j, k ) );
+    }
   }
 
   //----------------------------------------
-  // Small dimension is unpadded
+  // Small dimension is unpadded.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( 2, 3, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
 
     const Kokkos::LayoutLeft layout = dyn_off3.layout();
 
-    ASSERT_EQ( layout.dimension[0] , 2 );
-    ASSERT_EQ( layout.dimension[1] , 3 );
-    ASSERT_EQ( layout.dimension[2] , 4 );
-    ASSERT_EQ( layout.dimension[3] , 1 );
-    ASSERT_EQ( layout.dimension[4] , 1 );
-    ASSERT_EQ( layout.dimension[5] , 1 );
-    ASSERT_EQ( layout.dimension[6] , 1 );
-    ASSERT_EQ( layout.dimension[7] , 1 );
-
-    ASSERT_EQ( stride3.m_dim.rank , 3 );
-    ASSERT_EQ( stride3.m_dim.N0 , 2 );
-    ASSERT_EQ( stride3.m_dim.N1 , 3 );
-    ASSERT_EQ( stride3.m_dim.N2 , 4 );
-    ASSERT_EQ( stride3.m_dim.N3 , 1 );
-    ASSERT_EQ( stride3.size() , 2 * 3 * 4 );
-
-    int offset = 0 ;
-
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
-      ASSERT_EQ( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
-
-    ASSERT_EQ( dyn_off3.span() , offset );
-    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+    ASSERT_EQ( layout.dimension[0], 2 );
+    ASSERT_EQ( layout.dimension[1], 3 );
+    ASSERT_EQ( layout.dimension[2], 4 );
+    ASSERT_EQ( layout.dimension[3], 1 );
+    ASSERT_EQ( layout.dimension[4], 1 );
+    ASSERT_EQ( layout.dimension[5], 1 );
+    ASSERT_EQ( layout.dimension[6], 1 );
+    ASSERT_EQ( layout.dimension[7], 1 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, 2 );
+    ASSERT_EQ( stride3.m_dim.N1, 3 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), 2 * 3 * 4 );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < 3; ++j )
+    for ( int i = 0; i < 2; ++i, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
   }
 
-  // Large dimension is likely padded
+  //----------------------------------------
+  // Large dimension is likely padded.
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
-
-    ASSERT_EQ( stride3.m_dim.rank , 3 );
-    ASSERT_EQ( stride3.m_dim.N0 , N0 );
-    ASSERT_EQ( stride3.m_dim.N1 , N1 );
-    ASSERT_EQ( stride3.m_dim.N2 , 4 );
-    ASSERT_EQ( stride3.m_dim.N3 , 1 );
-    ASSERT_EQ( stride3.size() , N0 * N1 * 4 );
-    ASSERT_EQ( stride3.span() , dyn_off3.span() );
-
-    int offset = 0 ;
-
-    for ( int k = 0 ; k < 4 ; ++k ){
-    for ( int j = 0 ; j < N1 ; ++j ){
-    for ( int i = 0 ; i < N0 ; ++i ){
-      ASSERT_LE( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-      offset = dyn_off3(i,j,k) + 1 ;
-    }}}
-
-    ASSERT_LE( offset , dyn_off3.span() );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank, 3 );
+    ASSERT_EQ( stride3.m_dim.N0, N0 );
+    ASSERT_EQ( stride3.m_dim.N1, N1 );
+    ASSERT_EQ( stride3.m_dim.N2, 4 );
+    ASSERT_EQ( stride3.m_dim.N3, 1 );
+    ASSERT_EQ( stride3.size(), N0 * N1 * 4 );
+    ASSERT_EQ( stride3.span(), dyn_off3.span() );
+
+    int offset = 0;
+
+    for ( int k = 0; k < 4; ++k )
+    for ( int j = 0; j < N1; ++j )
+    for ( int i = 0; i < N0; ++i )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
   }
 
   //----------------------------------------
-  // Static dimension
+  // Static dimension.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutRight > right_s2_s3_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4, Kokkos::LayoutRight > right_s2_s3_s4;
 
-    ASSERT_EQ( sizeof(right_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+    ASSERT_EQ( sizeof( right_s2_s3_s4 ), sizeof( dim_s2_s3_s4 ) );
 
-    right_s2_s3_s4 off3 ;
+    right_s2_s3_s4 off3;
 
     stride_s0_s0_s0  stride3( off3 );
 
-    ASSERT_EQ( off3.stride_0() , 12 );
-    ASSERT_EQ( off3.stride_1() , 4 );
-    ASSERT_EQ( off3.stride_2() , 1 );
+    ASSERT_EQ( off3.stride_0(), 12 );
+    ASSERT_EQ( off3.stride_1(), 4 );
+    ASSERT_EQ( off3.stride_2(), 1 );
 
-    ASSERT_EQ( off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( off3.span() , stride3.span() );
+    ASSERT_EQ( off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( off3.span(), stride3.span() );
 
-    int offset = 0 ;
+    int offset = 0;
 
-    for ( int i = 0 ; i < 2 ; ++i ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
-      ASSERT_EQ( off3(i,j,k) , offset );
-      ASSERT_EQ( off3(i,j,k) , stride3(i,j,k) );
-    }}}
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( off3( i, j, k ), offset );
+      ASSERT_EQ( off3( i, j, k ), stride3( i, j, k ) );
+    }
 
-    ASSERT_EQ( off3.span() , offset );
+    ASSERT_EQ( off3.span(), offset );
   }
 
   //----------------------------------------
-  // Small dimension is unpadded
+  // Small dimension is unpadded.
   {
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( 2, 3, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
-
-    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( dyn_off3.span() , stride3.span() );
-
-    int offset = 0 ;
-
-    for ( int i = 0 ; i < 2 ; ++i ){
-    for ( int j = 0 ; j < 3 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
-      ASSERT_EQ( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
-    }}}
-
-    ASSERT_EQ( dyn_off3.span() , offset );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), 2 * 3 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < 2; ++i )
+    for ( int j = 0; j < 3; ++j )
+    for ( int k = 0; k < 4; ++k, ++offset )
+    {
+      ASSERT_EQ( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+    }
+
+    ASSERT_EQ( dyn_off3.span(), offset );
   }
 
-  // Large dimension is likely padded
+  //----------------------------------------
+  // Large dimension is likely padded.
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
     stride_s0_s0_s0  stride3( dyn_off3 );
 
-    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
-    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
-    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
-    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
-    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
-    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
-
-    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
-    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
-    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_EQ( dyn_off3.span() , stride3.span() );
-
-    int offset = 0 ;
-
-    for ( int i = 0 ; i < N0 ; ++i ){
-    for ( int j = 0 ; j < N1 ; ++j ){
-    for ( int k = 0 ; k < 4 ; ++k ){
-      ASSERT_LE( offset , dyn_off3(i,j,k) );
-      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
-      offset = dyn_off3(i,j,k) + 1 ;
-    }}}
-
-    ASSERT_LE( offset , dyn_off3.span() );
+    ASSERT_EQ( dyn_off3.m_dim.rank, 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0, N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1, N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2, 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3, 1 );
+    ASSERT_EQ( dyn_off3.size(), N0 * N1 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0(), stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1(), stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2(), stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span(), stride3.span() );
+
+    int offset = 0;
+
+    for ( int i = 0; i < N0; ++i )
+    for ( int j = 0; j < N1; ++j )
+    for ( int k = 0; k < 4; ++k )
+    {
+      ASSERT_LE( offset, dyn_off3( i, j, k ) );
+      ASSERT_EQ( dyn_off3( i, j, k ), stride3( i, j, k ) );
+      offset = dyn_off3( i, j, k ) + 1;
+    }
+
+    ASSERT_LE( offset, dyn_off3.span() );
   }
 
   //----------------------------------------
-  // Subview
+  // Subview.
   {
     // Mapping rank 4 to rank 3
-    typedef Kokkos::Experimental::Impl::SubviewExtents<4,3> SubviewExtents ;
+    typedef Kokkos::Experimental::Impl::SubviewExtents< 4, 3 > SubviewExtents;
 
-    constexpr int N0 = 1000 ;
-    constexpr int N1 = 2000 ;
-    constexpr int N2 = 3000 ;
-    constexpr int N3 = 4000 ;
+    constexpr int N0 = 1000;
+    constexpr int N1 = 2000;
+    constexpr int N2 = 3000;
+    constexpr int N3 = 4000;
 
-    Kokkos::Experimental::Impl::ViewDimension<N0,N1,N2,N3> dim ;
+    Kokkos::Experimental::Impl::ViewDimension< N0, N1, N2, N3 > dim;
 
     SubviewExtents tmp( dim
                       , N0 / 2
                       , Kokkos::Experimental::ALL
-                      , std::pair<int,int>( N2 / 4 , 10 + N2 / 4 )
-                      , Kokkos::pair<int,int>( N3 / 4 , 20 + N3 / 4 )
+                      , std::pair< int, int >( N2 / 4, 10 + N2 / 4 )
+                      , Kokkos::pair< int, int >( N3 / 4, 20 + N3 / 4 )
                       );
 
-    ASSERT_EQ( tmp.domain_offset(0) , N0 / 2 );
-    ASSERT_EQ( tmp.domain_offset(1) , 0 );
-    ASSERT_EQ( tmp.domain_offset(2) , N2 / 4 );
-    ASSERT_EQ( tmp.domain_offset(3) , N3 / 4 );
+    ASSERT_EQ( tmp.domain_offset( 0 ), N0 / 2 );
+    ASSERT_EQ( tmp.domain_offset( 1 ), 0 );
+    ASSERT_EQ( tmp.domain_offset( 2 ), N2 / 4 );
+    ASSERT_EQ( tmp.domain_offset( 3 ), N3 / 4 );
 
-    ASSERT_EQ( tmp.range_index(0) , 1 );
-    ASSERT_EQ( tmp.range_index(1) , 2 );
-    ASSERT_EQ( tmp.range_index(2) , 3 );
+    ASSERT_EQ( tmp.range_index( 0 ), 1 );
+    ASSERT_EQ( tmp.range_index( 1 ), 2 );
+    ASSERT_EQ( tmp.range_index( 2 ), 3 );
 
-    ASSERT_EQ( tmp.range_extent(0) , N1 );
-    ASSERT_EQ( tmp.range_extent(1) , 10 );
-    ASSERT_EQ( tmp.range_extent(2) , 20 );
+    ASSERT_EQ( tmp.range_extent( 0 ), N1 );
+    ASSERT_EQ( tmp.range_extent( 1 ), 10 );
+    ASSERT_EQ( tmp.range_extent( 2 ), 20 );
   }
-  //----------------------------------------
+
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    constexpr int sub_N0 = 1000 ;
-    constexpr int sub_N1 = 200 ;
-    constexpr int sub_N2 = 4 ;
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutLeft > left_s0_s0_s4;
 
-    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    left_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                           , Kokkos::LayoutLeft( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
-    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+    Kokkos::Experimental::Impl::SubviewExtents< 3, 3 >
       sub( dyn_off3.m_dim
-         , Kokkos::pair<int,int>(0,sub_N0)
-         , Kokkos::pair<int,int>(0,sub_N1)
-         , Kokkos::pair<int,int>(0,sub_N2)
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
          );
 
-    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
 
-    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
-    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
-    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
-    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
 
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
 
-    for ( int k = 0 ; k < sub_N2 ; ++k ){
-    for ( int j = 0 ; j < sub_N1 ; ++j ){
-    for ( int i = 0 ; i < sub_N0 ; ++i ){
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
+    for ( int k = 0; k < sub_N2; ++k )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int i = 0; i < sub_N0; ++i )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
   }
 
   {
-    constexpr int N0 = 2000 ;
-    constexpr int N1 = 300 ;
+    constexpr int N0 = 2000;
+    constexpr int N1 = 300;
 
-    constexpr int sub_N0 = 1000 ;
-    constexpr int sub_N1 = 200 ;
-    constexpr int sub_N2 = 4 ;
+    constexpr int sub_N0 = 1000;
+    constexpr int sub_N1 = 200;
+    constexpr int sub_N2 = 4;
 
-    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4, Kokkos::LayoutRight > right_s0_s0_s4;
 
-    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>()
+    right_s0_s0_s4 dyn_off3( std::integral_constant< unsigned, sizeof( int ) >()
                            , Kokkos::LayoutRight( N0, N1, 0, 0, 0, 0, 0, 0 ) );
 
-    Kokkos::Experimental::Impl::SubviewExtents< 3 , 3 >
+    Kokkos::Experimental::Impl::SubviewExtents< 3, 3 >
       sub( dyn_off3.m_dim
-         , Kokkos::pair<int,int>(0,sub_N0)
-         , Kokkos::pair<int,int>(0,sub_N1)
-         , Kokkos::pair<int,int>(0,sub_N2)
+         , Kokkos::pair< int, int >( 0, sub_N0 )
+         , Kokkos::pair< int, int >( 0, sub_N1 )
+         , Kokkos::pair< int, int >( 0, sub_N2 )
          );
 
-    stride_s0_s0_s0  stride3( dyn_off3 , sub );
+    stride_s0_s0_s0  stride3( dyn_off3, sub );
 
-    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
-    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
-    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
-    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+    ASSERT_EQ( stride3.dimension_0(), sub_N0 );
+    ASSERT_EQ( stride3.dimension_1(), sub_N1 );
+    ASSERT_EQ( stride3.dimension_2(), sub_N2 );
+    ASSERT_EQ( stride3.size(), sub_N0 * sub_N1 * sub_N2 );
 
-    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
-    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
-    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
-    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+    ASSERT_EQ( dyn_off3.stride_0(), stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1(), stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2(), stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()    , stride3.span() );
 
-    for ( int i = 0 ; i < sub_N0 ; ++i ){
-    for ( int j = 0 ; j < sub_N1 ; ++j ){
-    for ( int k = 0 ; k < sub_N2 ; ++k ){
-      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
-    }}}
+    for ( int i = 0; i < sub_N0; ++i )
+    for ( int j = 0; j < sub_N1; ++j )
+    for ( int k = 0; k < sub_N2; ++k )
+    {
+      ASSERT_EQ( stride3( i, j, k ), dyn_off3( i, j, k ) );
+    }
   }
 
   //----------------------------------------
-  // view data analysis
+  // View data analysis.
   {
-    using namespace Kokkos::Experimental::Impl ;
-    static_assert( rank_dynamic<>::value == 0 , "" );
-    static_assert( rank_dynamic<1>::value == 0 , "" );
-    static_assert( rank_dynamic<0>::value == 1 , "" );
-    static_assert( rank_dynamic<0,1>::value == 1 , "" );
-    static_assert( rank_dynamic<0,0,1>::value == 2 , "" );
+    using namespace Kokkos::Experimental::Impl;
+
+    static_assert( rank_dynamic<>::value == 0, "" );
+    static_assert( rank_dynamic< 1 >::value == 0, "" );
+    static_assert( rank_dynamic< 0 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 1 >::value == 1, "" );
+    static_assert( rank_dynamic< 0, 0, 1 >::value == 2, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
-
-    typedef ViewArrayAnalysis< int[] >                 a_int_r1 ;
-    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5 ;
-    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1 ;
-    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5 ;
-
-    static_assert( a_int_r1::dimension::rank == 1 , "" );
-    static_assert( a_int_r1::dimension::rank_dynamic == 1 , "" );
-    static_assert( a_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN2 == 4 , "" );
-    static_assert( a_int_r5::dimension::ArgN3 == 5 , "" );
-    static_assert( a_int_r5::dimension::ArgN4 == 6 , "" );
-    static_assert( a_int_r5::dimension::ArgN5 == 1 , "" );
-
-    static_assert( std::is_same< typename a_int_r1::dimension , ViewDimension<0> >::value , "" );
-    static_assert( std::is_same< typename a_int_r1::non_const_value_type , int >::value , "" );
-
-    static_assert( a_const_int_r1::dimension::rank == 1 , "" );
-    static_assert( a_const_int_r1::dimension::rank_dynamic == 1 , "" );
-    static_assert( std::is_same< typename a_const_int_r1::dimension , ViewDimension<0> >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
-
-    static_assert( a_const_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_const_int_r5::dimension::rank_dynamic == 2 , "" );
-
-    static_assert( a_const_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN2 == 4 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN3 == 5 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN4 == 6 , "" );
-    static_assert( a_const_int_r5::dimension::ArgN5 == 1 , "" );
-
-    static_assert( std::is_same< typename a_const_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type , int >::value , "" );
-
-    static_assert( a_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_int_r5::dimension::rank_dynamic == 2 , "" );
-    static_assert( std::is_same< typename a_int_r5::dimension , ViewDimension<0,0,4,5,6> >::value , "" );
-    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+    using namespace Kokkos::Experimental::Impl;
+
+    typedef ViewArrayAnalysis< int[] >                 a_int_r1;
+    typedef ViewArrayAnalysis< int**[4][5][6] >        a_int_r5;
+    typedef ViewArrayAnalysis< const int[] >           a_const_int_r1;
+    typedef ViewArrayAnalysis< const int**[4][5][6] >  a_const_int_r5;
+
+    static_assert( a_int_r1::dimension::rank == 1, "" );
+    static_assert( a_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r1::dimension::rank == 1, "" );
+    static_assert( a_const_int_r1::dimension::rank_dynamic == 1, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, ViewDimension<0> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
+
+    static_assert( a_const_int_r5::dimension::rank == 5, "" );
+    static_assert( a_const_int_r5::dimension::rank_dynamic == 2, "" );
+
+    static_assert( a_const_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_const_int_r5::dimension::ArgN2 == 4, "" );
+    static_assert( a_const_int_r5::dimension::ArgN3 == 5, "" );
+    static_assert( a_const_int_r5::dimension::ArgN4 == 6, "" );
+    static_assert( a_const_int_r5::dimension::ArgN5 == 1, "" );
+
+    static_assert( std::is_same< typename a_const_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r5::non_const_value_type, int >::value, "" );
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 2, "" );
+    static_assert( std::is_same< typename a_int_r5::dimension, ViewDimension<0, 0, 4, 5, 6> >::value, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
+    using namespace Kokkos::Experimental::Impl;
 
-    typedef int t_i4[4] ;
+    typedef int t_i4[4];
 
     // Dimensions of t_i4 are appended to the multdimensional array.
-    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5 ;
-
-    static_assert( a_int_r5::dimension::rank == 5 , "" );
-    static_assert( a_int_r5::dimension::rank_dynamic == 3 , "" );
-    static_assert( a_int_r5::dimension::ArgN0 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN1 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN2 == 0 , "" );
-    static_assert( a_int_r5::dimension::ArgN3 == 3 , "" );
-    static_assert( a_int_r5::dimension::ArgN4 == 4 , "" );
-    static_assert( std::is_same< typename a_int_r5::non_const_value_type , int >::value , "" );
+    typedef ViewArrayAnalysis< t_i4 ***[3] > a_int_r5;
+
+    static_assert( a_int_r5::dimension::rank == 5, "" );
+    static_assert( a_int_r5::dimension::rank_dynamic == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN0 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN1 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN2 == 0, "" );
+    static_assert( a_int_r5::dimension::ArgN3 == 3, "" );
+    static_assert( a_int_r5::dimension::ArgN4 == 4, "" );
+    static_assert( std::is_same< typename a_int_r5::non_const_value_type, int >::value, "" );
   }
 
   {
-    using namespace Kokkos::Experimental::Impl ;
+    using namespace Kokkos::Experimental::Impl;
 
-    typedef ViewDataAnalysis< const int[] , void >  a_const_int_r1 ;
+    typedef ViewDataAnalysis< const int[], void >  a_const_int_r1;
 
-    static_assert( std::is_same< typename a_const_int_r1::specialize , void >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::dimension , Kokkos::Experimental::Impl::ViewDimension<0> >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::specialize, void >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::dimension, Kokkos::Experimental::Impl::ViewDimension<0> >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r1::type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::value_type , const int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::value_type, const int >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type , const int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_type , int * >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r1::scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::const_scalar_array_type, const int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_type, int * >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r1::non_const_value_type, int >::value, "" );
 
-    typedef ViewDataAnalysis< const int**[4] , void >  a_const_int_r3 ;
+    typedef ViewDataAnalysis< const int**[4], void >  a_const_int_r3;
 
-    static_assert( std::is_same< typename a_const_int_r3::specialize , void >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::specialize, void >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r3::dimension , Kokkos::Experimental::Impl::ViewDimension<0,0,4> >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::dimension, Kokkos::Experimental::Impl::ViewDimension<0, 0, 4> >::value, "" );
 
-    static_assert( std::is_same< typename a_const_int_r3::type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_value_type , const int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type , const int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_type , int**[4] >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type , int >::value , "" );
-    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type , int**[4] >::value , "" );
+    static_assert( std::is_same< typename a_const_int_r3::type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_value_type, const int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::const_scalar_array_type, const int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_type, int**[4] >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_value_type, int >::value, "" );
+    static_assert( std::is_same< typename a_const_int_r3::non_const_scalar_array_type, int**[4] >::value, "" );
 
-
-    // std::cout << "typeid(const int**[4]).name() = " << typeid(const int**[4]).name() << std::endl ;
+    // std::cout << "typeid( const int**[4] ).name() = " << typeid( const int**[4] ).name() << std::endl;
   }
 
   //----------------------------------------
 
   {
-    constexpr int N = 10 ;
+    constexpr int N = 10;
 
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
 
-    int data[N] ;
+    int data[N];
 
-    T vr1(data,N); // view of non-const
-    C cr1(vr1);    // view of const from view of non-const
-    C cr2( (const int *) data , N );
+    T vr1( data, N ); // View of non-const.
+    C cr1( vr1 );     // View of const from view of non-const.
+    C cr2( (const int *) data, N );
 
     // Generate static_assert error:
     // T tmp( cr1 );
 
-    ASSERT_EQ( vr1.span() , N );
-    ASSERT_EQ( cr1.span() , N );
-    ASSERT_EQ( vr1.data() , & data[0] );
-    ASSERT_EQ( cr1.data() , & data[0] );
+    ASSERT_EQ( vr1.span(), N );
+    ASSERT_EQ( cr1.span(), N );
+    ASSERT_EQ( vr1.data(), & data[0] );
+    ASSERT_EQ( cr1.data(), & data[0] );
 
-    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
 
-    ASSERT_EQ( T::Rank , 1 );
+    ASSERT_EQ( T::Rank, 1 );
 
-    ASSERT_TRUE( ( std::is_same< typename C::data_type           , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::data_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type           , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::scalar_array_type          , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::value_type           , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::value_type          , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename C::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename C::reference_type , const int & >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::reference_type, const int & >::value ) );
 
-    ASSERT_EQ( C::Rank , 1 );
+    ASSERT_EQ( C::Rank, 1 );
 
-    ASSERT_EQ( vr1.dimension_0() , N );
+    ASSERT_EQ( vr1.dimension_0(), N );
 
-    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , typename Space::memory_space >::accessible ) {
-      for ( int i = 0 ; i < N ; ++i ) data[i] = i + 1 ;
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) data[i] = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
 
       {
         T tmp( vr1 );
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
-        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
       }
 
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
     }
   }
 
-
   {
-    constexpr int N = 10 ;
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    constexpr int N = 10;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
+
+    T vr1( "vr1", N );
+    C cr1( vr1 );
 
-    T vr1("vr1",N);
-    C cr1(vr1);
+    ASSERT_TRUE( ( std::is_same< typename T::data_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type          , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type    , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type, int* >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::scalar_array_type           , int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_scalar_array_type     , const int* >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_scalar_array_type , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::value_type          , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type    , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type, int >::value ) );
 
-    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space, typename Space::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type, int & >::value ) );
+    ASSERT_EQ( T::Rank, 1 );
 
-    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename Space::memory_space >::value ) );
-    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
-    ASSERT_EQ( T::Rank , 1 );
- 
-    ASSERT_EQ( vr1.dimension_0() , N );
+    ASSERT_EQ( vr1.dimension_0(), N );
 
-    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , typename Space::memory_space >::accessible ) {
-      for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 1 ;
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+    if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+      for ( int i = 0; i < N; ++i ) vr1( i ) = i + 1;
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 1 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( cr1[i], i + 1 );
 
       {
         T tmp( vr1 );
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
-        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
-        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 1 );
+        for ( int i = 0; i < N; ++i ) vr1( i ) = i + 2;
+        for ( int i = 0; i < N; ++i ) ASSERT_EQ( tmp[i], i + 2 );
       }
 
-      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+      for ( int i = 0; i < N; ++i ) ASSERT_EQ( vr1[i], i + 2 );
     }
   }
 
-  // Testing proper handling of zero-length allocations
+  // Testing proper handling of zero-length allocations.
   {
-    constexpr int N = 0 ;
-    typedef Kokkos::View<int*,Space>        T ;
-    typedef Kokkos::View<const int*,Space>  C ;
+    constexpr int N = 0;
+    typedef Kokkos::View< int*, Space >        T;
+    typedef Kokkos::View< const int*, Space >  C;
 
-    T vr1("vr1",N);
-    C cr1(vr1);
+    T vr1( "vr1", N );
+    C cr1( vr1 );
 
-    ASSERT_EQ( vr1.dimension_0() , 0 );
-    ASSERT_EQ( cr1.dimension_0() , 0 );
+    ASSERT_EQ( vr1.dimension_0(), 0 );
+    ASSERT_EQ( cr1.dimension_0(), 0 );
   }
 
-
   // Testing using space instance for allocation.
-  // The execution space of the memory space must be available for view data initialization
-
-  if ( std::is_same< ExecSpace , typename ExecSpace::memory_space::execution_space >::value ) {
-
-    using namespace Kokkos::Experimental ;
-
-    typedef typename ExecSpace::memory_space  memory_space ;
-    typedef View<int*,memory_space>           V ;
-
-    constexpr int N = 10 ;
-
-    memory_space mem_space ;
-
-    V v( "v" , N );
-    V va( view_alloc() , N );
-    V vb( view_alloc( "vb" ) , N );
-    V vc( view_alloc( "vc" , AllowPadding ) , N );
-    V vd( view_alloc( "vd" , WithoutInitializing ) , N );
-    V ve( view_alloc( "ve" , WithoutInitializing , AllowPadding ) , N );
-    V vf( view_alloc( "vf" , mem_space , WithoutInitializing , AllowPadding ) , N );
-    V vg( view_alloc( mem_space , "vg" , WithoutInitializing , AllowPadding ) , N );
-    V vh( view_alloc( WithoutInitializing , AllowPadding ) , N );
-    V vi( view_alloc( WithoutInitializing ) , N );
-    V vj( view_alloc( std::string("vj") , AllowPadding ) , N );
-    V vk( view_alloc( mem_space , std::string("vk") , AllowPadding ) , N );
+  // The execution space of the memory space must be available for view data initialization.
+  if ( std::is_same< ExecSpace, typename ExecSpace::memory_space::execution_space >::value ) {
+
+    using namespace Kokkos::Experimental;
+
+    typedef typename ExecSpace::memory_space  memory_space;
+    typedef View< int*, memory_space >        V;
+
+    constexpr int N = 10;
+
+    memory_space mem_space;
+
+    V v( "v", N );
+    V va( view_alloc(), N );
+    V vb( view_alloc( "vb" ), N );
+    V vc( view_alloc( "vc", AllowPadding ), N );
+    V vd( view_alloc( "vd", WithoutInitializing ), N );
+    V ve( view_alloc( "ve", WithoutInitializing, AllowPadding ), N );
+    V vf( view_alloc( "vf", mem_space, WithoutInitializing, AllowPadding ), N );
+    V vg( view_alloc( mem_space, "vg", WithoutInitializing, AllowPadding ), N );
+    V vh( view_alloc( WithoutInitializing, AllowPadding ), N );
+    V vi( view_alloc( WithoutInitializing ), N );
+    V vj( view_alloc( std::string( "vj" ), AllowPadding ), N );
+    V vk( view_alloc( mem_space, std::string( "vk" ), AllowPadding ), N );
   }
 
   {
-    typedef Kokkos::ViewTraits<int***,Kokkos::LayoutStride,ExecSpace>  traits_t ;
-    typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0>                         dims_t ;
-    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t , Kokkos::LayoutStride >  offset_t ;
+    typedef Kokkos::ViewTraits< int***, Kokkos::LayoutStride, ExecSpace >           traits_t;
+    typedef Kokkos::Experimental::Impl::ViewDimension< 0, 0, 0 >                    dims_t;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t, Kokkos::LayoutStride >  offset_t;
 
-    Kokkos::LayoutStride stride ;
+    Kokkos::LayoutStride stride;
 
-    stride.dimension[0] = 3 ;
-    stride.dimension[1] = 4 ;
-    stride.dimension[2] = 5 ;
-    stride.stride[0] = 4 ;
-    stride.stride[1] = 1 ;
-    stride.stride[2] = 12 ;
+    stride.dimension[0] = 3;
+    stride.dimension[1] = 4;
+    stride.dimension[2] = 5;
+    stride.stride[0] = 4;
+    stride.stride[1] = 1;
+    stride.stride[2] = 12;
 
-    const offset_t offset( std::integral_constant<unsigned,0>() , stride );
+    const offset_t offset( std::integral_constant< unsigned, 0 >(), stride );
 
-    ASSERT_EQ( offset.dimension_0() , 3 );
-    ASSERT_EQ( offset.dimension_1() , 4 );
-    ASSERT_EQ( offset.dimension_2() , 5 );
+    ASSERT_EQ( offset.dimension_0(), 3 );
+    ASSERT_EQ( offset.dimension_1(), 4 );
+    ASSERT_EQ( offset.dimension_2(), 5 );
 
-    ASSERT_EQ( offset.stride_0() , 4 );
-    ASSERT_EQ( offset.stride_1() , 1 );
-    ASSERT_EQ( offset.stride_2() , 12 );
+    ASSERT_EQ( offset.stride_0(), 4 );
+    ASSERT_EQ( offset.stride_1(), 1 );
+    ASSERT_EQ( offset.stride_2(), 12 );
 
-    ASSERT_EQ( offset.span() , 60 );
+    ASSERT_EQ( offset.span(), 60 );
     ASSERT_TRUE( offset.span_is_contiguous() );
 
-    Kokkos::Experimental::Impl::ViewMapping< traits_t , void >
-      v( Kokkos::Experimental::Impl::ViewCtorProp<int*>((int*)0), stride );
+    Kokkos::Experimental::Impl::ViewMapping< traits_t, void >
+      v( Kokkos::Experimental::Impl::ViewCtorProp< int* >( (int*) 0 ), stride );
   }
 
   {
-    typedef Kokkos::View<int**,Space>  V ;
-    typedef typename V::HostMirror  M ;
-    typedef typename Kokkos::View<int**,Space>::array_layout layout_type;
+    typedef Kokkos::View< int**, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Space >::array_layout layout_type;
 
-    constexpr int N0 = 10 ;
-    constexpr int N1 = 11 ;
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
 
-    V a("a",N0,N1);
-    M b = Kokkos::Experimental::create_mirror(a);
-    M c = Kokkos::Experimental::create_mirror_view(a);
-    M d ;
+    V a( "a", N0, N1 );
+    M b = Kokkos::Experimental::create_mirror( a );
+    M c = Kokkos::Experimental::create_mirror_view( a );
+    M d;
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    Kokkos::Experimental::deep_copy( a , b );
-    Kokkos::Experimental::deep_copy( c , a );
+    Kokkos::Experimental::deep_copy( a, b );
+    Kokkos::Experimental::deep_copy( c, a );
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
 
-    Kokkos::Experimental::resize( b , 5 , 6 );
+    Kokkos::Experimental::resize( b, 5, 6 );
 
-    for ( int i0 = 0 ; i0 < 5 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 6 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 5; ++i0 )
+    for ( int i1 = 0; i1 < 6; ++i1 )
+    {
       int val = 1 + i0 + i1 * N0;
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
-      ASSERT_EQ( b(i0,i1) , val );
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+      ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , 5 , 6 );
-    Kokkos::Experimental::realloc( d , 5 , 6 );
+    Kokkos::Experimental::realloc( c, 5, 6 );
+    Kokkos::Experimental::realloc( d, 5, 6 );
 
-    ASSERT_EQ( b.dimension_0() , 5 );
-    ASSERT_EQ( b.dimension_1() , 6 );
-    ASSERT_EQ( c.dimension_0() , 5 );
-    ASSERT_EQ( c.dimension_1() , 6 );
-    ASSERT_EQ( d.dimension_0() , 5 );
-    ASSERT_EQ( d.dimension_1() , 6 );
+    ASSERT_EQ( b.dimension_0(), 5 );
+    ASSERT_EQ( b.dimension_1(), 6 );
+    ASSERT_EQ( c.dimension_0(), 5 );
+    ASSERT_EQ( c.dimension_1(), 6 );
+    ASSERT_EQ( d.dimension_0(), 5 );
+    ASSERT_EQ( d.dimension_1(), 6 );
 
-    layout_type layout(7,8);
-    Kokkos::Experimental::resize( b , layout );
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 6 ; i1 < 8 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    layout_type layout( 7, 8 );
+    Kokkos::Experimental::resize( b, layout );
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 6; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    for ( int i0 = 5 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 5; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
        int val = 1 + i0 + i1 * N0;
-       ASSERT_EQ( b(i0,i1) , val );
+       ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , layout );
-    Kokkos::Experimental::realloc( d , layout );
-
-    ASSERT_EQ( b.dimension_0() , 7 );
-    ASSERT_EQ( b.dimension_1() , 8 );
-    ASSERT_EQ( c.dimension_0() , 7 );
-    ASSERT_EQ( c.dimension_1() , 8 );
-    ASSERT_EQ( d.dimension_0() , 7 );
-    ASSERT_EQ( d.dimension_1() , 8 );
+    Kokkos::Experimental::realloc( c, layout );
+    Kokkos::Experimental::realloc( d, layout );
 
+    ASSERT_EQ( b.dimension_0(), 7 );
+    ASSERT_EQ( b.dimension_1(), 8 );
+    ASSERT_EQ( c.dimension_0(), 7 );
+    ASSERT_EQ( c.dimension_1(), 8 );
+    ASSERT_EQ( d.dimension_0(), 7 );
+    ASSERT_EQ( d.dimension_1(), 8 );
   }
 
   {
-    typedef Kokkos::View<int**,Kokkos::LayoutStride,Space>  V ;
-    typedef typename V::HostMirror  M ;
-    typedef typename Kokkos::View<int**,Kokkos::LayoutStride,Space>::array_layout layout_type;
+    typedef Kokkos::View< int**, Kokkos::LayoutStride, Space > V;
+    typedef typename V::HostMirror M;
+    typedef typename Kokkos::View< int**, Kokkos::LayoutStride, Space >::array_layout layout_type;
 
-    constexpr int N0 = 10 ;
-    constexpr int N1 = 11 ;
+    constexpr int N0 = 10;
+    constexpr int N1 = 11;
 
-    const int dimensions[] = {N0,N1};
-    const int order[] = {1,0};
+    const int dimensions[] = { N0, N1 };
+    const int order[] = { 1, 0 };
 
-    V a("a",Kokkos::LayoutStride::order_dimensions(2,order,dimensions));
-    M b = Kokkos::Experimental::create_mirror(a);
-    M c = Kokkos::Experimental::create_mirror_view(a);
-    M d ;
+    V a( "a", Kokkos::LayoutStride::order_dimensions( 2, order, dimensions ) );
+    M b = Kokkos::Experimental::create_mirror( a );
+    M c = Kokkos::Experimental::create_mirror_view( a );
+    M d;
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      b(i0,i1) = 1 + i0 + i1 * N0 ;
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      b( i0, i1 ) = 1 + i0 + i1 * N0;
+    }
 
-    Kokkos::Experimental::deep_copy( a , b );
-    Kokkos::Experimental::deep_copy( c , a );
+    Kokkos::Experimental::deep_copy( a, b );
+    Kokkos::Experimental::deep_copy( c, a );
 
-    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
-    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
-      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+    for ( int i0 = 0; i0 < N0; ++i0 )
+    for ( int i1 = 0; i1 < N1; ++i1 )
+    {
+      ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+    }
 
-    const int dimensions2[] = {7,8};
-    const int order2[] = {1,0};
-    layout_type layout = layout_type::order_dimensions(2,order2,dimensions2);
-    Kokkos::Experimental::resize( b , layout );
+    const int dimensions2[] = { 7, 8 };
+    const int order2[] = { 1, 0 };
+    layout_type layout = layout_type::order_dimensions( 2, order2, dimensions2 );
+    Kokkos::Experimental::resize( b, layout );
 
-    for ( int i0 = 0 ; i0 < 7 ; ++i0 )
-    for ( int i1 = 0 ; i1 < 8 ; ++i1 ) {
+    for ( int i0 = 0; i0 < 7; ++i0 )
+    for ( int i1 = 0; i1 < 8; ++i1 )
+    {
        int val = 1 + i0 + i1 * N0;
-       ASSERT_EQ( b(i0,i1) , c(i0,i1) );
-       ASSERT_EQ( b(i0,i1) , val );
+       ASSERT_EQ( b( i0, i1 ), c( i0, i1 ) );
+       ASSERT_EQ( b( i0, i1 ), val );
     }
 
-    Kokkos::Experimental::realloc( c , layout );
-    Kokkos::Experimental::realloc( d , layout );
+    Kokkos::Experimental::realloc( c, layout );
+    Kokkos::Experimental::realloc( d, layout );
 
-    ASSERT_EQ( b.dimension_0() , 7 );
-    ASSERT_EQ( b.dimension_1() , 8 );
-    ASSERT_EQ( c.dimension_0() , 7 );
-    ASSERT_EQ( c.dimension_1() , 8 );
-    ASSERT_EQ( d.dimension_0() , 7 );
-    ASSERT_EQ( d.dimension_1() , 8 );
+    ASSERT_EQ( b.dimension_0(), 7 );
+    ASSERT_EQ( b.dimension_1(), 8 );
+    ASSERT_EQ( c.dimension_0(), 7 );
+    ASSERT_EQ( c.dimension_1(), 8 );
+    ASSERT_EQ( d.dimension_0(), 7 );
+    ASSERT_EQ( d.dimension_1(), 8 );
 
   }
 
   {
-    typedef Kokkos::View<int*,Space> V ;
-    typedef Kokkos::View<int*,Space,Kokkos::MemoryUnmanaged> U ;
+    typedef Kokkos::View< int*, Space > V;
+    typedef Kokkos::View< int*, Space, Kokkos::MemoryUnmanaged > U;
 
+    V a( "a", 10 );
 
-    V a("a",10);
+    ASSERT_EQ( a.use_count(), 1 );
 
-    ASSERT_EQ( a.use_count() , 1 );
+    V b = a;
 
-    V b = a ;
-
-    ASSERT_EQ( a.use_count() , 2 );
-    ASSERT_EQ( b.use_count() , 2 );
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
 
     {
-      U c = b ; // 'c' is compile-time unmanaged
+      U c = b; // 'c' is compile-time unmanaged.
 
-      ASSERT_EQ( a.use_count() , 2 );
-      ASSERT_EQ( b.use_count() , 2 );
-      ASSERT_EQ( c.use_count() , 2 );
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
 
-      V d = c ; // 'd' is run-time unmanaged
+      V d = c; // 'd' is run-time unmanaged.
 
-      ASSERT_EQ( a.use_count() , 2 );
-      ASSERT_EQ( b.use_count() , 2 );
-      ASSERT_EQ( c.use_count() , 2 );
-      ASSERT_EQ( d.use_count() , 2 );
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( b.use_count(), 2 );
+      ASSERT_EQ( c.use_count(), 2 );
+      ASSERT_EQ( d.use_count(), 2 );
     }
 
-    ASSERT_EQ( a.use_count() , 2 );
-    ASSERT_EQ( b.use_count() , 2 );
+    ASSERT_EQ( a.use_count(), 2 );
+    ASSERT_EQ( b.use_count(), 2 );
 
     b = V();
 
-    ASSERT_EQ( a.use_count() , 1 );
-    ASSERT_EQ( b.use_count() , 0 );
-
-#if ! defined ( KOKKOS_ENABLE_CUDA_LAMBDA )
-    /* Cannot launch host lambda when CUDA lambda is enabled */
-
-    typedef typename Kokkos::Impl::HostMirror< Space >::Space::execution_space
-      host_exec_space ;
-
-    Kokkos::parallel_for(
-      Kokkos::RangePolicy< host_exec_space >(0,10) ,
-      KOKKOS_LAMBDA( int i ){
-        // 'a' is captured by copy and the capture mechanism
-        // converts 'a' to an unmanaged copy.
-        // When the parallel dispatch accepts a move for the lambda
-        // this count should become 1
-        ASSERT_EQ( a.use_count() , 2 );
-        V x = a ;
-        ASSERT_EQ( a.use_count() , 2 );
-        ASSERT_EQ( x.use_count() , 2 );
-      });
-#endif /* #if ! defined ( KOKKOS_ENABLE_CUDA_LAMBDA ) */
+    ASSERT_EQ( a.use_count(), 1 );
+    ASSERT_EQ( b.use_count(), 0 );
+
+#if !defined( KOKKOS_ENABLE_CUDA_LAMBDA )
+    // Cannot launch host lambda when CUDA lambda is enabled.
+
+    typedef typename Kokkos::Impl::HostMirror< Space >::Space::execution_space host_exec_space;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< host_exec_space >( 0, 10 ), KOKKOS_LAMBDA ( int i ) {
+      // 'a' is captured by copy, and the capture mechanism converts 'a' to an
+      // unmanaged copy.  When the parallel dispatch accepts a move for the
+      // lambda, this count should become 1.
+      ASSERT_EQ( a.use_count(), 2 );
+      V x = a;
+      ASSERT_EQ( a.use_count(), 2 );
+      ASSERT_EQ( x.use_count(), 2 );
+    });
+#endif // #if !defined( KOKKOS_ENABLE_CUDA_LAMBDA )
   }
 }
 
 template< class Space >
 struct TestViewMappingSubview
 {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
-  typedef Kokkos::pair<int,int> range ;
+  typedef Kokkos::pair< int, int > range;
 
   enum { AN = 10 };
-  typedef Kokkos::View<int*,ExecSpace>  AT ;
-  typedef Kokkos::View<const int*,ExecSpace>  ACT ;
-  typedef Kokkos::Subview< AT , range >  AS ;
+  typedef Kokkos::View< int*, ExecSpace >  AT;
+  typedef Kokkos::View< const int*, ExecSpace >  ACT;
+  typedef Kokkos::Subview< AT, range >  AS;
 
-  enum { BN0 = 10 , BN1 = 11 , BN2 = 12 };
-  typedef Kokkos::View<int***,ExecSpace>  BT ;
-  typedef Kokkos::Subview< BT , range , range , range >  BS ;
+  enum { BN0 = 10, BN1 = 11, BN2 = 12 };
+  typedef Kokkos::View< int***, ExecSpace >  BT;
+  typedef Kokkos::Subview< BT, range, range, range >  BS;
 
-  enum { CN0 = 10 , CN1 = 11 , CN2 = 12 };
-  typedef Kokkos::View<int***[13][14],ExecSpace>  CT ;
-  typedef Kokkos::Subview< CT , range , range , range , int , int >  CS ;
+  enum { CN0 = 10, CN1 = 11, CN2 = 12 };
+  typedef Kokkos::View< int***[13][14], ExecSpace >  CT;
+  typedef Kokkos::Subview< CT, range, range, range, int, int >  CS;
 
-  enum { DN0 = 10 , DN1 = 11 , DN2 = 12 , DN3 = 13 , DN4 = 14 };
-  typedef Kokkos::View<int***[DN3][DN4],ExecSpace>  DT ;
-  typedef Kokkos::Subview< DT , int , range , range , range , int >  DS ;
+  enum { DN0 = 10, DN1 = 11, DN2 = 12, DN3 = 13, DN4 = 14 };
+  typedef Kokkos::View< int***[DN3][DN4], ExecSpace >  DT;
+  typedef Kokkos::Subview< DT, int, range, range, range, int >  DS;
 
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutLeft, ExecSpace >  DLT;
+  typedef Kokkos::Subview< DLT, range, int, int, int, int >  DLS1;
 
-  typedef Kokkos::View<int***[13][14],Kokkos::LayoutLeft,ExecSpace>  DLT ;
-  typedef Kokkos::Subview< DLT , range , int , int , int , int >  DLS1 ;
-
-  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout , Kokkos::LayoutLeft >::value
+  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout, Kokkos::LayoutLeft >::value
                , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" );
 
-  typedef Kokkos::View<int***[13][14],Kokkos::LayoutRight,ExecSpace>  DRT ;
-  typedef Kokkos::Subview< DRT , int , int , int , int , range >  DRS1 ;
+  typedef Kokkos::View< int***[13][14], Kokkos::LayoutRight, ExecSpace >  DRT;
+  typedef Kokkos::Subview< DRT, int, int, int, int, range >  DRS1;
 
-  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout , Kokkos::LayoutRight >::value
+  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout, Kokkos::LayoutRight >::value
                , "Subview layout error for rank 1 subview of right-most range of LayoutRight" );
 
-  AT Aa ;
-  AS Ab ;
-  ACT Ac ;
-  BT Ba ;
-  BS Bb ;
-  CT Ca ;
-  CS Cb ;
-  DT Da ;
-  DS Db ;
+  AT Aa;
+  AS Ab;
+  ACT Ac;
+  BT Ba;
+  BS Bb;
+  CT Ca;
+  CS Cb;
+  DT Da;
+  DS Db;
 
   TestViewMappingSubview()
-    : Aa("Aa",AN)
-    , Ab( Kokkos::Experimental::subview( Aa , std::pair<int,int>(1,AN-1) ) )
-    , Ac( Aa , std::pair<int,int>(1,AN-1) )
-    , Ba("Ba",BN0,BN1,BN2)
+    : Aa( "Aa", AN )
+    , Ab( Kokkos::Experimental::subview( Aa, std::pair< int, int >( 1, AN - 1 ) ) )
+    , Ac( Aa, std::pair< int, int >( 1, AN - 1 ) )
+    , Ba( "Ba", BN0, BN1, BN2 )
     , Bb( Kokkos::Experimental::subview( Ba
-                                        , std::pair<int,int>(1,BN0-1)
-                                        , std::pair<int,int>(1,BN1-1)
-                                        , std::pair<int,int>(1,BN2-1)
+                                        , std::pair< int, int >( 1, BN0 - 1 )
+                                        , std::pair< int, int >( 1, BN1 - 1 )
+                                        , std::pair< int, int >( 1, BN2 - 1 )
                                         ) )
-    , Ca("Ca",CN0,CN1,CN2)
+    , Ca( "Ca", CN0, CN1, CN2 )
     , Cb( Kokkos::Experimental::subview( Ca
-                                        , std::pair<int,int>(1,CN0-1)
-                                        , std::pair<int,int>(1,CN1-1)
-                                        , std::pair<int,int>(1,CN2-1)
+                                        , std::pair< int, int >( 1, CN0 - 1 )
+                                        , std::pair< int, int >( 1, CN1 - 1 )
+                                        , std::pair< int, int >( 1, CN2 - 1 )
                                         , 1
                                         , 2
                                         ) )
-    , Da("Da",DN0,DN1,DN2)
+    , Da( "Da", DN0, DN1, DN2 )
     , Db( Kokkos::Experimental::subview( Da
                                         , 1
-                                        , std::pair<int,int>(1,DN1-1)
-                                        , std::pair<int,int>(1,DN2-1)
-                                        , std::pair<int,int>(1,DN3-1)
+                                        , std::pair< int, int >( 1, DN1 - 1 )
+                                        , std::pair< int, int >( 1, DN2 - 1 )
+                                        , std::pair< int, int >( 1, DN3 - 1 )
                                         , 2
                                         ) )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int, long & error_count ) const
+  {
+    auto Ad = Kokkos::Experimental::subview< Kokkos::MemoryUnmanaged >( Aa, Kokkos::pair< int, int >( 1, AN - 1 ) );
+
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ab[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ac[i - 1] ) ++error_count;
+    for ( int i = 1; i < AN - 1; ++i ) if( & Aa[i] != & Ad[i - 1] ) ++error_count;
+
+    for ( int i2 = 1; i2 < BN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < BN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < BN0 - 1; ++i0 )
     {
+      if ( & Ba( i0, i1, i2 ) != & Bb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
     }
 
+    for ( int i2 = 1; i2 < CN2 - 1; ++i2 )
+    for ( int i1 = 1; i1 < CN1 - 1; ++i1 )
+    for ( int i0 = 1; i0 < CN0 - 1; ++i0 )
+    {
+      if ( & Ca( i0, i1, i2, 1, 2 ) != & Cb( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
+    }
 
-  KOKKOS_INLINE_FUNCTION
-  void operator()( const int , long & error_count ) const
+    for ( int i2 = 1; i2 < DN3 - 1; ++i2 )
+    for ( int i1 = 1; i1 < DN2 - 1; ++i1 )
+    for ( int i0 = 1; i0 < DN1 - 1; ++i0 )
     {
-      auto Ad = Kokkos::Experimental::subview< Kokkos::MemoryUnmanaged >( Aa , Kokkos::pair<int,int>(1,AN-1) );
-
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ab[i-1] ) ++error_count ;
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ac[i-1] ) ++error_count ;
-      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ad[i-1] ) ++error_count ;
-
-      for ( int i2 = 1 ; i2 < BN2-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < BN1-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < BN0-1 ; ++i0 ) {
-        if ( & Ba(i0,i1,i2) != & Bb(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
-
-      for ( int i2 = 1 ; i2 < CN2-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < CN1-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < CN0-1 ; ++i0 ) {
-        if ( & Ca(i0,i1,i2,1,2) != & Cb(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
-
-      for ( int i2 = 1 ; i2 < DN3-1 ; ++i2 ) {
-      for ( int i1 = 1 ; i1 < DN2-1 ; ++i1 ) {
-      for ( int i0 = 1 ; i0 < DN1-1 ; ++i0 ) {
-        if ( & Da(1,i0,i1,i2,2) != & Db(i0-1,i1-1,i2-1) ) ++error_count ;
-      }}}
+      if ( & Da( 1, i0, i1, i2, 2 ) != & Db( i0 - 1, i1 - 1, i2 - 1 ) ) ++error_count;
     }
+  }
 
   static void run()
   {
-    TestViewMappingSubview self ;
-
-    ASSERT_EQ( self.Aa.dimension_0() , AN );
-    ASSERT_EQ( self.Ab.dimension_0() , AN - 2 );
-    ASSERT_EQ( self.Ac.dimension_0() , AN - 2 );
-    ASSERT_EQ( self.Ba.dimension_0() , BN0 );
-    ASSERT_EQ( self.Ba.dimension_1() , BN1 );
-    ASSERT_EQ( self.Ba.dimension_2() , BN2 );
-    ASSERT_EQ( self.Bb.dimension_0() , BN0 - 2 );
-    ASSERT_EQ( self.Bb.dimension_1() , BN1 - 2 );
-    ASSERT_EQ( self.Bb.dimension_2() , BN2 - 2 );
-
-    ASSERT_EQ( self.Ca.dimension_0() , CN0 );
-    ASSERT_EQ( self.Ca.dimension_1() , CN1 );
-    ASSERT_EQ( self.Ca.dimension_2() , CN2 );
-    ASSERT_EQ( self.Ca.dimension_3() , 13 );
-    ASSERT_EQ( self.Ca.dimension_4() , 14 );
-    ASSERT_EQ( self.Cb.dimension_0() , CN0 - 2 );
-    ASSERT_EQ( self.Cb.dimension_1() , CN1 - 2 );
-    ASSERT_EQ( self.Cb.dimension_2() , CN2 - 2 );
-
-    ASSERT_EQ( self.Da.dimension_0() , DN0 );
-    ASSERT_EQ( self.Da.dimension_1() , DN1 );
-    ASSERT_EQ( self.Da.dimension_2() , DN2 );
-    ASSERT_EQ( self.Da.dimension_3() , DN3 );
-    ASSERT_EQ( self.Da.dimension_4() , DN4 );
-
-    ASSERT_EQ( self.Db.dimension_0() , DN1 - 2 );
-    ASSERT_EQ( self.Db.dimension_1() , DN2 - 2 );
-    ASSERT_EQ( self.Db.dimension_2() , DN3 - 2 );
-
-    ASSERT_EQ( self.Da.stride_1() , self.Db.stride_0() );
-    ASSERT_EQ( self.Da.stride_2() , self.Db.stride_1() );
-    ASSERT_EQ( self.Da.stride_3() , self.Db.stride_2() );
-
-    long error_count = -1 ;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >(0,1) , self , error_count );
-    ASSERT_EQ( error_count , 0 );
+    TestViewMappingSubview self;
+
+    ASSERT_EQ( self.Aa.dimension_0(), AN );
+    ASSERT_EQ( self.Ab.dimension_0(), AN - 2 );
+    ASSERT_EQ( self.Ac.dimension_0(), AN - 2 );
+    ASSERT_EQ( self.Ba.dimension_0(), BN0 );
+    ASSERT_EQ( self.Ba.dimension_1(), BN1 );
+    ASSERT_EQ( self.Ba.dimension_2(), BN2 );
+    ASSERT_EQ( self.Bb.dimension_0(), BN0 - 2 );
+    ASSERT_EQ( self.Bb.dimension_1(), BN1 - 2 );
+    ASSERT_EQ( self.Bb.dimension_2(), BN2 - 2 );
+
+    ASSERT_EQ( self.Ca.dimension_0(), CN0 );
+    ASSERT_EQ( self.Ca.dimension_1(), CN1 );
+    ASSERT_EQ( self.Ca.dimension_2(), CN2 );
+    ASSERT_EQ( self.Ca.dimension_3(), 13 );
+    ASSERT_EQ( self.Ca.dimension_4(), 14 );
+    ASSERT_EQ( self.Cb.dimension_0(), CN0 - 2 );
+    ASSERT_EQ( self.Cb.dimension_1(), CN1 - 2 );
+    ASSERT_EQ( self.Cb.dimension_2(), CN2 - 2 );
+
+    ASSERT_EQ( self.Da.dimension_0(), DN0 );
+    ASSERT_EQ( self.Da.dimension_1(), DN1 );
+    ASSERT_EQ( self.Da.dimension_2(), DN2 );
+    ASSERT_EQ( self.Da.dimension_3(), DN3 );
+    ASSERT_EQ( self.Da.dimension_4(), DN4 );
+
+    ASSERT_EQ( self.Db.dimension_0(), DN1 - 2 );
+    ASSERT_EQ( self.Db.dimension_1(), DN2 - 2 );
+    ASSERT_EQ( self.Db.dimension_2(), DN3 - 2 );
+
+    ASSERT_EQ( self.Da.stride_1(), self.Db.stride_0() );
+    ASSERT_EQ( self.Da.stride_2(), self.Db.stride_1() );
+    ASSERT_EQ( self.Da.stride_3(), self.Db.stride_2() );
+
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, 1 ), self, error_count );
+    ASSERT_EQ( error_count, 0 );
   }
-
 };
 
 template< class Space >
 void test_view_mapping_subview()
 {
-  typedef typename Space::execution_space ExecSpace ;
+  typedef typename Space::execution_space ExecSpace;
 
   TestViewMappingSubview< ExecSpace >::run();
 }
@@ -1181,214 +1195,228 @@ struct TestViewMapOperator {
   static_assert( ViewType::reference_type_is_lvalue_reference
                , "Test only valid for lvalue reference type" );
 
-  const ViewType v ;
+  const ViewType v;
 
   KOKKOS_INLINE_FUNCTION
-  void test_left( size_t i0 , long & error_count ) const
+  void test_left( size_t i0, long & error_count ) const
+  {
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+    const size_t n1 = v.dimension_1();
+    const size_t n2 = v.dimension_2();
+    const size_t n3 = v.dimension_3();
+    const size_t n4 = v.dimension_4();
+    const size_t n5 = v.dimension_5();
+    const size_t n6 = v.dimension_6();
+    const size_t n7 = v.dimension_7();
+
+    long offset = 0;
+
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
     {
-      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
-      const size_t n1 = v.dimension_1();
-      const size_t n2 = v.dimension_2();
-      const size_t n3 = v.dimension_3();
-      const size_t n4 = v.dimension_4();
-      const size_t n5 = v.dimension_5();
-      const size_t n6 = v.dimension_6();
-      const size_t n7 = v.dimension_7();
-
-      long offset = 0 ;
-
-      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
-      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
-      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
-      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
-      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
-      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
-      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
-      {
-        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
-        if ( d < offset ) ++error_count ;
-        offset = d ;
-      }
-
-      if ( v.span() <= size_t(offset) ) ++error_count ;
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+      if ( d < offset ) ++error_count;
+      offset = d;
     }
 
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
   KOKKOS_INLINE_FUNCTION
-  void test_right( size_t i0 , long & error_count ) const
+  void test_right( size_t i0, long & error_count ) const
+  {
+    typename ViewType::value_type * const base_ptr = & v( 0, 0, 0, 0, 0, 0, 0, 0 );
+    const size_t n1 = v.dimension_1();
+    const size_t n2 = v.dimension_2();
+    const size_t n3 = v.dimension_3();
+    const size_t n4 = v.dimension_4();
+    const size_t n5 = v.dimension_5();
+    const size_t n6 = v.dimension_6();
+    const size_t n7 = v.dimension_7();
+
+    long offset = 0;
+
+    for ( size_t i1 = 0; i1 < n1; ++i1 )
+    for ( size_t i2 = 0; i2 < n2; ++i2 )
+    for ( size_t i3 = 0; i3 < n3; ++i3 )
+    for ( size_t i4 = 0; i4 < n4; ++i4 )
+    for ( size_t i5 = 0; i5 < n5; ++i5 )
+    for ( size_t i6 = 0; i6 < n6; ++i6 )
+    for ( size_t i7 = 0; i7 < n7; ++i7 )
     {
-      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
-      const size_t n1 = v.dimension_1();
-      const size_t n2 = v.dimension_2();
-      const size_t n3 = v.dimension_3();
-      const size_t n4 = v.dimension_4();
-      const size_t n5 = v.dimension_5();
-      const size_t n6 = v.dimension_6();
-      const size_t n7 = v.dimension_7();
-
-      long offset = 0 ;
-
-      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
-      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
-      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
-      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
-      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
-      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
-      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
-      {
-        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
-        if ( d < offset ) ++error_count ;
-        offset = d ;
-      }
-
-      if ( v.span() <= size_t(offset) ) ++error_count ;
+      const long d = & v( i0, i1, i2, i3, i4, i5, i6, i7 ) - base_ptr;
+      if ( d < offset ) ++error_count;
+      offset = d;
     }
 
+    if ( v.span() <= size_t( offset ) ) ++error_count;
+  }
+
   KOKKOS_INLINE_FUNCTION
-  void operator()( size_t i , long & error_count ) const
-    {
-      if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutLeft >::value )
-        test_left(i,error_count);
-      else if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutRight >::value )
-        test_right(i,error_count);
+  void operator()( size_t i, long & error_count ) const
+  {
+    if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutLeft >::value ) {
+      test_left( i, error_count );
     }
+    else if ( std::is_same< typename ViewType::array_layout, Kokkos::LayoutRight >::value ) {
+      test_right( i, error_count );
+    }
+  }
 
-  constexpr static size_t N0 = 10 ;
-  constexpr static size_t N1 =  9 ;
-  constexpr static size_t N2 =  8 ;
-  constexpr static size_t N3 =  7 ;
-  constexpr static size_t N4 =  6 ;
-  constexpr static size_t N5 =  5 ;
-  constexpr static size_t N6 =  4 ;
-  constexpr static size_t N7 =  3 ;
+  constexpr static size_t N0 = 10;
+  constexpr static size_t N1 =  9;
+  constexpr static size_t N2 =  8;
+  constexpr static size_t N3 =  7;
+  constexpr static size_t N4 =  6;
+  constexpr static size_t N5 =  5;
+  constexpr static size_t N6 =  4;
+  constexpr static size_t N7 =  3;
 
-  TestViewMapOperator() : v( "Test" , N0, N1, N2, N3, N4, N5, N6, N7 ) {}
+  TestViewMapOperator() : v( "Test", N0, N1, N2, N3, N4, N5, N6, N7 ) {}
 
   static void run()
-    {
-      TestViewMapOperator self ;
-
-      ASSERT_EQ( self.v.dimension_0() , ( 0 < ViewType::rank ? N0 : 1 ) );
-      ASSERT_EQ( self.v.dimension_1() , ( 1 < ViewType::rank ? N1 : 1 ) );
-      ASSERT_EQ( self.v.dimension_2() , ( 2 < ViewType::rank ? N2 : 1 ) );
-      ASSERT_EQ( self.v.dimension_3() , ( 3 < ViewType::rank ? N3 : 1 ) );
-      ASSERT_EQ( self.v.dimension_4() , ( 4 < ViewType::rank ? N4 : 1 ) );
-      ASSERT_EQ( self.v.dimension_5() , ( 5 < ViewType::rank ? N5 : 1 ) );
-      ASSERT_EQ( self.v.dimension_6() , ( 6 < ViewType::rank ? N6 : 1 ) );
-      ASSERT_EQ( self.v.dimension_7() , ( 7 < ViewType::rank ? N7 : 1 ) );
-
-      ASSERT_LE( self.v.dimension_0()*
-                 self.v.dimension_1()*
-                 self.v.dimension_2()*
-                 self.v.dimension_3()*
-                 self.v.dimension_4()*
-                 self.v.dimension_5()*
-                 self.v.dimension_6()*
-                 self.v.dimension_7()
-               , self.v.span() );
-
-      long error_count ;
-      Kokkos::RangePolicy< typename ViewType::execution_space > range(0,self.v.dimension_0());
-      Kokkos::parallel_reduce( range , self , error_count );
-      ASSERT_EQ( 0 , error_count );
-    }
+  {
+    TestViewMapOperator self;
+
+    ASSERT_EQ( self.v.dimension_0(), ( 0 < ViewType::rank ? N0 : 1 ) );
+    ASSERT_EQ( self.v.dimension_1(), ( 1 < ViewType::rank ? N1 : 1 ) );
+    ASSERT_EQ( self.v.dimension_2(), ( 2 < ViewType::rank ? N2 : 1 ) );
+    ASSERT_EQ( self.v.dimension_3(), ( 3 < ViewType::rank ? N3 : 1 ) );
+    ASSERT_EQ( self.v.dimension_4(), ( 4 < ViewType::rank ? N4 : 1 ) );
+    ASSERT_EQ( self.v.dimension_5(), ( 5 < ViewType::rank ? N5 : 1 ) );
+    ASSERT_EQ( self.v.dimension_6(), ( 6 < ViewType::rank ? N6 : 1 ) );
+    ASSERT_EQ( self.v.dimension_7(), ( 7 < ViewType::rank ? N7 : 1 ) );
+
+    ASSERT_LE( self.v.dimension_0() *
+               self.v.dimension_1() *
+               self.v.dimension_2() *
+               self.v.dimension_3() *
+               self.v.dimension_4() *
+               self.v.dimension_5() *
+               self.v.dimension_6() *
+               self.v.dimension_7()
+             , self.v.span() );
+
+    long error_count;
+    Kokkos::RangePolicy< typename ViewType::execution_space > range( 0, self.v.dimension_0() );
+    Kokkos::parallel_reduce( range, self, error_count );
+    ASSERT_EQ( 0, error_count );
+  }
 };
 
-
 template< class Space >
 void test_view_mapping_operator()
 {
-  typedef typename Space::execution_space ExecSpace ;
-
-  TestViewMapOperator< Kokkos::View<int,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int**,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int***,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int****,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*****,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int******,Kokkos::LayoutLeft,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*******,Kokkos::LayoutLeft,ExecSpace> >::run();
-
-  TestViewMapOperator< Kokkos::View<int,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int**,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int***,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int****,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*****,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int******,Kokkos::LayoutRight,ExecSpace> >::run();
-  TestViewMapOperator< Kokkos::View<int*******,Kokkos::LayoutRight,ExecSpace> >::run();
+  typedef typename Space::execution_space ExecSpace;
+
+  TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutLeft, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutLeft, ExecSpace> >::run();
+
+  TestViewMapOperator< Kokkos::View<int, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int**, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int***, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int****, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*****, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int******, Kokkos::LayoutRight, ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::View<int*******, Kokkos::LayoutRight, ExecSpace> >::run();
 }
 
 /*--------------------------------------------------------------------------*/
 
 template< class Space >
 struct TestViewMappingAtomic {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
-  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait ;
+  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait;
 
-  typedef Kokkos::View< int * , ExecSpace > T ;
-  typedef Kokkos::View< int * , ExecSpace , mem_trait >  T_atom ;
+  typedef Kokkos::View< int *, ExecSpace > T;
+  typedef Kokkos::View< int *, ExecSpace, mem_trait >  T_atom;
 
-  T      x ;
-  T_atom x_atom ;
+  T      x;
+  T_atom x_atom;
 
-  constexpr static size_t N = 100000 ;
+  constexpr static size_t N = 100000;
 
   struct TagInit {};
   struct TagUpdate {};
   struct TagVerify {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const
-    { x(i) = i ; }
+  void operator()( const TagInit &, const int i ) const
+  { x( i ) = i; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagUpdate & , const int i ) const
-    { x_atom(i%2) += 1 ; }
+  void operator()( const TagUpdate &, const int i ) const
+  { x_atom( i % 2 ) += 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagVerify & , const int i , long & error_count ) const
-    {
-       if ( i < 2 ) { if ( x(i) != int(i + N / 2) ) ++error_count ; }
-       else         { if ( x(i) != int(i) ) ++error_count ; }
-    }
+  void operator()( const TagVerify &, const int i, long & error_count ) const
+  {
+     if ( i < 2 ) { if ( x( i ) != int( i + N / 2 ) ) ++error_count; }
+     else         { if ( x( i ) != int( i ) ) ++error_count; }
+  }
 
   TestViewMappingAtomic()
-    : x("x",N)
+    : x( "x", N )
     , x_atom( x )
     {}
 
   static void run()
+  {
+    ASSERT_TRUE( T::reference_type_is_lvalue_reference );
+    ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
+
+    TestViewMappingAtomic self;
+
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagInit >( 0, N ), self );
+    Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace, TagUpdate >( 0, N ), self );
+
+    long error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagVerify >( 0, N ), self, error_count );
+
+    ASSERT_EQ( 0, error_count );
+
+    typename TestViewMappingAtomic::T_atom::HostMirror x_host = Kokkos::create_mirror_view( self.x );
+    Kokkos::deep_copy( x_host, self.x );
+
+    error_count = -1;
+
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::DefaultHostExecutionSpace, TagVerify >( 0, N ), 
+      [=] ( const TagVerify &, const int i, long & tmp_error_count )
     {
-      ASSERT_TRUE( T::reference_type_is_lvalue_reference );
-      ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
-
-      TestViewMappingAtomic self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagInit >(0,N) , self );
-      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagUpdate >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagVerify >(0,N) , self , error_count );
-      ASSERT_EQ( 0 , error_count );
-      typename TestViewMappingAtomic::T_atom::HostMirror x_host = Kokkos::create_mirror_view(self.x);
-      Kokkos::deep_copy(x_host,self.x);
-      error_count = -1;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::DefaultHostExecutionSpace, TagVerify>(0,N), 
-        [=] ( const TagVerify & , const int i , long & tmp_error_count ) {
-        if ( i < 2 ) { if ( x_host(i) != int(i + N / 2) ) ++tmp_error_count ; }
-        else         { if ( x_host(i) != int(i) ) ++tmp_error_count ; }
-      }, error_count);
-      ASSERT_EQ( 0 , error_count );
-      Kokkos::deep_copy(self.x,x_host);
-    }
+      if ( i < 2 ) {
+        if ( x_host( i ) != int( i + N / 2 ) ) ++tmp_error_count ;
+      }
+      else {
+        if ( x_host( i ) != int( i ) ) ++tmp_error_count ;
+      }
+    }, error_count);
+
+    ASSERT_EQ( 0 , error_count );
+    Kokkos::deep_copy( self.x, x_host );
+  }
 };
 
 /*--------------------------------------------------------------------------*/
 
 template< class Space >
 struct TestViewMappingClassValue {
-  typedef typename Space::execution_space ExecSpace ;
-  typedef typename Space::memory_space    MemSpace ;
+  typedef typename Space::execution_space ExecSpace;
+  typedef typename Space::memory_space    MemSpace;
 
   struct ValueType {
     KOKKOS_INLINE_FUNCTION
@@ -1396,11 +1424,11 @@ struct TestViewMappingClassValue {
     {
 #if 0
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-      printf("TestViewMappingClassValue construct on Cuda\n");
+      printf( "TestViewMappingClassValue construct on Cuda\n" );
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      printf("TestViewMappingClassValue construct on Host\n");
+      printf( "TestViewMappingClassValue construct on Host\n" );
 #else
-      printf("TestViewMappingClassValue construct unknown\n");
+      printf( "TestViewMappingClassValue construct unknown\n" );
 #endif
 #endif
     }
@@ -1409,11 +1437,11 @@ struct TestViewMappingClassValue {
     {
 #if 0
 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
-      printf("TestViewMappingClassValue destruct on Cuda\n");
+      printf( "TestViewMappingClassValue destruct on Cuda\n" );
 #elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
-      printf("TestViewMappingClassValue destruct on Host\n");
+      printf( "TestViewMappingClassValue destruct on Host\n" );
 #else
-      printf("TestViewMappingClassValue destruct unknown\n");
+      printf( "TestViewMappingClassValue destruct unknown\n" );
 #endif
 #endif
     }
@@ -1421,17 +1449,15 @@ struct TestViewMappingClassValue {
 
   static void run()
   {
-    using namespace Kokkos::Experimental ;
+    using namespace Kokkos::Experimental;
+
     ExecSpace::fence();
     {
-      View< ValueType , ExecSpace > a("a");
+      View< ValueType, ExecSpace > a( "a" );
       ExecSpace::fence();
     }
     ExecSpace::fence();
   }
 };
 
-} /* namespace Test */
-
-/*--------------------------------------------------------------------------*/
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestViewOfClass.hpp b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
index 381b8786bc740dfcfb922eb6ddf5443ffa7136cd..d624c5dda2034b04b5b1a427614f38186aa032d8 100644
--- a/lib/kokkos/core/unit_test/TestViewOfClass.hpp
+++ b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,34 +48,29 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
 template< class Space >
 struct NestedView {
-
-  Kokkos::View<int*,Space> member ;
+  Kokkos::View< int*, Space > member;
 
 public:
-
   KOKKOS_INLINE_FUNCTION
-  NestedView() : member()
-    {}
+  NestedView() : member() {}
 
   KOKKOS_INLINE_FUNCTION
-  NestedView & operator = ( const Kokkos::View<int*,Space> & lhs )
-    {
-      member = lhs ;
-      if ( member.dimension_0() ) Kokkos::atomic_add( & member(0) , 1 );
-      return *this ;
-    }
+  NestedView & operator=( const Kokkos::View< int*, Space > & lhs )
+  {
+    member = lhs;
+    if ( member.dimension_0() ) Kokkos::atomic_add( & member( 0 ), 1 );
+    return *this;
+  }
 
   KOKKOS_INLINE_FUNCTION
   ~NestedView()
-  { 
+  {
     if ( member.dimension_0() ) {
-      Kokkos::atomic_add( & member(0) , -1 );
+      Kokkos::atomic_add( & member( 0 ), -1 );
     }
   }
 };
@@ -83,49 +78,44 @@ public:
 template< class Space >
 struct NestedViewFunctor {
 
-  Kokkos::View< NestedView<Space> * , Space > nested ;
-  Kokkos::View<int*,Space>                    array ;
+  Kokkos::View< NestedView<Space> *, Space > nested;
+  Kokkos::View< int*, Space >                array;
 
-  NestedViewFunctor( 
-    const Kokkos::View< NestedView<Space> * , Space > & arg_nested ,
-    const Kokkos::View<int*,Space>                    & arg_array )
+  NestedViewFunctor(
+    const Kokkos::View< NestedView<Space> *, Space > & arg_nested,
+    const Kokkos::View< int*, Space >                & arg_array )
   : nested( arg_nested )
   , array(  arg_array )
   {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( int i ) const
-    { nested[i] = array ; }
+  void operator()( int i ) const { nested[i] = array; }
 };
 
-
 template< class Space >
 void view_nested_view()
 {
-  Kokkos::View<int*,Space> tracking("tracking",1);
+  Kokkos::View< int*, Space > tracking( "tracking", 1 );
 
-  typename Kokkos::View<int*,Space>::HostMirror
-     host_tracking = Kokkos::create_mirror( tracking );
+  typename Kokkos::View< int*, Space >::HostMirror host_tracking = Kokkos::create_mirror( tracking );
 
   {
-    Kokkos::View< NestedView<Space> * , Space > a("a_nested_view",2);
+    Kokkos::View< NestedView<Space> *, Space > a( "a_nested_view", 2 );
 
-    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( a , tracking ) );
-    Kokkos::deep_copy( host_tracking , tracking );
-    ASSERT_EQ( 2 , host_tracking(0) );
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( a, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 2, host_tracking( 0 ) );
 
-    Kokkos::View< NestedView<Space> * , Space > b("b_nested_view",2);
-    Kokkos::parallel_for( Kokkos::RangePolicy<Space>(0,2) , NestedViewFunctor<Space>( b , tracking ) );
-    Kokkos::deep_copy( host_tracking , tracking );
-    ASSERT_EQ( 4 , host_tracking(0) );
+    Kokkos::View< NestedView<Space> *, Space > b( "b_nested_view", 2 );
+    Kokkos::parallel_for( Kokkos::RangePolicy< Space >( 0, 2 ), NestedViewFunctor< Space >( b, tracking ) );
+    Kokkos::deep_copy( host_tracking, tracking );
+    ASSERT_EQ( 4, host_tracking( 0 ) );
 
   }
-  Kokkos::deep_copy( host_tracking , tracking );
 
-  ASSERT_EQ( 0 , host_tracking(0) );
-}
+  Kokkos::deep_copy( host_tracking, tracking );
 
+  ASSERT_EQ( 0, host_tracking( 0 ) );
 }
 
-/*--------------------------------------------------------------------------*/
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp b/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
index 09141e582c48423341029bae51c09fe51d14c893..21ae92e93ccdc09c3e42057f706c7bec383239eb 100644
--- a/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
+++ b/lib/kokkos/core/unit_test/TestViewSpaceAssign.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,35 +48,29 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace Test {
 
-template< typename SpaceDst , typename SpaceSrc >
+template< typename SpaceDst, typename SpaceSrc >
 void view_space_assign()
 {
-  Kokkos::View<double*,SpaceDst> a =
-  Kokkos::View<double*,SpaceSrc>("a",1);
+  Kokkos::View< double*, SpaceDst > a =
+    Kokkos::View< double*, SpaceSrc >( "a", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceDst> b =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("b",1);
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst > b =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "b", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceDst> c =
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceSrc>("c",1);
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > c =
+    Kokkos::View< double*, Kokkos::LayoutRight, SpaceSrc >( "c", 1 );
 
-  Kokkos::View<double*,SpaceDst,Kokkos::MemoryRandomAccess> d =
-  Kokkos::View<double*,SpaceSrc>("d",1);
+  Kokkos::View< double*, SpaceDst, Kokkos::MemoryRandomAccess > d =
+    Kokkos::View< double*, SpaceSrc >( "d", 1 );
 
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceDst,Kokkos::MemoryRandomAccess> e =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("e",1);
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceDst, Kokkos::MemoryRandomAccess > e =
+    Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "e", 1 );
 
   // Rank-one layout can assign:
-  Kokkos::View<double*,Kokkos::LayoutRight,SpaceDst> f =
-  Kokkos::View<double*,Kokkos::LayoutLeft,SpaceSrc>("f",1);
+  Kokkos::View< double*, Kokkos::LayoutRight, SpaceDst > f =
+  Kokkos::View< double*, Kokkos::LayoutLeft, SpaceSrc >( "f", 1 );
 }
 
-
 } // namespace Test
-
-/*--------------------------------------------------------------------------*/
-
diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp
index 1c2575b6f61c9fa11b28963852085960ecc420aa..386301b45dbc9f9d6bb5770133d818a7eccba40e 100644
--- a/lib/kokkos/core/unit_test/TestViewSubview.hpp
+++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -48,64 +48,68 @@
 #include <sstream>
 #include <iostream>
 
-/*--------------------------------------------------------------------------*/
-
 namespace TestViewSubview {
 
-template<class Layout, class Space>
+template< class Layout, class Space >
 struct getView {
   static
-    Kokkos::View<double**,Layout,Space> get(int n, int m) {
-      return Kokkos::View<double**,Layout,Space>("G",n,m);
+    Kokkos::View< double**, Layout, Space > get( int n, int m ) {
+      return Kokkos::View< double**, Layout, Space >( "G", n, m );
   }
 };
 
-template<class Space>
-struct getView<Kokkos::LayoutStride,Space> {
+template< class Space >
+struct getView< Kokkos::LayoutStride, Space > {
   static
-    Kokkos::View<double**,Kokkos::LayoutStride,Space> get(int n, int m) {
-      const int rank = 2 ;
+    Kokkos::View< double**, Kokkos::LayoutStride, Space > get( int n, int m ) {
+      const int rank = 2;
       const int order[] = { 0, 1 };
-      const unsigned dim[] = { unsigned(n), unsigned(m) };
-      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank , order , dim );
-      return Kokkos::View<double**,Kokkos::LayoutStride,Space>("G",stride);
+      const unsigned dim[] = { unsigned( n ), unsigned( m ) };
+      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank, order, dim );
+
+      return Kokkos::View< double**, Kokkos::LayoutStride, Space >( "G", stride );
   }
 };
 
-template<class ViewType, class Space>
+template< class ViewType, class Space >
 struct fill_1D {
   typedef typename Space::execution_space execution_space;
   typedef typename ViewType::size_type size_type;
+
   ViewType a;
   double val;
-  fill_1D(ViewType a_, double val_):a(a_),val(val_) {
-  }
+
+  fill_1D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int i) const {
-    a(i) = val;
-  }
+  void operator()( const int i ) const { a( i ) = val; }
 };
 
-template<class ViewType, class Space>
+template< class ViewType, class Space >
 struct fill_2D {
   typedef typename Space::execution_space execution_space;
   typedef typename ViewType::size_type size_type;
+
   ViewType a;
   double val;
-  fill_2D(ViewType a_, double val_):a(a_),val(val_) {
-  }
+
+  fill_2D( ViewType a_, double val_ ) : a( a_ ), val( val_ ) {}
+
   KOKKOS_INLINE_FUNCTION
-  void operator() (const int i) const{
-    for(int j = 0; j < static_cast<int>(a.dimension_1()); j++)
-      a(i,j) = val;
+  void operator()( const int i ) const
+  {
+    for ( int j = 0; j < static_cast< int >( a.dimension_1() ); j++ ) {
+      a( i, j ) = val;
+    }
   }
 };
 
-template<class Layout, class Space>
+template< class Layout, class Space >
 void test_auto_1d ()
 {
-  typedef Kokkos::View<double**, Layout, Space> mv_type;
+  typedef Kokkos::View< double**, Layout, Space > mv_type;
   typedef typename mv_type::size_type size_type;
+
   const double ZERO = 0.0;
   const double ONE = 1.0;
   const double TWO = 2.0;
@@ -113,359 +117,359 @@ void test_auto_1d ()
   const size_type numRows = 10;
   const size_type numCols = 3;
 
-  mv_type X = getView<Layout,Space>::get(numRows, numCols);
-  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view (X);
+  mv_type X = getView< Layout, Space >::get( numRows, numCols );
+  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view( X );
 
-  fill_2D<mv_type,Space> f1(X, ONE);
-  Kokkos::parallel_for(X.dimension_0(),f1);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ONE);
+  fill_2D< mv_type, Space > f1( X, ONE );
+  Kokkos::parallel_for( X.dimension_0(), f1 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ONE );
     }
   }
 
-  fill_2D<mv_type,Space> f2(X, 0.0);
-  Kokkos::parallel_for(X.dimension_0(),f2);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ZERO);
+  fill_2D< mv_type, Space > f2( X, 0.0 );
+  Kokkos::parallel_for( X.dimension_0(), f2 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
     }
   }
 
-  fill_2D<mv_type,Space> f3(X, TWO);
-  Kokkos::parallel_for(X.dimension_0(),f3);
-  Kokkos::deep_copy (X_h, X);
-  for (size_type j = 0; j < numCols; ++j) {
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == TWO);
+  fill_2D< mv_type, Space > f3( X, TWO );
+  Kokkos::parallel_for( X.dimension_0(), f3 );
+  Kokkos::deep_copy( X_h, X );
+  for ( size_type j = 0; j < numCols; ++j ) {
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == TWO );
     }
   }
 
-  for (size_type j = 0; j < numCols; ++j) {
-    auto X_j = Kokkos::subview (X, Kokkos::ALL, j);
+  for ( size_type j = 0; j < numCols; ++j ) {
+    auto X_j = Kokkos::subview( X, Kokkos::ALL, j );
 
-    fill_1D<decltype(X_j),Space> f4(X_j, ZERO);
-    Kokkos::parallel_for(X_j.dimension_0(),f4);
-    Kokkos::deep_copy (X_h, X);
-    for (size_type i = 0; i < numRows; ++i) {
-      ASSERT_TRUE(X_h(i,j) == ZERO);
+    fill_1D< decltype( X_j ), Space > f4( X_j, ZERO );
+    Kokkos::parallel_for( X_j.dimension_0(), f4 );
+    Kokkos::deep_copy( X_h, X );
+    for ( size_type i = 0; i < numRows; ++i ) {
+      ASSERT_TRUE( X_h( i, j ) == ZERO );
     }
 
-    for (size_type jj = 0; jj < numCols; ++jj) {
-      auto X_jj = Kokkos::subview (X, Kokkos::ALL, jj);
-      fill_1D<decltype(X_jj),Space> f5(X_jj, ONE);
-      Kokkos::parallel_for(X_jj.dimension_0(),f5);
-      Kokkos::deep_copy (X_h, X);
-      for (size_type i = 0; i < numRows; ++i) {
-        ASSERT_TRUE(X_h(i,jj) == ONE);
+    for ( size_type jj = 0; jj < numCols; ++jj ) {
+      auto X_jj = Kokkos::subview ( X, Kokkos::ALL, jj );
+      fill_1D< decltype( X_jj ), Space > f5( X_jj, ONE );
+      Kokkos::parallel_for( X_jj.dimension_0(), f5 );
+      Kokkos::deep_copy( X_h, X );
+      for ( size_type i = 0; i < numRows; ++i ) {
+        ASSERT_TRUE( X_h( i, jj ) == ONE );
       }
     }
   }
 }
 
-template<class LD, class LS, class Space>
-void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, int m) {
-  Kokkos::View<double**,LS,Space> l2d("l2d",n,m);
+template< class LD, class LS, class Space >
+void test_1d_strided_assignment_impl( bool a, bool b, bool c, bool d, int n, int m ) {
+  Kokkos::View< double**, LS, Space > l2d( "l2d", n, m );
 
-  int col = n>2?2:0;
-  int row = m>2?2:0;
+  int col = n > 2 ? 2 : 0;
+  int row = m > 2 ? 2 : 0;
 
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-  if(a) {
-    Kokkos::View<double*,LD,Space> l1da = Kokkos::subview(l2d,Kokkos::ALL,row);
-    ASSERT_TRUE( & l1da(0) == & l2d(0,row) );
-    if(n>1)
-      ASSERT_TRUE( & l1da(1) == & l2d(1,row) );
-  }
-  if(b && n>13) {
-    Kokkos::View<double*,LD,Space> l1db = Kokkos::subview(l2d,std::pair<unsigned,unsigned>(2,13),row);
-    ASSERT_TRUE( & l1db(0) == & l2d(2,row) );
-    ASSERT_TRUE( & l1db(1) == & l2d(3,row) );
-  }
-  if(c) {
-    Kokkos::View<double*,LD,Space> l1dc = Kokkos::subview(l2d,col,Kokkos::ALL);
-    ASSERT_TRUE( & l1dc(0) == & l2d(col,0) );
-    if(m>1)
-      ASSERT_TRUE( & l1dc(1) == & l2d(col,1) );
-  }
-  if(d && m>13) {
-    Kokkos::View<double*,LD,Space> l1dd = Kokkos::subview(l2d,col,std::pair<unsigned,unsigned>(2,13));
-    ASSERT_TRUE( & l1dd(0) == & l2d(col,2) );
-    ASSERT_TRUE( & l1dd(1) == & l2d(col,3) );
-  }
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    if ( a ) {
+      Kokkos::View< double*, LD, Space > l1da = Kokkos::subview( l2d, Kokkos::ALL, row );
+      ASSERT_TRUE( & l1da( 0 ) == & l2d( 0, row ) );
+      if ( n > 1 ) {
+        ASSERT_TRUE( & l1da( 1 ) == & l2d( 1, row ) );
+      }
+    }
+
+    if ( b && n > 13 ) {
+      Kokkos::View< double*, LD, Space > l1db = Kokkos::subview( l2d, std::pair< unsigned, unsigned >( 2, 13 ), row );
+      ASSERT_TRUE( & l1db( 0 ) == & l2d( 2, row ) );
+      ASSERT_TRUE( & l1db( 1 ) == & l2d( 3, row ) );
+    }
+
+    if ( c ) {
+      Kokkos::View< double*, LD, Space > l1dc = Kokkos::subview( l2d, col, Kokkos::ALL );
+      ASSERT_TRUE( & l1dc( 0 ) == & l2d( col, 0 ) );
+      if( m > 1 ) {
+        ASSERT_TRUE( & l1dc( 1 ) == & l2d( col, 1 ) );
+      }
+    }
+
+    if ( d && m > 13 ) {
+      Kokkos::View< double*, LD, Space > l1dd = Kokkos::subview( l2d, col, std::pair< unsigned, unsigned >( 2, 13 ) );
+      ASSERT_TRUE( & l1dd( 0 ) == & l2d( col, 2 ) );
+      ASSERT_TRUE( & l1dd( 1 ) == & l2d( col, 3 ) );
+    }
   }
 
 }
 
-template<class Space >
+template< class Space >
 void test_1d_strided_assignment() {
-  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutLeft,Space>(true,true,true,true,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutRight,Space>(true,true,true,true,17,3);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
-
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
-  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
-  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutLeft, Space >( true, true, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutStride, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 17, 3 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutLeft, Space >( true, true, false, false, 17, 1 );
+
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutLeft, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( false, false, true, true, 1, 17 );
+  test_1d_strided_assignment_impl< Kokkos::LayoutRight, Kokkos::LayoutRight, Space >( true, true, true, true, 17, 1 );
 }
 
 template< class Space >
 void test_left_0()
 {
-  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutLeft , Space >
-    view_static_8_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutLeft, Space > view_static_8_type;
 
-  view_static_8_type  x_static_8("x_static_left_8");
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_static_8_type x_static_8( "x_static_left_8" );
 
-  ASSERT_TRUE( x_static_8.is_contiguous() );
+    ASSERT_TRUE( x_static_8.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x_static_8(1,1,2,3,0,1,2,3) );
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x_static_8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                                 , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,3,1,1,2,3) );
-  ASSERT_TRUE( & x2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    ASSERT_TRUE( ! sx4.is_contiguous() );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_1()
 {
-  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutLeft , Space >
-    view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutLeft, Space > view_type;
 
-  view_type  x8("x_left_8",2,3,4,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type x8( "x_left_8", 2, 3, 4, 5 );
 
-  ASSERT_TRUE( x8.is_contiguous() );
+    ASSERT_TRUE( x8.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3, 0, 1, 2, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x8(1,1,2,3,0,1,2,3) );
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x8, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3
+                         , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,3,1,1,2,3) );
-  ASSERT_TRUE( & x2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    ASSERT_TRUE( ! sx4.is_contiguous() );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_2()
 {
-  typedef Kokkos::View< int **** , Kokkos::LayoutLeft , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-
-  view_type  x4("x4",2,3,4,5);
-
-  ASSERT_TRUE( x4.is_contiguous() );
-
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x4 , 0, 0, 0, 0 );
-
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & x4(0,0,0,0) );
-
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, 2, 3 );
-
-  ASSERT_TRUE( x1.is_contiguous() );
-  ASSERT_TRUE( & x1(0) == & x4(0,1,2,3) );
-  ASSERT_TRUE( & x1(1) == & x4(1,1,2,3) );
-
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, Kokkos::pair<int,int>(1,3), 2 );
-
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  ASSERT_TRUE( & x2(0,0) == & x4(0,1,1,2) );
-  ASSERT_TRUE( & x2(1,0) == & x4(1,1,1,2) );
-  ASSERT_TRUE( & x2(0,1) == & x4(0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x4(1,1,2,2) );
-
-  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x4, 1, Kokkos::pair<int,int>(0,2)
-                       , 2, Kokkos::pair<int,int>(1,4) );
-
-  ASSERT_TRUE( ! sx2.is_contiguous() );
-  ASSERT_TRUE( & sx2(0,0) == & x4(1,0,2,1) );
-  ASSERT_TRUE( & sx2(1,0) == & x4(1,1,2,1) );
-  ASSERT_TRUE( & sx2(0,1) == & x4(1,0,2,2) );
-  ASSERT_TRUE( & sx2(1,1) == & x4(1,1,2,2) );
-  ASSERT_TRUE( & sx2(0,2) == & x4(1,0,2,3) );
-  ASSERT_TRUE( & sx2(1,2) == & x4(1,1,2,3) );
-
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x4, Kokkos::pair<int,int>(1,2) /* of [2] */
-                       , Kokkos::pair<int,int>(1,3) /* of [3] */
-                       , Kokkos::pair<int,int>(0,4) /* of [4] */
-                       , Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  ASSERT_TRUE( ! sx4.is_contiguous() );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x4( 1+i0, 1+i1, 0+i2, 2+i3 ) );
-  }
-
+  typedef Kokkos::View< int ****, Kokkos::LayoutLeft, Space > view_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_type x4( "x4", 2, 3, 4, 5 );
+
+    ASSERT_TRUE( x4.is_contiguous() );
+
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( x4, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & x4( 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( x1.is_contiguous() );
+    ASSERT_TRUE( & x1( 0 ) == & x4( 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 0, 2 ), 1
+                         , Kokkos::pair< int, int >( 1, 3 ), 2 );
+
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x4( 0, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x4( 1, 1, 1, 2 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x4( 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutLeft, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x4, 1, Kokkos::pair< int, int >( 0, 2 )
+                         , 2, Kokkos::pair< int, int >( 1, 4 ) );
+
+    ASSERT_TRUE( ! sx2.is_contiguous() );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x4( 1, 0, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x4( 1, 1, 2, 1 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x4( 1, 0, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x4( 1, 1, 2, 2 ) );
+    ASSERT_TRUE( & sx2( 0, 2 ) == & x4( 1, 0, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 2 ) == & x4( 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x4, Kokkos::pair< int, int >( 1, 2 ) /* of [2] */
+                         , Kokkos::pair< int, int >( 1, 3 ) /* of [3] */
+                         , Kokkos::pair< int, int >( 0, 4 ) /* of [4] */
+                         , Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( ! sx4.is_contiguous() );
+
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x4( 1 + i0, 1 + i1, 0 + i2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_left_3()
 {
-  typedef Kokkos::View< int ** , Kokkos::LayoutLeft , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int **, Kokkos::LayoutLeft, Space > view_type;
 
-  view_type  xm("x4",10,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
 
-  ASSERT_TRUE( xm.is_contiguous() );
+    ASSERT_TRUE( xm.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( xm , 5, 3 );
+    Kokkos::View< int, Kokkos::LayoutLeft, Space > x0 = Kokkos::subview( xm, 5, 3 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & xm(5,3) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
-    Kokkos::subview( xm, Kokkos::ALL, 3 );
+    Kokkos::View< int*, Kokkos::LayoutLeft, Space > x1 = Kokkos::subview( xm, Kokkos::ALL, 3 );
 
-  ASSERT_TRUE( x1.is_contiguous() );
-  for ( int i = 0 ; i < int(xm.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x1(i) == & xm(i,3) );
-  }
-
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
-    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL );
+    ASSERT_TRUE( x1.is_contiguous() );
+    for ( int i = 0; i < int( xm.dimension_0() ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( i, 3 ) );
+    }
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2(i,j) == & xm(1+i,j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2 =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2c =
-    Kokkos::subview( xm, Kokkos::ALL, std::pair<int,int>(2,4) );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    for ( int j = 0; j < int( x2.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( 1 + i, j ) );
+    }
 
-  ASSERT_TRUE( x2c.is_contiguous() );
-  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2c(i,j) == & xm(i,2+j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2c =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n1 =
-    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL );
+    ASSERT_TRUE( x2c.is_contiguous() );
+    for ( int j = 0; j < int( x2c.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2c.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( i, 2 + j ) );
+    }
 
-  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
-  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n2 =
-    Kokkos::subview( xm , Kokkos::ALL , std::pair<int,int>(1,1) );
+    ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+    ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
 
-  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
-  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+    Kokkos::View< int**, Kokkos::LayoutLeft, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
 
+    ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+    ASSERT_TRUE( x2_n2.dimension_1() == 0 );
   }
 }
 
@@ -474,766 +478,814 @@ void test_left_3()
 template< class Space >
 void test_right_0()
 {
-  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutRight , Space >
-    view_static_8_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
-
-  view_static_8_type  x_static_8("x_static_right_8");
-
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
-
-  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
-
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
-
-  ASSERT_TRUE( x1.dimension_0() == 2 );
-  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,1) );
-  ASSERT_TRUE( & x1(1) == & x_static_8(0,1,2,3,0,1,2,2) );
-
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
-                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
-
-  ASSERT_TRUE( x2.dimension_0() == 2 );
-  ASSERT_TRUE( x2.dimension_1() == 2 );
-  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,1,0,1,2,1) );
-  ASSERT_TRUE( & x2(1,0) == & x_static_8(0,1,2,2,0,1,2,1) );
-  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,1,0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x_static_8(0,1,2,2,0,1,2,2) );
-
-  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
-
-  ASSERT_TRUE( sx2.dimension_0() == 2 );
-  ASSERT_TRUE( sx2.dimension_1() == 2 );
-  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
-
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  ASSERT_TRUE( sx4.dimension_0() == 2 );
-  ASSERT_TRUE( sx4.dimension_1() == 2 );
-  ASSERT_TRUE( sx4.dimension_2() == 2 );
-  ASSERT_TRUE( sx4.dimension_3() == 2 );
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0, 0+i0, 1, 1+i1, 1, 0+i2, 2, 2+i3) );
-  }
-
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5], Kokkos::LayoutRight, Space > view_static_8_type;
+
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_static_8_type x_static_8( "x_static_right_8" );
+
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x_static_8, 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    ASSERT_TRUE( & x0() == & x_static_8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
+
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x1.dimension_0() == 2 );
+    ASSERT_TRUE( & x1( 0 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x_static_8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
+
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                                 , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
+
+    ASSERT_TRUE( x2.dimension_0() == 2 );
+    ASSERT_TRUE( x2.dimension_1() == 2 );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x_static_8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x_static_8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
+
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x_static_8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                                    , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
+
+    ASSERT_TRUE( sx2.dimension_0() == 2 );
+    ASSERT_TRUE( sx2.dimension_1() == 2 );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x_static_8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x_static_8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x_static_8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x_static_8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
+
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x_static_8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                                 , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                                 , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
+
+    ASSERT_TRUE( sx4.dimension_0() == 2 );
+    ASSERT_TRUE( sx4.dimension_1() == 2 );
+    ASSERT_TRUE( sx4.dimension_2() == 2 );
+    ASSERT_TRUE( sx4.dimension_3() == 2 );
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x_static_8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_right_1()
 {
-  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutRight , Space >
-    view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int ****[2][3][4][5], Kokkos::LayoutRight, Space > view_type;
 
-  view_type  x8("x_right_8",2,3,4,5);
+  if ( Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace, typename Space::memory_space>::accessible ) {
+    view_type x8( "x_right_8", 2, 3, 4, 5 );
 
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( x8, 0, 0, 0, 0, 0, 0, 0, 0 );
 
-  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+    ASSERT_TRUE( & x0() == & x8( 0, 0, 0, 0, 0, 0, 0, 0 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 =
+      Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
 
-  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,1) );
-  ASSERT_TRUE( & x1(1) == & x8(0,1,2,3,0,1,2,2) );
+    ASSERT_TRUE( & x1( 0 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x1( 1 ) == & x8( 0, 1, 2, 3, 0, 1, 2, 2 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( x8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
-                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( x8, 0, 1, 2, Kokkos::pair< int, int >( 1, 3 )
+                         , 0, 1, 2, Kokkos::pair< int, int >( 1, 3 ) );
 
-  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,1,0,1,2,1) );
-  ASSERT_TRUE( & x2(1,0) == & x8(0,1,2,2,0,1,2,1) );
-  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,1,0,1,2,2) );
-  ASSERT_TRUE( & x2(1,1) == & x8(0,1,2,2,0,1,2,2) );
+    ASSERT_TRUE( & x2( 0, 0 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 1, 0 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 1 ) );
+    ASSERT_TRUE( & x2( 0, 1 ) == & x8( 0, 1, 2, 1, 0, 1, 2, 2 ) );
+    ASSERT_TRUE( & x2( 1, 1 ) == & x8( 0, 1, 2, 2, 0, 1, 2, 2 ) );
 
-  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
-  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
-    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
-                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+    // Kokkos::View< int**, Kokkos::LayoutRight, Space > error_2 =
+    Kokkos::View< int**, Kokkos::LayoutStride, Space > sx2 =
+      Kokkos::subview( x8, 1, Kokkos::pair< int, int >( 0, 2 ), 2, 3
+                            , Kokkos::pair< int, int >( 0, 2 ), 1, 2, 3 );
 
-  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
-  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
-  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+    ASSERT_TRUE( & sx2( 0, 0 ) == & x8( 1, 0, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 0 ) == & x8( 1, 1, 2, 3, 0, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 0, 1 ) == & x8( 1, 0, 2, 3, 1, 1, 2, 3 ) );
+    ASSERT_TRUE( & sx2( 1, 1 ) == & x8( 1, 1, 2, 3, 1, 1, 2, 3 ) );
 
-  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
-    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
-                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
-                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
-                   );
-
-  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
-  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
-  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
-  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
-    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
-  }
+    Kokkos::View< int****, Kokkos::LayoutStride, Space > sx4 =
+      Kokkos::subview( x8, 0, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 1, Kokkos::pair< int, int >( 1, 3 ) /* of [5] */
+                         , 1, Kokkos::pair< int, int >( 0, 2 ) /* of [3] */
+                         , 2, Kokkos::pair< int, int >( 2, 4 ) /* of [5] */
+                     );
 
+    for ( int i0 = 0; i0 < (int) sx4.dimension_0(); ++i0 )
+    for ( int i1 = 0; i1 < (int) sx4.dimension_1(); ++i1 )
+    for ( int i2 = 0; i2 < (int) sx4.dimension_2(); ++i2 )
+    for ( int i3 = 0; i3 < (int) sx4.dimension_3(); ++i3 )
+    {
+      ASSERT_TRUE( & sx4( i0, i1, i2, i3 ) == & x8( 0, 0 + i0, 1, 1 + i1, 1, 0 + i2, 2, 2 + i3 ) );
+    }
   }
 }
 
 template< class Space >
 void test_right_3()
 {
-  typedef Kokkos::View< int ** , Kokkos::LayoutRight , Space > view_type ;
-
-  if(Kokkos::Impl::SpaceAccessibility<Kokkos::HostSpace,typename Space::memory_space>::accessible) {
+  typedef Kokkos::View< int **, Kokkos::LayoutRight, Space > view_type;
 
-  view_type  xm("x4",10,5);
+  if ( Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, typename Space::memory_space >::accessible ) {
+    view_type xm( "x4", 10, 5 );
 
-  ASSERT_TRUE( xm.is_contiguous() );
+    ASSERT_TRUE( xm.is_contiguous() );
 
-  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( xm , 5, 3 );
+    Kokkos::View< int, Kokkos::LayoutRight, Space > x0 = Kokkos::subview( xm, 5, 3 );
 
-  ASSERT_TRUE( x0.is_contiguous() );
-  ASSERT_TRUE( & x0() == & xm(5,3) );
+    ASSERT_TRUE( x0.is_contiguous() );
+    ASSERT_TRUE( & x0() == & xm( 5, 3 ) );
 
-  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
-    Kokkos::subview( xm, 3, Kokkos::ALL );
-
-  ASSERT_TRUE( x1.is_contiguous() );
-  for ( int i = 0 ; i < int(xm.dimension_1()) ; ++i ) {
-    ASSERT_TRUE( & x1(i) == & xm(3,i) );
-  }
+    Kokkos::View< int*, Kokkos::LayoutRight, Space > x1 = Kokkos::subview( xm, 3, Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2c =
-    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), Kokkos::ALL );
+    ASSERT_TRUE( x1.is_contiguous() );
+    for ( int i = 0; i < int( xm.dimension_1() ); ++i ) {
+      ASSERT_TRUE( & x1( i ) == & xm( 3, i ) );
+    }
 
-  ASSERT_TRUE( x2c.is_contiguous() );
-  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2c(i,j) == & xm(1+i,j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2c =
+      Kokkos::subview( xm, Kokkos::pair< int, int >( 1, 9 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
-    Kokkos::subview( xm, Kokkos::ALL, std::pair<int,int>(2,4) );
+    ASSERT_TRUE( x2c.is_contiguous() );
+    for ( int j = 0; j < int( x2c.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2c.dimension_0() ); ++i ) {
+      ASSERT_TRUE( & x2c( i, j ) == & xm( 1 + i, j ) );
+    }
 
-  ASSERT_TRUE( ! x2.is_contiguous() );
-  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
-  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
-    ASSERT_TRUE( & x2(i,j) == & xm(i,2+j) );
-  }
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 2, 4 ) );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n1 =
-    Kokkos::subview( xm , std::pair<int,int>(1,1) , Kokkos::ALL );
+    ASSERT_TRUE( ! x2.is_contiguous() );
+    for ( int j = 0; j < int( x2.dimension_1() ); ++j )
+    for ( int i = 0; i < int( x2.dimension_0() ); ++i )
+    {
+      ASSERT_TRUE( & x2( i, j ) == & xm( i, 2 + j ) );
+    }
 
-  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
-  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n1 =
+      Kokkos::subview( xm, std::pair< int, int >( 1, 1 ), Kokkos::ALL );
 
-  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n2 =
-    Kokkos::subview( xm , Kokkos::ALL , std::pair<int,int>(1,1) );
+    ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+    ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
 
-  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
-  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+    Kokkos::View< int**, Kokkos::LayoutRight, Space > x2_n2 =
+      Kokkos::subview( xm, Kokkos::ALL, std::pair< int, int >( 1, 1 ) );
 
+    ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+    ASSERT_TRUE( x2_n2.dimension_1() == 0 );
   }
 }
 
 namespace Impl {
 
-constexpr int N0=113;
-constexpr int N1=11;
-constexpr int N2=17;
-constexpr int N3=5;
-constexpr int N4=7;
+constexpr int N0 = 113;
+constexpr int N1 = 11;
+constexpr int N2 = 17;
+constexpr int N3 = 5;
+constexpr int N4 = 7;
 
-template<class SubView,class View>
-void test_Check1D(SubView a, View b, std::pair<int,int> range) {
+template< class SubView, class View >
+void test_Check1D( SubView a, View b, std::pair< int, int > range ) {
   int errors = 0;
-  for(int i=0;i<range.second-range.first;i++) {
-    if(a(i)!=b(i+range.first))
-      errors++;
+
+  for ( int i = 0; i < range.second - range.first; i++ ) {
+    if ( a( i ) != b( i + range.first ) ) errors++;
+  }
+
+  if ( errors > 0 ) {
+    std::cout << "Error Subviews test_Check1D: " << errors << std::endl;
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check1D: " << errors <<std::endl;
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check1D2D(SubView a, View b, int i0, std::pair<int,int> range) {
+template< class SubView, class View >
+void test_Check1D2D( SubView a, View b, int i0, std::pair< int, int > range ) {
   int errors = 0;
-  for(int i1=0;i1<range.second-range.first;i1++) {
-    if(a(i1)!=b(i0,i1+range.first))
-      errors++;
+
+  for ( int i1 = 0; i1 < range.second - range.first; i1++ ) {
+    if ( a( i1 ) != b( i0, i1 + range.first ) ) errors++;
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check1D2D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Subviews test_Check1D2D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check2D3D(SubView a, View b, int i0, std::pair<int,int> range1, std::pair<int,int> range2) {
+template< class SubView, class View >
+void test_Check2D3D( SubView a, View b, int i0, std::pair< int, int > range1
+                   , std::pair< int, int > range2 )
+{
   int errors = 0;
-  for(int i1=0;i1<range1.second-range1.first;i1++) {
-    for(int i2=0;i2<range2.second-range2.first;i2++) {
-      if(a(i1,i2)!=b(i0,i1+range1.first,i2+range2.first))
-        errors++;
+
+  for ( int i1 = 0; i1 < range1.second - range1.first; i1++ ) {
+    for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+      if ( a( i1, i2 ) != b( i0, i1 + range1.first, i2 + range2.first ) ) errors++;
     }
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check2D3D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Subviews test_Check2D3D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class SubView,class View>
-void test_Check3D5D(SubView a, View b, int i0, int i1, std::pair<int,int> range2, std::pair<int,int> range3, std::pair<int,int> range4) {
+template<class SubView, class View>
+void test_Check3D5D( SubView a, View b, int i0, int i1, std::pair< int, int > range2
+                   , std::pair< int, int > range3, std::pair< int, int > range4 )
+{
   int errors = 0;
-  for(int i2=0;i2<range2.second-range2.first;i2++) {
-    for(int i3=0;i3<range3.second-range3.first;i3++) {
-      for(int i4=0;i4<range4.second-range4.first;i4++) {
-        if(a(i2,i3,i4)!=b(i0,i1,i2+range2.first,i3+range3.first,i4+range4.first))
+
+  for ( int i2 = 0; i2 < range2.second - range2.first; i2++ ) {
+    for ( int i3 = 0; i3 < range3.second - range3.first; i3++ ) {
+      for ( int i4 = 0; i4 < range4.second - range4.first; i4++ ) {
+        if ( a( i2, i3, i4 ) != b( i0, i1, i2 + range2.first, i3 + range3.first, i4 + range4.first ) ) {
           errors++;
+        }
       }
     }
   }
-  if(errors>0)
-    std::cout << "Error Suviews test_Check3D5D: " << errors <<std::endl;
+
+  if ( errors > 0 ) {
+    std::cout << "Error Subviews test_Check3D5D: " << errors << std::endl;
+  }
+
   ASSERT_TRUE( errors == 0 );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_1d_assign_impl() {
-
-  { //Breaks
-    Kokkos::View<int*,LayoutOrg,Space> a_org("A",N0);
-    Kokkos::View<int*,LayoutOrg,Space,MemTraits> a(a_org);
+  { // Breaks.
+    Kokkos::View< int*, LayoutOrg, Space > a_org( "A", N0 );
+    Kokkos::View< int*, LayoutOrg, Space, MemTraits > a( a_org );
     Kokkos::fence();
-    for(int i=0; i<N0; i++)
-      a_org(i) = i;
+    for ( int i = 0; i < N0; i++ ) a_org( i ) = i;
 
-    Kokkos::View<int[N0],Layout,Space,MemTraits> a1(a);
+    Kokkos::View< int[N0], Layout, Space, MemTraits > a1( a );
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
 
-    Kokkos::View<int[N0],LayoutSub,Space,MemTraits> a2(a1);
+    Kokkos::View< int[N0], LayoutSub, Space, MemTraits > a2( a1 );
     Kokkos::fence();
-    test_Check1D(a2,a,std::pair<int,int>(0,N0));
+    test_Check1D( a2, a, std::pair< int, int >( 0, N0 ) );
     a1 = a;
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
 
-    //Runtime Fail expected
-    //Kokkos::View<int[N1]> afail1(a);
+    // Runtime Fail expected.
+    //Kokkos::View< int[N1] > afail1( a );
 
-    //Compile Time Fail expected
-    //Kokkos::View<int[N1]> afail2(a1);
+    // Compile Time Fail expected.
+    //Kokkos::View< int[N1] > afail2( a1 );
   }
 
-  { // Works
-    Kokkos::View<int[N0],LayoutOrg,Space,MemTraits> a("A");
-    Kokkos::View<int*,Layout,Space,MemTraits> a1(a);
+  { // Works.
+    Kokkos::View< int[N0], LayoutOrg, Space, MemTraits > a( "A" );
+    Kokkos::View< int*, Layout, Space, MemTraits > a1( a );
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
     a1 = a;
     Kokkos::fence();
-    test_Check1D(a1,a,std::pair<int,int>(0,N0));
+    test_Check1D( a1, a, std::pair< int, int >( 0, N0 ) );
   }
 }
 
-template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg,class MemTraits>
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_2d_subview_3d_impl_type() {
-  Kokkos::View<int***,LayoutOrg,Space> a_org("A",N0,N1,N2);
-  Kokkos::View<Type,Layout,Space,MemTraits> a(a_org);
-  for(int i0=0; i0<N0; i0++)
-    for(int i1=0; i1<N1; i1++)
-      for(int i2=0; i2<N2; i2++)
-        a_org(i0,i1,i2) = i0*1000000+i1*1000+i2;
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a1;
-  a1 = Kokkos::subview(a,3,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< int***, LayoutOrg, Space > a_org( "A", N0, N1, N2 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  {
+    a_org( i0, i1, i2 ) = i0 * 1000000 + i1 * 1000 + i2;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check2D3D(a1,a,3,std::pair<int,int>(0,N1),std::pair<int,int>(0,N2));
+  test_Check2D3D( a1, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
 
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a2(a,3,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check2D3D(a2,a,3,std::pair<int,int>(0,N1),std::pair<int,int>(0,N2));
+  test_Check2D3D( a2, a, 3, std::pair< int, int >( 0, N1 ), std::pair< int, int >( 0, N2 ) );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_2d_subview_3d_impl_layout() {
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int[N0][N1][N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int[N0][N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int*   [N1][N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int*   [N1][N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int**      [N2],int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int**      [N2], int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,int***         ,int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int***         ,int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,int***         ,int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, int***         , int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, int***         , int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int[N0][N1][N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int[N0][N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int*   [N1][N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int*   [N1][N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int**      [N2],const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int**      [N2], const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int[N1][N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int*   [N2],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_2d_subview_3d_impl_type<Space,const int***         ,const int**      ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int[N1][N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int*   [N2], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_2d_subview_3d_impl_type< Space, const int***         , const int**      , LayoutSub, Layout, LayoutOrg, MemTraits >();
 }
 
-template<class Space, class Type, class TypeSub,class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class Type, class TypeSub, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >
 void test_3d_subview_5d_impl_type() {
-  Kokkos::View<int*****,LayoutOrg,Space> a_org("A",N0,N1,N2,N3,N4);
-  Kokkos::View<Type,Layout,Space,MemTraits> a(a_org);
-  for(int i0=0; i0<N0; i0++)
-    for(int i1=0; i1<N1; i1++)
-      for(int i2=0; i2<N2; i2++)
-        for(int i3=0; i3<N3; i3++)
-          for(int i4=0; i4<N4; i4++)
-            a_org(i0,i1,i2,i3,i4) = i0*1000000+i1*10000+i2*100+i3*10+i4;
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a1;
-  a1 = Kokkos::subview(a,3,5,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< int*****, LayoutOrg, Space > a_org( "A", N0, N1, N2, N3, N4 );
+  Kokkos::View< Type, Layout, Space, MemTraits > a( a_org );
+
+  for ( int i0 = 0; i0 < N0; i0++ )
+  for ( int i1 = 0; i1 < N1; i1++ )
+  for ( int i2 = 0; i2 < N2; i2++ )
+  for ( int i3 = 0; i3 < N3; i3++ )
+  for ( int i4 = 0; i4 < N4; i4++ )
+  {
+    a_org( i0, i1, i2, i3, i4 ) = i0 * 1000000 + i1 * 10000 + i2 * 100 + i3 * 10 + i4;
+  }
+
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a1;
+  a1 = Kokkos::subview( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check3D5D(a1,a,3,5,std::pair<int,int>(0,N2),std::pair<int,int>(0,N3),std::pair<int,int>(0,N4));
+  test_Check3D5D( a1, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
 
-  Kokkos::View<TypeSub,LayoutSub,Space,MemTraits> a2(a,3,5,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
+  Kokkos::View< TypeSub, LayoutSub, Space, MemTraits > a2( a, 3, 5, Kokkos::ALL, Kokkos::ALL, Kokkos::ALL );
   Kokkos::fence();
-  test_Check3D5D(a2,a,3,5,std::pair<int,int>(0,N2),std::pair<int,int>(0,N3),std::pair<int,int>(0,N4));
+  test_Check3D5D( a2, a, 3, 5, std::pair< int, int >( 0, N2 ), std::pair< int, int >( 0, N3 ), std::pair< int, int >( 0, N4 ) );
 }
 
-template<class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits>
+template< class Space, class LayoutSub, class Layout, class LayoutOrg, class MemTraits >  // Instantiates test_3d_subview_5d_impl_type for 6 five-dimensional data types x 4 three-dimensional data types, first with int, then with const int.
 void test_3d_subview_5d_impl_layout() {
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int[N0][N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*   [N1][N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int**      [N2][N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int***         [N3][N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int****            [N4],int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, int*****               ,int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, int*****               ,int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int[N0][N1][N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*   [N1][N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int**      [N2][N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int***         [N3][N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int****            [N4],const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
-
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int[N2][N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int*   [N3][N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int**      [N4],LayoutSub, Layout, LayoutOrg, MemTraits>();
-  test_3d_subview_5d_impl_type<Space, const int*****               ,const int***         ,LayoutSub, Layout, LayoutOrg, MemTraits>();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int[N0][N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*   [N1][N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int**      [N2][N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int***         [N3][N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int****            [N4], int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, int*****               , int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, int*****               , int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+  // Same 24 type combinations again, with const int as the element type on both sides.
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int[N0][N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*   [N1][N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int**      [N2][N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int***         [N3][N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int****            [N4], const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
+
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int[N2][N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int*   [N3][N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int**      [N4], LayoutSub, Layout, LayoutOrg, MemTraits >();
+  test_3d_subview_5d_impl_type< Space, const int*****               , const int***         , LayoutSub, Layout, LayoutOrg, MemTraits >();
 }
 
 inline
 void test_subview_legal_args_right() {
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutRight,Kokkos::LayoutRight,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );  // Group: slots 0-2 are ranges (ALL_t or pair), slots 3-4 are int; all eight range combinations expect 0.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  // Group: slots 1 and 4 are int, slots 0, 2, 3 are ranges; all eight range combinations expect 0.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: slots 2 and 4 are int, slots 0, 1, 3 are ranges; all eight range combinations expect 0.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: slots 0 and 4 are int, slots 1-3 are ranges; all eight range combinations expect 0.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: slots 0 and 3 are int, slots 1, 2, 4 are ranges; all eight range combinations expect 0.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  // Group: slots 0 and 1 are int, slots 2-4 are ranges; expects 1 only when slots 3 and 4 are both ALL_t.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  // Group: three-slot form (3, 3, 0) with no int slots; expects 1 only when slots 1 and 2 are both ALL_t.
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutRight, Kokkos::LayoutRight, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
 }
 
 inline
 void test_subview_legal_args_left() {
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,int,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,int,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,5,0,int,int,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(1,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::Impl::ALL_t,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::Impl::ALL_t>::value));
-  ASSERT_EQ(0,(Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime<Kokkos::LayoutLeft,Kokkos::LayoutLeft,3,3,0,Kokkos::pair<int,int>,Kokkos::pair<int,int>,Kokkos::pair<int,int>>::value));
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) ); // Group: integers at argument positions 3 and 4; expected legal exactly when position 1 is ALL_t.
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, int >::value ) );
+  // Group: integers at argument positions 1 and 4; every combination is expected to be illegal.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: integers at argument positions 2 and 4; every combination is expected to be illegal.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: integers at argument positions 0 and 4; every combination is expected to be illegal.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int >::value ) );
+  // Group: integers at argument positions 0 and 3; every combination is expected to be illegal.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, int, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, int, Kokkos::Impl::ALL_t >::value ) );
+  // Group: integers at argument positions 0 and 1; every combination is expected to be illegal.
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 5, 0, int, int, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  // Group: rank 3 from rank 3 (no integer arguments); expected legal exactly when position 1 is ALL_t.
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 1, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::Impl::ALL_t, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::Impl::ALL_t >::value ) );
+  ASSERT_EQ( 0, ( Kokkos::Experimental::Impl::SubviewLegalArgsCompileTime< Kokkos::LayoutLeft, Kokkos::LayoutLeft, 3, 3, 0, Kokkos::pair<int, int>, Kokkos::pair<int, int>, Kokkos::pair<int, int> >::value ) );
 }
 
-}
+} // namespace Impl
 
-template< class Space, class MemTraits = void>
+template< class Space, class MemTraits = void > // Calls Impl::test_1d_assign_impl once per enabled layout triple; MemTraits is forwarded unchanged.
 void test_1d_assign() {
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft  ,Kokkos::LayoutLeft, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutRight ,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight ,Kokkos::LayoutRight, MemTraits>();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutRight ,Kokkos::LayoutRight, MemTraits>();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutLeft  ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
-  //Impl::test_1d_assign_impl<Space,Kokkos::LayoutRight ,Kokkos::LayoutStride,Kokkos::LayoutLeft  >();
-  Impl::test_1d_assign_impl<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft, MemTraits>();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutLeft, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft, Kokkos::LayoutLeft, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutRight, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutLeft, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  //Impl::test_1d_assign_impl< Space, Kokkos::LayoutRight, Kokkos::LayoutStride, Kokkos::LayoutLeft >();
+  Impl::test_1d_assign_impl< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft, MemTraits >(); // NOTE(review): the commented-out triples above are disabled and never passed MemTraits; reason not visible here.
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void > // Calls Impl::test_2d_subview_3d_impl_layout for five layout triples; MemTraits is forwarded unchanged.
 void test_2d_subview_3d() {
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutRight ,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight, MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits>();
-  Impl::test_2d_subview_3d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft,  MemTraits>();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutRight, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >(); // Triples ending in LayoutRight.
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >(); // Triples ending in LayoutLeft.
+  Impl::test_2d_subview_3d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void > // Calls Impl::test_3d_subview_5d_impl_layout for the two layout triples ending in LayoutRight.
 void test_3d_subview_5d_right() {
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits>();
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutRight, MemTraits>();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutRight, Kokkos::LayoutRight, MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutRight, MemTraits >();
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void > // Calls Impl::test_3d_subview_5d_impl_layout for the two layout triples ending in LayoutLeft.
 void test_3d_subview_5d_left() {
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits>();
-  Impl::test_3d_subview_5d_impl_layout<Space,Kokkos::LayoutStride,Kokkos::LayoutStride,Kokkos::LayoutLeft,  MemTraits>();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutLeft,  Kokkos::LayoutLeft,  MemTraits >();
+  Impl::test_3d_subview_5d_impl_layout< Space, Kokkos::LayoutStride, Kokkos::LayoutStride, Kokkos::LayoutLeft,  MemTraits >();
 }
 
+namespace Impl {
 
+template< class Layout, class Space > // Functor: decodes a flat index into ( i, j, k ) and stores 1000000 * i + 1000 * j + k at a( i, j, k ).
+struct FillView_3D {
+  Kokkos::View< int***, Layout, Space > a; // View written by operator().
 
-namespace Impl {
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const // ii: flat index; presumably in [0, dimension_0 * dimension_1 * dimension_2) - confirm against caller.
+  {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % a.dimension_0()
+                : ii / ( a.dimension_1() * a.dimension_2() );
+    // LayoutLeft: i varies fastest and k slowest; any other layout: k varies fastest and i slowest.
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / a.dimension_0() ) % a.dimension_1()
+                : ( ii / a.dimension_2() ) % a.dimension_1();
+    // k must use the same LayoutLeft selector as i and j so that each flat index maps to a distinct element.
+    const int k = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii / ( a.dimension_0() * a.dimension_1() )
+                : ii % a.dimension_2();
 
-  template<class Layout, class Space>
-  struct FillView_3D {
-    Kokkos::View<int***,Layout,Space> a;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % a.dimension_0(): ii / (a.dimension_1()*a.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / a.dimension_0()) % a.dimension_1() : (ii / a.dimension_2()) % a.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (a.dimension_0() * a.dimension_1()) : ii % a.dimension_2();
-      a(i,j,k) = 1000000 * i + 1000 * j + k;
+    a( i, j, k ) = 1000000 * i + 1000 * j + k; // Stored value encodes the element's own indices.
+  }
+};
+
+template< class Layout, class Space > // Functor: decodes a flat index into ( i, j, k, l ) and stores 1000000 * i + 10000 * j + 100 * k + l at a( i, j, k, l ).
+struct FillView_4D {
+  Kokkos::View< int****, Layout, Space > a; // View written by operator().
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const { // ii: flat index; presumably in [0, product of the four dimensions) - confirm against caller.
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % a.dimension_0()
+                : ii / ( a.dimension_1() * a.dimension_2() * a.dimension_3() );
+    // LayoutLeft: i varies fastest and l slowest; any other layout: l varies fastest and i slowest.
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / a.dimension_0() ) % a.dimension_1()
+                : ( ii / ( a.dimension_2() * a.dimension_3() ) % a.dimension_1() );
+    // k and l must use the same LayoutLeft selector as i and j so that each flat index maps to a distinct element.
+    const int k = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / ( a.dimension_0() * a.dimension_1() ) ) % a.dimension_2()
+                : ( ii / a.dimension_3() ) % a.dimension_2();
+
+    const int l = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii / ( a.dimension_0() * a.dimension_1() * a.dimension_2() )
+                : ii % a.dimension_3();
+
+    a( i, j, k, l ) = 1000000 * i + 10000 * j + 100 * k + l; // Stored value encodes the element's own indices.
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_3D {
+  Kokkos::View< const int***, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const
+  {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.dimension_0()
+                : ii / ( b.dimension_1() * b.dimension_2() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.dimension_0() ) % b.dimension_1()
+                : ( ii / b.dimension_2() ) % b.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.dimension_0() * b.dimension_1() )
+                : ii % b.dimension_2();
+
+    if ( a( i + offset_0, j, k + offset_2 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-3D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
     }
-  };
-
-  template<class Layout, class Space>
-  struct FillView_4D {
-    Kokkos::View<int****,Layout,Space> a;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-         ii % a.dimension_0(): ii / (a.dimension_1()*a.dimension_2()*a.dimension_3());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / a.dimension_0()) % a.dimension_1() : (ii / (a.dimension_2()*a.dimension_3()) % a.dimension_1());
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        (ii / (a.dimension_0() * a.dimension_1())) % a.dimension_2() : (ii / a.dimension_3()) % a.dimension_2();
-      const int l = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-         ii / (a.dimension_0() * a.dimension_1() * a.dimension_2()) : ii % a.dimension_3();
-      a(i,j,k,l) = 1000000 * i + 10000 * j + 100 * k + l;
+  }
+};
+
+template< class Layout, class Space, class MemTraits >
+struct CheckSubviewCorrectness_3D_4D {
+  Kokkos::View< const int****, Layout, Space, MemTraits > a;
+  Kokkos::View< const int***, Layout, Space, MemTraits > b;
+  int offset_0, offset_2, index;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int & ii ) const {
+    const int i = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ii % b.dimension_0()
+                : ii / ( b.dimension_1() * b.dimension_2() );
+
+    const int j = std::is_same< Layout, Kokkos::LayoutLeft >::value
+                ? ( ii / b.dimension_0() ) % b.dimension_1()
+                : ( ii / b.dimension_2() ) % b.dimension_1();
+
+    const int k = std::is_same< Layout, Kokkos::LayoutRight >::value
+                ? ii / ( b.dimension_0() * b.dimension_1() )
+                : ii % b.dimension_2();
+
+    int i0, i1, i2, i3;
+
+    if ( std::is_same< Layout, Kokkos::LayoutLeft >::value ) {
+      i0 = i + offset_0;
+      i1 = j;
+      i2 = k + offset_2;
+      i3 = index;
     }
-  }; 
-
-  template<class Layout, class Space, class MemTraits>
-  struct CheckSubviewCorrectness_3D_3D {
-    Kokkos::View<const int***,Layout,Space,MemTraits> a;
-    Kokkos::View<const int***,Layout,Space,MemTraits> b;
-    int offset_0,offset_2;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % b.dimension_0(): ii / (b.dimension_1()*b.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / b.dimension_0()) % b.dimension_1() : (ii / b.dimension_2()) % b.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (b.dimension_0() * b.dimension_1()) : ii % b.dimension_2();
-      if( a(i+offset_0,j,k+offset_2) != b(i,j,k))
-        Kokkos::abort("Error: check_subview_correctness 3D-3D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)");
+    else {
+      i0 = index;
+      i1 = i + offset_0;
+      i2 = j;
+      i3 = k + offset_2;
     }
-  };
-
-  template<class Layout, class Space, class MemTraits>
-  struct CheckSubviewCorrectness_3D_4D {
-    Kokkos::View<const int****,Layout,Space,MemTraits> a;
-    Kokkos::View<const int***,Layout,Space,MemTraits> b;
-    int offset_0,offset_2,index;
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int& ii) const {
-      const int i = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        ii % b.dimension_0(): ii / (b.dimension_1()*b.dimension_2());
-      const int j = std::is_same<Layout,Kokkos::LayoutLeft>::value ?
-        (ii / b.dimension_0()) % b.dimension_1() : (ii / b.dimension_2()) % b.dimension_1();
-      const int k = std::is_same<Layout,Kokkos::LayoutRight>::value ?
-        ii / (b.dimension_0() * b.dimension_1()) : ii % b.dimension_2();
-
-      int i0,i1,i2,i3;
-      if(std::is_same<Layout,Kokkos::LayoutLeft>::value) {
-        i0 = i + offset_0;
-        i1 = j;
-        i2 = k + offset_2;
-        i3 = index;
-      } else {
-        i0 = index;
-        i1 = i + offset_0;
-        i2 = j;
-        i3 = k + offset_2;
-      }
-      if( a(i0,i1,i2,i3) != b(i,j,k))
-        Kokkos::abort("Error: check_subview_correctness 3D-4D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)");
+
+    if ( a( i0, i1, i2, i3 ) != b( i, j, k ) ) {
+      Kokkos::abort( "Error: check_subview_correctness 3D-4D (LayoutLeft -> LayoutLeft or LayoutRight -> LayoutRight)" );
     }
-  };
-}
+  }
+};
 
-template<class Space, class MemTraits = void>
+} // namespace Impl
+
+template< class Space, class MemTraits = void >
 void test_layoutleft_to_layoutleft() {
   Impl::test_subview_legal_args_left();
 
   {
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> a("A",100,4,3);
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::ALL);
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
 
-    Impl::FillView_3D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_3D< Kokkos::LayoutLeft, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);  
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 0;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
+
   {
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> a("A",100,4,5);
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::pair<int,int>(1,3));
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ) );
 
-    Impl::FillView_3D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_3D<Kokkos::LayoutLeft, Space> fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
+
   {
-    Kokkos::View<int****,Kokkos::LayoutLeft,Space> a("A",100,4,5,3); 
-    Kokkos::View<int***,Kokkos::LayoutLeft,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::pair<int,int>(1,3),1);
+    Kokkos::View< int****, Kokkos::LayoutLeft, Space > a( "A", 100, 4, 5, 3 );
+    Kokkos::View< int***, Kokkos::LayoutLeft, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::pair< int, int >( 1, 3 ), 1 );
 
-    Impl::FillView_4D<Kokkos::LayoutLeft,Space> fill;
+    Impl::FillView_4D< Kokkos::LayoutLeft, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)*a.extent(3)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_4D<Kokkos::LayoutLeft,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutLeft, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 1;
     check.index = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
 }
 
-template<class Space, class MemTraits = void>
+template< class Space, class MemTraits = void >
 void test_layoutright_to_layoutright() {
   Impl::test_subview_legal_args_right();
 
   {
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> a("A",100,4,3);
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> b(a,Kokkos::pair<int,int>(16,32),Kokkos::ALL,Kokkos::ALL);
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > a( "A", 100, 4, 3 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, Kokkos::pair< int, int >( 16, 32 ), Kokkos::ALL, Kokkos::ALL );
 
-    Impl::FillView_3D<Kokkos::LayoutRight,Space> fill;
+    Impl::FillView_3D<Kokkos::LayoutRight, Space> fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_3D<Kokkos::LayoutRight,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_3D< Kokkos::LayoutRight, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 16;
     check.offset_2 = 0;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
-  {
-    Kokkos::View<int****,Kokkos::LayoutRight,Space> a("A",3,4,5,100);
-    Kokkos::View<int***,Kokkos::LayoutRight,Space> b(a,1,Kokkos::pair<int,int>(1,3),Kokkos::ALL,Kokkos::ALL);
 
+  {
+    Kokkos::View< int****, Kokkos::LayoutRight, Space > a( "A", 3, 4, 5, 100 );
+    Kokkos::View< int***, Kokkos::LayoutRight, Space > b( a, 1, Kokkos::pair< int, int >( 1, 3 ), Kokkos::ALL, Kokkos::ALL );
 
-    Impl::FillView_4D<Kokkos::LayoutRight,Space> fill;
+    Impl::FillView_4D< Kokkos::LayoutRight, Space > fill;
     fill.a = a;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,a.extent(0)*a.extent(1)*a.extent(2)*a.extent(3)), fill);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, a.extent( 0 ) * a.extent( 1 ) * a.extent( 2 ) * a.extent( 3 ) ), fill );
 
-    Impl::CheckSubviewCorrectness_3D_4D<Kokkos::LayoutRight,Space,MemTraits> check;
+    Impl::CheckSubviewCorrectness_3D_4D< Kokkos::LayoutRight, Space, MemTraits > check;
     check.a = a;
     check.b = b;
     check.offset_0 = 1;
     check.offset_2 = 0;
     check.index = 1;
-    Kokkos::parallel_for(Kokkos::RangePolicy<typename Space::execution_space>(0,b.extent(0)*b.extent(1)*b.extent(2)), check);
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename Space::execution_space >( 0, b.extent( 0 ) * b.extent( 1 ) * b.extent( 2 ) ), check );
   }
 }
 
-
-}
-//----------------------------------------------------------------------------
-
+} // namespace TestViewSubview
diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp
index f952ab3db51028aff0a0ebfe313b2639e353ab87..4f52fc956707147761dd60354d9cade69b37bb9a 100644
--- a/lib/kokkos/core/unit_test/UnitTestMain.cpp
+++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,15 +36,14 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
 
 #include <gtest/gtest.h>
 
-int main(int argc, char *argv[]) {
-  ::testing::InitGoogleTest(&argc,argv);
+int main( int argc, char *argv[] ) {
+  ::testing::InitGoogleTest( &argc, argv );
   return RUN_ALL_TESTS();
 }
-
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda.hpp b/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
index 36b9b0688ba239ec2f6bf2b847184e95b07f84a3..768b0392048184a4e26c320f16329c07bb8caba5 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda.hpp
@@ -40,31 +40,25 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_CUDA_HPP
 #define KOKKOS_TEST_CUDA_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
-
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
 #include <TestViewSpaceAssign.hpp>
 #include <TestAtomic.hpp>
 #include <TestAtomicOperations.hpp>
-
 #include <TestAtomicViews.hpp>
-
 #include <TestRange.hpp>
 #include <TestTeam.hpp>
 #include <TestReduce.hpp>
@@ -73,20 +67,16 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
 
-// For Some Reason I can only have the definition of SetUp and TearDown in one cpp file ...
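+// SetUpTestCase and TearDownTestCase are defined out of class (non-inline) in this header, so only the one .cpp compiled with TEST_CUDA_INSTANTIATE_SETUP_TEARDOWN defined may emit them; otherwise every including .cpp would define them again.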
 class cuda : public ::testing::Test {
 protected:
   static void SetUpTestCase();
@@ -95,17 +85,19 @@ protected:
 
 #ifdef TEST_CUDA_INSTANTIATE_SETUP_TEARDOWN
 void cuda::SetUpTestCase()
-  {
-    Kokkos::Cuda::print_configuration( std::cout );
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
-  }
+{
+  Kokkos::print_configuration( std::cout );
+  Kokkos::HostSpace::execution_space::initialize();
+  Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( 0 ) );
+}
 
 void cuda::TearDownTestCase()
-  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
-  }
-#endif
+{
+  Kokkos::Cuda::finalize();
+  Kokkos::HostSpace::execution_space::finalize();
 }
 #endif
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
index ff379dc805ddcbadcd4e6b135d03beda683d8d5b..7cf19b26d1b3ebe6a73f2614aab51dda9d9bd88c 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Atomics.cpp
@@ -40,164 +40,164 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , atomics )
+TEST_F( cuda, atomics )
 {
-  const int loop_count = 1e3 ;
+  const int loop_count = 1e3;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Cuda >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Cuda >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Cuda >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Cuda>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Cuda >( 100, 3 ) ) );
 }
 
-TEST_F( cuda , atomic_operations )
+TEST_F( cuda, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Cuda>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Cuda>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Cuda>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Cuda >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Cuda >( start, end -i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Cuda >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Cuda >( start, end - i, 4 ) ) );
   }
 }
 
-TEST_F( cuda , atomic_views_integral )
+TEST_F( cuda, atomic_views_integral )
 {
   const long length = 1000000;
+
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Cuda>(length, 8 ) ) );
+    // Integral types: every call below must return true for `long` on Kokkos::Cuda. NOTE(review): the second argument (1-8) presumably selects the atomic operation under test -- confirm in TestAtomicViews.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Cuda >( length, 8 ) ) );
   }
 }
 
-TEST_F( cuda , atomic_views_nonintegral )
+TEST_F( cuda, atomic_views_nonintegral )
 {
   const long length = 1000000;
-  {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Cuda>(length, 4 ) ) );
 
+  {
+    // Non-integral types: every call below must return true for `double` on Kokkos::Cuda. NOTE(review): the second argument (1-4) presumably selects the atomic operation under test -- confirm in TestAtomicViews.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Cuda >( length, 4 ) ) );
   }
 }
 
-
-TEST_F( cuda , atomic_view_api )
+TEST_F( cuda, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Cuda>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Cuda >(); // No assertion at this level; any checking presumably happens inside TestAtomicViewAPI -- confirm.
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
index aeaa2a0e81d8114d95fed6566891fecf98d2feb2..e655193a51f513dd390a5545aebe66ebb44f2c11 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Other.cpp
@@ -40,63 +40,68 @@
 // ************************************************************************
 //@HEADER
 */
+
 #define TEST_CUDA_INSTANTIATE_SETUP_TEARDOWN
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , init ) {
+TEST_F( cuda, init )
+{ // Body is a single empty statement; presumably this exercises only the `cuda` fixture's set-up/tear-down -- confirm in TestCuda.hpp.
   ;
 }
 
-TEST_F( cuda , md_range ) {
-  TestMDRange_2D< Kokkos::Cuda >::test_for2(100,100);
-
-  TestMDRange_3D< Kokkos::Cuda >::test_for3(100,100,100);
+TEST_F( cuda, mdrange_for ) { // Runs the test_for2 .. test_for6 MDRange tests (2 to 6 arguments each) on Kokkos::Cuda.
+  TestMDRange_2D< Kokkos::Cuda >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Cuda >::test_for3( 100, 100, 100 );
+  TestMDRange_4D< Kokkos::Cuda >::test_for4( 100, 10, 100, 10 );
+  TestMDRange_5D< Kokkos::Cuda >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Cuda >::test_for6( 100, 10, 5, 2, 10, 5 );
 }
 
-TEST_F( cuda, policy_construction) {
+TEST_F( cuda, policy_construction )
+{ // Runs TestRangePolicyConstruction and TestTeamPolicyConstruction instantiated for Kokkos::Cuda.
   TestRangePolicyConstruction< Kokkos::Cuda >();
   TestTeamPolicyConstruction< Kokkos::Cuda >();
 }
 
-TEST_F( cuda , range_tag )
+TEST_F( cuda, range_tag ) // Calls test_for/test_reduce/test_scan with arguments 0 (Static and Dynamic), 2 and 1000 (Static), 3 and 1001 (Dynamic).
 {
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( cuda , compiler_macros )
+TEST_F( cuda, compiler_macros ) // Requires TestCompilerMacros::Test< Kokkos::Cuda >() to return true.
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( cuda , memory_pool )
+TEST_F( cuda, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Cuda >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -110,24 +115,24 @@ TEST_F( cuda , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( cuda , task_fib )
+TEST_F( cuda, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Cuda >::run(i, (i+1)*(i+1)*10000 );
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Cuda >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 ); // Second argument grows quadratically: 10000 at i = 0 up to 6250000 at i = 24.
   }
 }
 
-TEST_F( cuda , task_depend )
+TEST_F( cuda, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Cuda >::run(i);
+  for ( int i = 0; i < 25; ++i ) { // Runs TestTaskDependence once for each i in 0..24.
+    TestTaskScheduler::TestTaskDependence< Kokkos::Cuda >::run( i );
   }
 }
 
-TEST_F( cuda , task_team )
+TEST_F( cuda, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Cuda >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Cuda >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Cuda >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Cuda >::run( 1000 ); // TODO: Currently disabled; put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -135,55 +140,55 @@ TEST_F( cuda , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
-TEST_F( cuda , cxx11 )
+TEST_F( cuda, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Cuda >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Cuda >::value ) { // The checks run only when the default execution space is Kokkos::Cuda; otherwise the test passes without asserting anything.
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Cuda >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( cuda, tile_layout )
 {
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 1, 1 ); // NOTE(review): the two integer template arguments look like tile dimensions and the call arguments like array extents -- confirm in TestTile.
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Cuda, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Cuda, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Cuda, 8, 8 >( 9, 11 );
 }
 
-#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-#if defined (KOKKOS_COMPILER_CLANG)
-TEST_F( cuda , dispatch )
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+#if defined( KOKKOS_COMPILER_CLANG )
+TEST_F( cuda, dispatch ) // Compiled only when both KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA and KOKKOS_COMPILER_CLANG are defined.
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) { // repeat * repeat = 10000 parallel_for calls with an empty lambda; the policy RangePolicy( 0, j ) includes j == 0.
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 #endif
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
index b9ab9fe72d494a672cefe07f770ea38663e2ffec..01eed4e023447acb953c27ce2e8aa2ab18d155a4 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_a.cpp
@@ -40,17 +40,17 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , reducers )
+TEST_F( cuda, reducers ) // Calls execute_integer for int and size_t, execute_float for double, and execute_basic for Kokkos::complex<double>.
 {
-  TestReducers<int, Kokkos::Cuda>::execute_integer();
-  TestReducers<size_t, Kokkos::Cuda>::execute_integer();
-  TestReducers<double, Kokkos::Cuda>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Cuda>::execute_basic();
+  TestReducers< int, Kokkos::Cuda >::execute_integer();
+  TestReducers< size_t, Kokkos::Cuda >::execute_integer();
+  TestReducers< double, Kokkos::Cuda >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Cuda >::execute_basic();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
index c588d752dd21ef2135d1e4fa52c37f5dba0c37a9..7f4e0973e7a512a5e855ba30c9e65e5a539c123d 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Reductions_b.cpp
@@ -40,38 +40,44 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, long_reduce) {
-  TestReduce< long ,   Kokkos::Cuda >( 0 );
-  TestReduce< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce )
+{ // TestReduce for `long` with arguments 0 and 1000000 (presumably the problem size -- confirm in TestReduce).
+  TestReduce< long, Kokkos::Cuda >( 0 );
+  TestReduce< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, double_reduce) {
-  TestReduce< double ,   Kokkos::Cuda >( 0 );
-  TestReduce< double ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, double_reduce )
+{ // TestReduce for `double` with arguments 0 and 1000000 (presumably the problem size -- confirm in TestReduce).
+  TestReduce< double, Kokkos::Cuda >( 0 );
+  TestReduce< double, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce_dynamic )
+{ // TestReduceDynamic for `long` with arguments 0 and 1000000 (presumably the problem size -- confirm in TestReduceDynamic).
+  TestReduceDynamic< long, Kokkos::Cuda >( 0 );
+  TestReduceDynamic< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, double_reduce_dynamic )
+{ // TestReduceDynamic for `double` with arguments 0 and 1000000 (presumably the problem size -- confirm in TestReduceDynamic).
+  TestReduceDynamic< double, Kokkos::Cuda >( 0 );
+  TestReduceDynamic< double, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Cuda >( 1000000 );
+TEST_F( cuda, long_reduce_dynamic_view )
+{ // TestReduceDynamicView for `long` with arguments 0 and 1000000 (presumably the problem size -- confirm in TestReduceDynamicView).
+  TestReduceDynamicView< long, Kokkos::Cuda >( 0 );
+  TestReduceDynamicView< long, Kokkos::Cuda >( 1000000 );
 }
 
-TEST_F( cuda , scan )
+TEST_F( cuda, scan )
 {
-  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Cuda >::test_range( 1, 1000 );
   TestScan< Kokkos::Cuda >( 0 );
   TestScan< Kokkos::Cuda >( 100000 );
   TestScan< Kokkos::Cuda >( 10000000 );
@@ -79,10 +85,11 @@ TEST_F( cuda , scan )
 }
 
 #if 0
-TEST_F( cuda , scan_small )
+TEST_F( cuda, scan_small )
 {
-  typedef TestScan< Kokkos::Cuda , Kokkos::Impl::CudaExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::Cuda, Kokkos::Impl::CudaExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -93,38 +100,39 @@ TEST_F( cuda , scan_small )
 }
 #endif
 
-TEST_F( cuda  , team_scan )
+TEST_F( cuda, team_scan ) // TestScanTeam with arguments 0, 10 and 10000, each under both the Static and the Dynamic schedule.
 {
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( cuda , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( cuda, team_long_reduce )
+{ // TestReduceTeam for `long` with arguments 0, 3 and 100000, each under both the Static and the Dynamic schedule.
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( cuda , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( cuda, team_double_reduce )
+{ // TestReduceTeam for `double` with arguments 0, 3 and 100000, each under both the Static and the Dynamic schedule.
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( cuda , reduction_deduction )
+TEST_F( cuda, reduction_deduction ) // Calls TestCXX11::test_reduction_deduction for Kokkos::Cuda; there is no assertion at this level.
 {
   TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
index f3cbc3b8897625f07f7c4fc810662b68cfe907e9..5bed7640daa114879f789e67807946e0dc2343f4 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@@ -40,6 +40,7 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
@@ -47,353 +48,338 @@ namespace Test {
 __global__
 void test_abort()
 {
-  Kokkos::abort("test_abort");
+  Kokkos::abort( "test_abort" );
 }
 
 __global__
 void test_cuda_spaces_int_value( int * ptr )
 {
-  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
+  if ( *ptr == 42 ) { *ptr = 2 * 42; }
 }
 
-TEST_F( cuda , space_access )
+TEST_F( cuda, space_access )
 {
-  //--------------------------------------
-
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaSpace, Kokkos::HostSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaHostPinnedSpace >::assignable , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaHostPinnedSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::assignable, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace >::assignable , "" );
+    ! Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::assignable, "" );
 
   static_assert(
-    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::MemorySpaceAccess< Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   //--------------------------------------
 
   static_assert(
-    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::HostSpace >::accessible , "" );
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::HostSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaUVMSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda , Kokkos::CudaHostPinnedSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::Cuda, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
-    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaSpace >::accessible , "" );
+    ! Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaUVMSpace >::accessible , "" );
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaUVMSpace >::accessible, "" );
 
   static_assert(
-    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >::accessible , "" );
-
+    Kokkos::Impl::SpaceAccessibility< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >::accessible, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
-                , Kokkos::HostSpace >::value , "" );
+                , Kokkos::HostSpace >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
                 , Kokkos::Device< Kokkos::HostSpace::execution_space
-                                , Kokkos::CudaUVMSpace > >::value , "" );
+                                , Kokkos::CudaUVMSpace > >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
-                , Kokkos::CudaHostPinnedSpace >::value , "" );
+                , Kokkos::CudaHostPinnedSpace >::value, "" );
 
   static_assert(
     std::is_same< Kokkos::Device< Kokkos::HostSpace::execution_space
                                 , Kokkos::CudaUVMSpace >
                 , Kokkos::Device< Kokkos::HostSpace::execution_space
-                                , Kokkos::CudaUVMSpace > >::value , "" );
+                                , Kokkos::CudaUVMSpace > >::value, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::Cuda >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaUVMSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 
   static_assert(
     Kokkos::Impl::SpaceAccessibility
       < Kokkos::Impl::HostMirror< Kokkos::CudaHostPinnedSpace >::Space
       , Kokkos::HostSpace
-      >::accessible , "" );
+      >::accessible, "" );
 }
 
 TEST_F( cuda, uvm )
 {
   if ( Kokkos::CudaUVMSpace::available() ) {
+    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >( "uvm_ptr", sizeof( int ) );
 
-    int * uvm_ptr = (int*) Kokkos::kokkos_malloc< Kokkos::CudaUVMSpace >("uvm_ptr",sizeof(int));
-
-    *uvm_ptr = 42 ;
+    *uvm_ptr = 42;
 
     Kokkos::Cuda::fence();
-    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
+    test_cuda_spaces_int_value<<< 1, 1 >>>( uvm_ptr );
     Kokkos::Cuda::fence();
 
-    EXPECT_EQ( *uvm_ptr, int(2*42) );
-
-    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >(uvm_ptr );
+    EXPECT_EQ( *uvm_ptr, int( 2 * 42 ) );
 
+    Kokkos::kokkos_free< Kokkos::CudaUVMSpace >( uvm_ptr );
   }
 }
 
 TEST_F( cuda, uvm_num_allocs )
 {
-  // The max number of uvm allocations allowed is 65536
+  // The max number of UVM allocations allowed is 65536.
   #define MAX_NUM_ALLOCS 65536
 
   if ( Kokkos::CudaUVMSpace::available() ) {
-
     struct TestMaxUVMAllocs {
 
-      using view_type         = Kokkos::View< double* , Kokkos::CudaUVMSpace >;
-      using view_of_view_type = Kokkos::View< view_type[ MAX_NUM_ALLOCS ] 
+      using view_type         = Kokkos::View< double*, Kokkos::CudaUVMSpace >;
+      using view_of_view_type = Kokkos::View< view_type[ MAX_NUM_ALLOCS ]
                                             , Kokkos::CudaUVMSpace >;
 
-      TestMaxUVMAllocs()
-      : view_allocs_test("view_allocs_test")
+      TestMaxUVMAllocs() : view_allocs_test( "view_allocs_test" )
       {
+        for ( auto i = 0; i < MAX_NUM_ALLOCS; ++i ) {
 
-        for ( auto i = 0; i < MAX_NUM_ALLOCS ; ++i ) {
-
-          // Kokkos will throw a runtime exception if an attempt is made to 
-          // allocate more than the maximum number of uvm allocations
+          // Kokkos will throw a runtime exception if an attempt is made to
+          // allocate more than the maximum number of UVM allocations.
 
           // In this test, the max num of allocs occurs when i = MAX_NUM_ALLOCS - 1
           // since the 'outer' view counts as one UVM allocation, leaving
-          // 65535 possible UVM allocations, that is 'i in [0 , 65535)'
+          // 65535 possible UVM allocations, that is 'i in [0, 65535)'.
 
-          // The test will catch the exception thrown in this case and continue
+          // The test will catch the exception thrown in this case and continue.
 
-          if ( i == ( MAX_NUM_ALLOCS - 1) ) {
-            EXPECT_ANY_THROW( { view_allocs_test(i) = view_type("inner_view",1); } ) ;
+          if ( i == ( MAX_NUM_ALLOCS - 1 ) ) {
+            EXPECT_ANY_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
           }
           else {
-            if(i<MAX_NUM_ALLOCS - 1000) {
-              EXPECT_NO_THROW( { view_allocs_test(i) = view_type("inner_view",1); } ) ;
-            } else { // This might or might not throw depending on compilation options. 
+            if ( i < MAX_NUM_ALLOCS - 1000 ) {
+              EXPECT_NO_THROW( { view_allocs_test( i ) = view_type( "inner_view", 1 ); } );
+            } else { // This might or might not throw depending on compilation options.
               try {
-                view_allocs_test(i) = view_type("inner_view",1);
+                view_allocs_test( i ) = view_type( "inner_view", 1 );
               }
-              catch (...) {}
+              catch ( ... ) {}
             }
           }
 
-        } //end allocation for loop
+        } // End allocation for loop.
 
-        for ( auto i = 0; i < MAX_NUM_ALLOCS -1; ++i ) {
+        for ( auto i = 0; i < MAX_NUM_ALLOCS - 1; ++i ) {
 
-          view_allocs_test(i) = view_type();
+          view_allocs_test( i ) = view_type();
 
-        } //end deallocation for loop
+        } // End deallocation for loop.
 
-        view_allocs_test = view_of_view_type(); // deallocate the view of views
+        view_allocs_test = view_of_view_type(); // Deallocate the view of views.
       }
 
-      // Member
-      view_of_view_type view_allocs_test ;
-    } ;
-
-    // trigger the test via the TestMaxUVMAllocs constructor
-    TestMaxUVMAllocs() ;
+      // Member.
+      view_of_view_type view_allocs_test;
+    };
 
+    // Trigger the test via the TestMaxUVMAllocs constructor.
+    TestMaxUVMAllocs();
   }
-  #undef MAX_NUM_ALLOCS 
+
+  #undef MAX_NUM_ALLOCS
 }
 
-template< class MemSpace , class ExecSpace >
+template< class MemSpace, class ExecSpace >
 struct TestViewCudaAccessible {
-
   enum { N = 1000 };
 
-  using V = Kokkos::View<double*,MemSpace> ;
+  using V = Kokkos::View< double*, MemSpace >;
 
-  V m_base ;
+  V m_base;
 
   struct TagInit {};
   struct TagTest {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_base[i] != i + 1 ) ++error_count ; }
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_base[i] != i + 1 ) ++error_count; }
 
   TestViewCudaAccessible()
-    : m_base("base",N)
+    : m_base( "base", N )
     {}
 
   static void run()
-    {
-      TestViewCudaAccessible self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
-      MemSpace::execution_space::fence();
-      // Next access is a different execution space, must complete prior kernel.
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
+  {
+    TestViewCudaAccessible self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space, TagInit >( 0, N ), self );
+    MemSpace::execution_space::fence();
+
+    // The next access is from a different execution space, so the prior kernel must complete first.
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
 };
 
-TEST_F( cuda , impl_view_accessible )
+TEST_F( cuda, impl_view_accessible )
 {
-  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaSpace, Kokkos::Cuda >::run();
 
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace, Kokkos::HostSpace::execution_space >::run();
 
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
-  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace::execution_space >::run();
 }
 
 template< class MemSpace >
 struct TestViewCudaTexture {
-
   enum { N = 1000 };
 
-  using V = Kokkos::View<double*,MemSpace> ;
-  using T = Kokkos::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
+  using V = Kokkos::View< double*, MemSpace >;
+  using T = Kokkos::View< const double*, MemSpace, Kokkos::MemoryRandomAccess >;
 
-  V m_base ;
-  T m_tex ;
+  V m_base;
+  T m_tex;
 
   struct TagInit {};
   struct TagTest {};
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+  void operator()( const TagInit &, const int i ) const { m_base[i] = i + 1; }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const TagTest & , const int i , long & error_count ) const
-    { if ( m_tex[i] != i + 1 ) ++error_count ; }
+  void operator()( const TagTest &, const int i, long & error_count ) const
+  { if ( m_tex[i] != i + 1 ) ++error_count; }
 
   TestViewCudaTexture()
-    : m_base("base",N)
+    : m_base( "base", N )
     , m_tex( m_base )
     {}
 
   static void run()
-    {
-      EXPECT_TRUE( ( std::is_same< typename V::reference_type
-                                 , double &
-                                 >::value ) );
-
-      EXPECT_TRUE( ( std::is_same< typename T::reference_type
-                                 , const double
-                                 >::value ) );
-
-      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
-      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
-
-      TestViewCudaTexture self ;
-      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
-      long error_count = -1 ;
-      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
-      EXPECT_EQ( error_count , 0 );
-    }
-};
+  {
+    EXPECT_TRUE( ( std::is_same< typename V::reference_type, double & >::value ) );
+    EXPECT_TRUE( ( std::is_same< typename T::reference_type, const double >::value ) );
+
+    EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view.
+    EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value.
 
+    TestViewCudaTexture self;
+    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda, TagInit >( 0, N ), self );
 
-TEST_F( cuda , impl_view_texture )
+    long error_count = -1;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda, TagTest >( 0, N ), self, error_count );
+    EXPECT_EQ( error_count, 0 );
+  }
+};
+
+TEST_F( cuda, impl_view_texture )
 {
   TestViewCudaTexture< Kokkos::CudaSpace >::run();
   TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
index fd8a647ef3f03b9d1109a464a51cd06e90de703d..0aea35db517bdba78967eb8b443cb771aaf2215f 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
+TEST_F( cuda, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_assign_strided ) {
+TEST_F( cuda, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
 }
 
-TEST_F( cuda, view_subview_left_0 ) {
+TEST_F( cuda, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_1 ) {
+TEST_F( cuda, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_2 ) {
+TEST_F( cuda, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_left_3 ) {
+TEST_F( cuda, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_0 ) {
+TEST_F( cuda, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_1 ) {
+TEST_F( cuda, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
 }
 
-TEST_F( cuda, view_subview_right_3 ) {
+TEST_F( cuda, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
index 053fcfc2095c26540ff75e545bb4f920e0a96912..f31f4cbe62bc06bd5fee04abc6a71913c6fbddd9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_layoutleft_to_layoutleft) {
+TEST_F( cuda, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( cuda, view_subview_layoutright_to_layoutright) {
+TEST_F( cuda, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
index 4c5f2ef72fdd45b2b9033d54c3c83e70c3c089c1..0213a196e8612b4d9d3821de6d657803e9e22b6c 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign ) {
+TEST_F( cuda, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
index aee6f1730d6fb33e15877a043fe0ef8beaed11d9..181e1bab2ccb531722b08e627a8ee724fcd393d9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
index 2ef48c686e1d3a202aaf5f017d9ac88cc486085d..708cc1f5ba98fc7eb0f5603524c2b533eb090fee 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
index aec123ac235ef631172b3dc7c26151d2da7e38da..a3db996f8d87d63dd1a21ea74eb83a615a0e7162 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d ) {
+TEST_F( cuda, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
index e8ad2319963b2750e01d518309e84c7423a387d6..2f7cffa75da133039d0624d2d812053774013846 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
index e86b4513fd8b8fdeb85c7bce130b3ae274d5e214..949c6f3e0b9d3055e7da32ace79a810310861d99 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
index ad9dcc0fd1faccf2c8f8ff5e254b82a33f9d998b..3e68277a9e93b447a90a9b3496e0b4d0ccc407e2 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left ) {
+TEST_F( cuda, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
index f97d97e59c205fda791ac1d231b1429e1f8d4ec2..0cd91b7795f52f457f4403559cb353180bcdbe44 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
index 2a07f28f830a125d865eb89a4a456cb5d0aa2b62..cd1c13f7d073f1a445c35ded9eaa9fd121d35fee 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
index 3c51d9420184c91d8ddc1b15e9fb50659c1651d6..22d27535431f7b6414c52305a46547654c40ccbb 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right ) {
+TEST_F( cuda, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
index 835caa7b879891ed4cd0d24bac61bdaf6a686efb..5dc5f87b4e2b7faa2a52163f8b8af732b53000a9 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( cuda, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
index 53bd5eee20205d56ca4356df4f2bb1118e0ff93d..318d8edbbb82eb6dd097b959e07861cf74a77099 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( cuda, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::CudaUVMSpace, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
index e4348319f695da2819e24143754777746bdc35d6..a2158f06c73db10193e1275c5d49c99738b0c06b 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<cuda/TestCuda_SubView_c01.cpp>
-#include<cuda/TestCuda_SubView_c02.cpp>
-#include<cuda/TestCuda_SubView_c03.cpp>
-#include<cuda/TestCuda_SubView_c04.cpp>
-#include<cuda/TestCuda_SubView_c05.cpp>
-#include<cuda/TestCuda_SubView_c06.cpp>
-#include<cuda/TestCuda_SubView_c07.cpp>
-#include<cuda/TestCuda_SubView_c08.cpp>
-#include<cuda/TestCuda_SubView_c09.cpp>
-#include<cuda/TestCuda_SubView_c10.cpp>
-#include<cuda/TestCuda_SubView_c11.cpp>
-#include<cuda/TestCuda_SubView_c12.cpp>
+#include <cuda/TestCuda_SubView_c01.cpp>
+#include <cuda/TestCuda_SubView_c02.cpp>
+#include <cuda/TestCuda_SubView_c03.cpp>
+#include <cuda/TestCuda_SubView_c04.cpp>
+#include <cuda/TestCuda_SubView_c05.cpp>
+#include <cuda/TestCuda_SubView_c06.cpp>
+#include <cuda/TestCuda_SubView_c07.cpp>
+#include <cuda/TestCuda_SubView_c08.cpp>
+#include <cuda/TestCuda_SubView_c09.cpp>
+#include <cuda/TestCuda_SubView_c10.cpp>
+#include <cuda/TestCuda_SubView_c11.cpp>
+#include <cuda/TestCuda_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
index 13834d09ad03854d1ac1ae17c7e8a159efa55ca7..8d9b9328ba9691fe90947554aeb9e9825322d55a 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Team.cpp
@@ -40,81 +40,87 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , team_tag )
+TEST_F( cuda, team_tag )
 {
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( cuda , team_shared_request) {
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( cuda, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-//THis Tests request to much L0 scratch
-//TEST_F( cuda, team_scratch_request) {
-//  TestScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-//  TestScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+// This test requests too much L0 scratch.
+//TEST_F( cuda, team_scratch_request )
+//{
+//  TestScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+//  TestScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 //}
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( cuda , team_lambda_shared_request) {
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( cuda, team_lambda_shared_request )
+{
   TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static>  >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
   TestLambdaSharedTeam< Kokkos::CudaSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
   TestLambdaSharedTeam< Kokkos::CudaUVMSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
-  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic>  >();
+  TestLambdaSharedTeam< Kokkos::CudaHostPinnedSpace, Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( cuda, shmem_size) {
+TEST_F( cuda, shmem_size )
+{
   TestShmemSize< Kokkos::Cuda >();
 }
 
-TEST_F( cuda, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Cuda , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( cuda, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Cuda, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( cuda , team_vector )
+#if !defined(KOKKOS_CUDA_CLANG_WORKAROUND) && !defined(KOKKOS_ARCH_PASCAL)
+TEST_F( cuda, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >( 10 ) ) );
 }
+#endif
 
 TEST_F( cuda, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Cuda >( 8192, 2048, 16, 16 );
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
index c01ca1c1463c6573c8d9e51c0ca31ed43c19941e..be0c4c5715eeba492112e9a83dbc3cba09796d98 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_a.cpp
@@ -40,20 +40,21 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_a ) {
+TEST_F( cuda, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::CudaSpace >();
   test_view_mapping_operator< Kokkos::CudaSpace >();
 }
 
-TEST_F( cuda , view_of_class )
+TEST_F( cuda, view_of_class )
 {
   TestViewMappingClassValue< Kokkos::CudaSpace >::run();
   TestViewMappingClassValue< Kokkos::CudaUVMSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
index 8e821ada000678c762b22db574dd1e0d816bbd54..b4d8e5d953f8e753eac945560fac763589bd2025 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_b.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_d ) {
+TEST_F( cuda, impl_view_mapping_d )
+{
   test_view_mapping< Kokkos::CudaHostPinnedSpace >();
   test_view_mapping_operator< Kokkos::CudaHostPinnedSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
index cf29a68e96586dc5d194bd0b28338259784dceb0..e4e6894c5346b6283371903bc2e1bdea18c5f399 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_c.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_view_mapping_c ) {
+TEST_F( cuda, impl_view_mapping_c )
+{
   test_view_mapping< Kokkos::CudaUVMSpace >();
   test_view_mapping_operator< Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
index db14b5158f6efa01a6397df98041827a830158d4..82a3dd83e88c3b047525771a5dd9deca32d6d891 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_d.cpp
@@ -40,73 +40,77 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , view_nested_view )
+TEST_F( cuda, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Cuda >();
 }
 
-
-
-TEST_F( cuda , view_remap )
+TEST_F( cuda, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
 
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::CudaUVMSpace > output_type ;
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::CudaUVMSpace > output_type;
 
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::CudaUVMSpace > input_type ;
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::CudaUVMSpace > input_type;
 
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::CudaUVMSpace > diff_type ;
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::CudaUVMSpace > diff_type;
 
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
 
   Kokkos::fence();
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
   Kokkos::fence();
 
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
- 
+  // Kokkos::deep_copy( diff, input ); // Throws with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
   Kokkos::fence();
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
+
   Kokkos::fence();
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( cuda , view_aggregate )
+TEST_F( cuda, view_aggregate )
 {
   TestViewAggregate< Kokkos::Cuda >();
 }
 
-TEST_F( cuda , template_meta_functions )
+TEST_F( cuda, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
+  TestTemplateMetaFunctions< int, Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
index 07d425647330228815a7103e6f7596a8a2f2a460..27450fa6ff827dbbe6970331eca68589a423c406 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_e.cpp
@@ -40,17 +40,20 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
-  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
+TEST_F( cuda, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::CudaSpace, Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaUVMSpace, Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaHostPinnedSpace, Kokkos::HostSpace::execution_space >();
 }
 
-TEST_F( cuda , impl_view_mapping_b ) {
+TEST_F( cuda, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::CudaSpace >();
   test_view_mapping_subview< Kokkos::CudaUVMSpace >();
   test_view_mapping_subview< Kokkos::CudaHostPinnedSpace >();
@@ -59,5 +62,4 @@ TEST_F( cuda , impl_view_mapping_b ) {
   TestViewMappingAtomic< Kokkos::CudaHostPinnedSpace >::run();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
index 34721f02dc73f418ba7c348fe65c3a59d534dc7c..56524111aec939d0ff2b80196b5352a44f6919dd 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_f.cpp
@@ -40,16 +40,17 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_a) {
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
-  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
+TEST_F( cuda, view_api_a )
+{
+  typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_texture_managed;
+  typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess | Kokkos::Unmanaged> > view_texture_unmanaged;
 
-  TestViewAPI< double , Kokkos::Cuda >();
+  TestViewAPI< double, Kokkos::Cuda >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
index abbcf3bf8bfa6d89ff5c5a5891d8cd16018becf0..d5fd24456d782409450fcf949d6c6280504bb785 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_g.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_b) {
-  TestViewAPI< double , Kokkos::CudaUVMSpace >();
+TEST_F( cuda, view_api_b )
+{
+  TestViewAPI< double, Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
index 9899642035ada183fe7b7b5c4a60610e3c271739..649023e4afcaf921511edab82cc10035776246ae 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_h.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda, view_api_c) {
-  TestViewAPI< double , Kokkos::CudaHostPinnedSpace >();
+TEST_F( cuda, view_api_c )
+{
+  TestViewAPI< double, Kokkos::CudaHostPinnedSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
index 9bc09ba893affeec45923883b62751534a7e86dc..b46b1e5f8173bd724c0333de776366704c23f152 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_ViewAPI_s.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <cuda/TestCuda.hpp>
 
 namespace Test {
 
-TEST_F( cuda , view_space_assign ) {
-  view_space_assign< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >();
-  view_space_assign< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >();
+TEST_F( cuda, view_space_assign )
+{
+  view_space_assign< Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace >();
+  view_space_assign< Kokkos::CudaSpace, Kokkos::CudaUVMSpace >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
index 28ae5b41b039a385db047de37c5a0d1865a1ee1b..ed9bb68cd60a004c214ec473ae35653f61c6a814 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_OPENMP_HPP
 #define KOKKOS_TEST_OPENMP_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,13 +56,8 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
@@ -74,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -95,23 +89,24 @@ protected:
     const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
     const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
 
-    const unsigned threads_count = std::max( 1u , numa_count ) *
-                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+    const unsigned threads_count = std::max( 1u, numa_count ) *
+                                   std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 );
 
     Kokkos::OpenMP::initialize( threads_count );
-    Kokkos::OpenMP::print_configuration( std::cout , true );
-    srand(10231);
+    Kokkos::print_configuration( std::cout, true );
+    srand( 10231 );
   }
 
   static void TearDownTestCase()
   {
     Kokkos::OpenMP::finalize();
 
-    omp_set_num_threads(1);
+    omp_set_num_threads( 1 );
 
-    ASSERT_EQ( 1 , omp_get_max_threads() );
+    ASSERT_EQ( 1, omp_get_max_threads() );
   }
 };
 
-}
+} // namespace Test
+
 #endif
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
index ed6c9f8d1696c9c653c82f52b14a8a73520b7735..2585c01973b3aeba5fd00f27068c361b15552800 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Atomics.cpp
@@ -40,165 +40,162 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , atomics )
+TEST_F( openmp, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::OpenMP >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::OpenMP >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::OpenMP >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::OpenMP>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::OpenMP >( 100, 3 ) ) );
 }
 
-TEST_F( openmp , atomic_operations )
+TEST_F( openmp, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::OpenMP>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::OpenMP>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::OpenMP>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::OpenMP >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::OpenMP >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::OpenMP >( start, end - i, 4 ) ) );
   }
-
 }
 
-
-TEST_F( openmp , atomic_views_integral )
+TEST_F( openmp, atomic_views_integral )
 {
   const long length = 1000000;
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::OpenMP>(length, 8 ) ) );
-
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::OpenMP >( length, 8 ) ) );
   }
 }
 
-TEST_F( openmp , atomic_views_nonintegral )
+TEST_F( openmp, atomic_views_nonintegral )
 {
   const long length = 1000000;
   {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::OpenMP>(length, 4 ) ) );
-
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<  double, Kokkos::OpenMP >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<  double, Kokkos::OpenMP >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<  double, Kokkos::OpenMP >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<  double, Kokkos::OpenMP >( length, 4 ) ) );
   }
 }
 
-TEST_F( openmp , atomic_view_api )
+TEST_F( openmp, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::OpenMP>();
+  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
index 126d730f0ff96272ae1e21eb5f8f81523fda8f02..b4f32dac706222e2c1f79f43469eadb4f5e3e6c6 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Other.cpp
@@ -40,65 +40,90 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , init ) {
+TEST_F( openmp, init )
+{
   ;
 }
 
-TEST_F( openmp , md_range ) {
-  TestMDRange_2D< Kokkos::OpenMP >::test_for2(100,100);
+TEST_F( openmp, mdrange_for )
+{
+  Kokkos::Timer timer;
+  TestMDRange_2D< Kokkos::OpenMP >::test_for2( 10000, 1000 );
+  std::cout << " 2D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_3D< Kokkos::OpenMP >::test_for3( 100, 100, 1000 );
+  std::cout << " 3D: " << timer.seconds() << std::endl;
 
-  TestMDRange_3D< Kokkos::OpenMP >::test_for3(100,100,100);
+  timer.reset();
+  TestMDRange_4D< Kokkos::OpenMP >::test_for4( 100, 10, 100, 100 );
+  std::cout << " 4D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_5D< Kokkos::OpenMP >::test_for5( 100, 10, 10, 100, 50 );
+  std::cout << " 5D: " << timer.seconds() << std::endl;
+
+  timer.reset();
+  TestMDRange_6D< Kokkos::OpenMP >::test_for6( 10, 10, 10, 10, 50, 50 );
+  std::cout << " 6D: " << timer.seconds() << std::endl;
 }
 
-TEST_F( openmp, policy_construction) {
+TEST_F( openmp, mdrange_reduce )
+{
+  TestMDRange_2D< Kokkos::OpenMP >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::OpenMP >::test_reduce3( 100, 10, 100 );
+}
+
+TEST_F( openmp, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::OpenMP >();
   TestTeamPolicyConstruction< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , range_tag )
+TEST_F( openmp, range_tag )
 {
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(0);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(3);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( openmp , compiler_macros )
+TEST_F( openmp, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::OpenMP >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( openmp , memory_pool )
+TEST_F( openmp, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::OpenMP >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -112,24 +137,24 @@ TEST_F( openmp , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( openmp , task_fib )
+TEST_F( openmp, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::OpenMP >::run(i, (i+1)*(i+1)*10000 );
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::OpenMP >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 );
   }
 }
 
-TEST_F( openmp , task_depend )
+TEST_F( openmp, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::OpenMP >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::OpenMP >::run( i );
   }
 }
 
-TEST_F( openmp , task_team )
+TEST_F( openmp, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::OpenMP >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::OpenMP >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::OpenMP >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::OpenMP >::run( 1000 ); // Put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -137,53 +162,51 @@ TEST_F( openmp , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
-TEST_F( openmp , cxx11 )
+TEST_F( openmp, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::OpenMP >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::OpenMP >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( openmp, tile_layout )
 {
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::OpenMP , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::OpenMP , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::OpenMP , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::OpenMP, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::OpenMP, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::OpenMP, 8, 8 >( 9, 11 );
 }
 
-
-TEST_F( openmp , dispatch )
+TEST_F( openmp, dispatch )
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
index d41e1493eea6306d68087d1a8562ab963e1ec039..22c29308a6289361bfa0b62d47e579e4bb1e29c2 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Reductions.cpp
@@ -40,46 +40,52 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, long_reduce) {
-  TestReduce< long ,   Kokkos::OpenMP >( 0 );
-  TestReduce< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce )
+{
+  TestReduce< long, Kokkos::OpenMP >( 0 );
+  TestReduce< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, double_reduce) {
-  TestReduce< double ,   Kokkos::OpenMP >( 0 );
-  TestReduce< double ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, double_reduce )
+{
+  TestReduce< double, Kokkos::OpenMP >( 0 );
+  TestReduce< double, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp , reducers )
+TEST_F( openmp, reducers )
 {
-  TestReducers<int, Kokkos::OpenMP>::execute_integer();
-  TestReducers<size_t, Kokkos::OpenMP>::execute_integer();
-  TestReducers<double, Kokkos::OpenMP>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::OpenMP>::execute_basic();
+  TestReducers< int, Kokkos::OpenMP >::execute_integer();
+  TestReducers< size_t, Kokkos::OpenMP >::execute_integer();
+  TestReducers< double, Kokkos::OpenMP >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::OpenMP >::execute_basic();
 }
 
-TEST_F( openmp, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamic< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::OpenMP >( 0 );
+  TestReduceDynamic< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamic< double ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::OpenMP >( 0 );
+  TestReduceDynamic< double, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 1000000 );
+TEST_F( openmp, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::OpenMP >( 0 );
+  TestReduceDynamicView< long, Kokkos::OpenMP >( 1000000 );
 }
 
-TEST_F( openmp , scan )
+TEST_F( openmp, scan )
 {
-  TestScan< Kokkos::OpenMP >::test_range( 1 , 1000 );
+  TestScan< Kokkos::OpenMP >::test_range( 1, 1000 );
   TestScan< Kokkos::OpenMP >( 0 );
   TestScan< Kokkos::OpenMP >( 100000 );
   TestScan< Kokkos::OpenMP >( 10000000 );
@@ -87,10 +93,11 @@ TEST_F( openmp , scan )
 }
 
 #if 0
-TEST_F( openmp , scan_small )
+TEST_F( openmp, scan_small )
 {
-  typedef TestScan< Kokkos::OpenMP , Kokkos::Impl::OpenMPExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::OpenMP, Kokkos::Impl::OpenMPExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -101,38 +108,39 @@ TEST_F( openmp , scan_small )
 }
 #endif
 
-TEST_F( openmp  , team_scan )
+TEST_F( openmp, team_scan )
 {
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( openmp , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( openmp, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( openmp , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( openmp, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( openmp , reduction_deduction )
+TEST_F( openmp, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
index 9854417e42da5a8bdd6986b85fbdd754bab3e57b..fefae073227a7086bb440152b76abf16dc9c00b2 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::OpenMP >();
+TEST_F( openmp, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_assign_strided ) {
+TEST_F( openmp, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_0 ) {
+TEST_F( openmp, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_1 ) {
+TEST_F( openmp, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_2 ) {
+TEST_F( openmp, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_left_3 ) {
+TEST_F( openmp, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_0 ) {
+TEST_F( openmp, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_1 ) {
+TEST_F( openmp, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, view_subview_right_3 ) {
+TEST_F( openmp, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
index 2aa1fc5c633ffab0319c37c7a00a9abe48438597..7de7ca91bdc082057bccc1b71ec8f482a16bc0f9 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_layoutleft_to_layoutleft) {
+TEST_F( openmp, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( openmp, view_subview_layoutright_to_layoutright) {
+TEST_F( openmp, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
index 1a6871cfca8f3136b13011f66576cd7a9d891978..d727ec0ee592c57d357b8cfebfa83a9bcc06eb12 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign ) {
+TEST_F( openmp, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
index b04edbb997d564a2e921bacf7b36959b17e8755f..df43f555d385037dafe3a29b9cec66ef2eb9b781 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
index 765e235830db2f7e48ad8fe9df271429fef2c2ab..38f241ebf7bdea50af2f8a0b06dd69b16175667c 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
index 9d8b62708a3d4d898ddbc923b733c78c869c2826..11a4ea8ac24bf457f9d4fbe97b5180536d1fac69 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d ) {
+TEST_F( openmp, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
index 9c19cf0e57dcf7058f4f0aeb4752465c470e9fa9..a91baa34df3f0fc41db37909fdcdbeefc27a3158 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
index c1bdf72351b02958f5e1e857c41f7e5d999ade64..20d4d9bd64462eaa9d90a5d776c7129a7a816312 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
index 08a3b5a54a2c66599ebc61384357324a79815507..528df1c0700d7582f427310d8f7610376f9166bb 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left ) {
+TEST_F( openmp, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
index 0864ebbdaa44b1bd00a154fe2f7fcf4b55ae48eb..d9eea8dba91a7c03cdfd8460b2241438ffbbce1d 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
index e38dfecbf6e353bcab69f7341d2754ea6ef85cf9..f909dc33c067ca4ff6c3badeddf92c6bb12a2bd6 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
index b7e4683d23d18bb838c97a1fa198b2d38874de77..59996d5e33b594a23c7e368354208c68707339e9 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right ) {
+TEST_F( openmp, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
index fc3e66fd4853c6104503aaf461eda97183cb44e1..3f9c215d9b10dbbeb3aada555515ab27c1e38adb 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( openmp, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
index e21a13ee579e5052241252ffa6b99ba49f9c6b47..d3a73483a0bc11c4d60eb4d6d658c00fde838566 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( openmp, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::OpenMP, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
index 9da159ab5773a0a7b1a49605cf1a88294a29d09d..399c6e92e4c7cf858ecef02a97e1bf4742ec6eda 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<openmp/TestOpenMP_SubView_c01.cpp>
-#include<openmp/TestOpenMP_SubView_c02.cpp>
-#include<openmp/TestOpenMP_SubView_c03.cpp>
-#include<openmp/TestOpenMP_SubView_c04.cpp>
-#include<openmp/TestOpenMP_SubView_c05.cpp>
-#include<openmp/TestOpenMP_SubView_c06.cpp>
-#include<openmp/TestOpenMP_SubView_c07.cpp>
-#include<openmp/TestOpenMP_SubView_c08.cpp>
-#include<openmp/TestOpenMP_SubView_c09.cpp>
-#include<openmp/TestOpenMP_SubView_c10.cpp>
-#include<openmp/TestOpenMP_SubView_c11.cpp>
-#include<openmp/TestOpenMP_SubView_c12.cpp>
+#include <openmp/TestOpenMP_SubView_c01.cpp>
+#include <openmp/TestOpenMP_SubView_c02.cpp>
+#include <openmp/TestOpenMP_SubView_c03.cpp>
+#include <openmp/TestOpenMP_SubView_c04.cpp>
+#include <openmp/TestOpenMP_SubView_c05.cpp>
+#include <openmp/TestOpenMP_SubView_c06.cpp>
+#include <openmp/TestOpenMP_SubView_c07.cpp>
+#include <openmp/TestOpenMP_SubView_c08.cpp>
+#include <openmp/TestOpenMP_SubView_c09.cpp>
+#include <openmp/TestOpenMP_SubView_c10.cpp>
+#include <openmp/TestOpenMP_SubView_c11.cpp>
+#include <openmp/TestOpenMP_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
index 38cf0a0f409c8dbe5d923cae4b88bec619a5a8b0..216789e8bf6ebcd1d2deab1e567317376c611e0b 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_Team.cpp
@@ -40,67 +40,73 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , team_tag )
+TEST_F( openmp, team_tag )
 {
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( openmp , team_shared_request) {
-  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, team_shared_request )
+{
+  TestSharedTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( openmp, team_scratch_request) {
-  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( openmp , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( openmp, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( openmp, shmem_size) {
+TEST_F( openmp, shmem_size )
+{
   TestShmemSize< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::OpenMP , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( openmp, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::OpenMP, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( openmp , team_vector )
+TEST_F( openmp, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -112,11 +118,10 @@ TEST_F( openmp , team_vector )
 #ifndef SKIP_TEST
 TEST_F( openmp, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::OpenMP >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
index 82cbf3ea18ecf7c3c424c73fe3e41ebf4a4e0c26..aead381a11e5b5a88763d9622deac55c3ceaf631 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , impl_view_mapping_a ) {
+TEST_F( openmp, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::OpenMP >();
   test_view_mapping_operator< Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
index b2d4f87fdd417ab2d1036884dcce4b0df5793396..c802fb79caf081b103c6e65bf54d8e20fe3b7193 100644
--- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <openmp/TestOpenMP.hpp>
 
 namespace Test {
 
-TEST_F( openmp , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >();
+TEST_F( openmp, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , impl_view_mapping_b ) {
+TEST_F( openmp, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::OpenMP >();
   TestViewMappingAtomic< Kokkos::OpenMP >::run();
 }
 
-TEST_F( openmp, view_api) {
-  TestViewAPI< double , Kokkos::OpenMP >();
+TEST_F( openmp, view_api )
+{
+  TestViewAPI< double, Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , view_nested_view )
+TEST_F( openmp, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::OpenMP >();
 }
 
-
-
-TEST_F( openmp , view_remap )
+TEST_F( openmp, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::OpenMP > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::OpenMP > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::OpenMP > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::OpenMP > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::OpenMP > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::OpenMP > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( openmp , view_aggregate )
+TEST_F( openmp, view_aggregate )
 {
   TestViewAggregate< Kokkos::OpenMP >();
 }
 
-TEST_F( openmp , template_meta_functions )
+TEST_F( openmp, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::OpenMP >();
+  TestTemplateMetaFunctions< int, Kokkos::OpenMP >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..907fe23ea5e7c6b11a52c6327787ddee0108f89e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads.hpp
@@ -0,0 +1,109 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_QTHREADS_HPP
+#define KOKKOS_TEST_QTHREADS_HPP
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#ifdef KOKKOS_LAMBDA
+#undef KOKKOS_LAMBDA
+#endif
+#define KOKKOS_LAMBDA [=]
+
+#include <Kokkos_Core.hpp>
+
+#include <TestTile.hpp>
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestAtomicOperations.hpp>
+#include <TestAtomicViews.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskScheduler.hpp>
+#include <TestMemoryPool.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestPolicyConstruction.hpp>
+#include <TestMDRange.hpp>
+
+namespace Test {
+
+class qthreads : public ::testing::Test { // GoogleTest fixture used by the TEST_F( qthreads, ... ) cases.
+protected:
+  static void SetUpTestCase() // Sizes and initializes the Qthreads backend, prints the configuration, seeds rand().
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    const unsigned threads_count = std::max( 1u, numa_count ) * // Clamp to at least one NUMA region...
+                                   std::max( 2u, ( cores_per_numa * threads_per_core ) / 2 ); // ...times half of cores*threads per region, but never fewer than two.
+
+    Kokkos::Qthreads::initialize( threads_count );
+    Kokkos::print_configuration( std::cout, true );
+
+    srand( 10231 ); // Constant seed: rand() yields the same sequence on every run.
+  }
+
+  static void TearDownTestCase() // Finalizes the Qthreads backend initialized in SetUpTestCase().
+  {
+    Kokkos::Qthreads::finalize();
+  }
+};
+
+} // namespace Test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e64c3305db616b09c24c2b47d64c9153e3aeb0df
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Atomics.cpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, atomics )
+{
+#if 0 // Whole body is compiled out, so this case currently passes without running any check; reason not stated here.
+  const int loop_count = 1e4; // 1e4 is a double literal, implicitly converted to int 10000.
+
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Qthreads >( loop_count, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 1 ) ) ); // From here on a literal 100 replaces loop_count.
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Qthreads >( 100, 3 ) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Qthreads >( 100, 3 ) ) );
+#endif
+}
+
+TEST_F( qthreads, atomic_operations )
+{
+#if 0 // Whole body is compiled out; the case is registered but checks nothing.
+  const int start = 1; // Avoid zero for division.
+  const int end = 11;
+
+  for ( int i = start; i < end; ++i ) // i runs 1..10, so the second argument ( end - i ) runs 10 down to 1.
+  {
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 11 ) ) ); // NOTE(review): last argument jumps 9 -> 11 in every integral group; confirm 10 is skipped on purpose.
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Qthreads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 1 ) ) ); // Non-integral groups only use last-argument values 1..4.
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Qthreads >( start, end - i, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_integral )
+{
+#if 0 // Body compiled out; nothing below runs until this guard is removed.
+  const long length = 1000000;
+
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 1 ) ) ); // Same length for every call; only the second argument varies (1..8).
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Qthreads >( length, 8 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_views_nonintegral )
+{
+#if 0 // Body compiled out; nothing below runs until this guard is removed.
+  const long length = 1000000;
+
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 1 ) ) ); // Only double is covered; second argument varies 1..4.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Qthreads >( length, 4 ) ) );
+  }
+#endif
+}
+
+TEST_F( qthreads, atomic_view_api )
+{
+#if 0 // Compiled out: the single call below is not built, so this case is an empty pass.
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0faec84056997dd0d1236ff8c00f2218b2549cf9
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Other.cpp
@@ -0,0 +1,213 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, init )
+{
+  ; // Empty statement: the body itself does nothing; presumably only the fixture set-up is being exercised — confirm.
+}
+
+TEST_F( qthreads, md_range )
+{
+#if 0 // Compiled out: neither the 2D nor the 3D call below is built.
+  TestMDRange_2D< Kokkos::Qthreads >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Qthreads >::test_for3( 100, 100, 100 );
+#endif
+}
+
+TEST_F( qthreads, policy_construction )
+{
+#if 0 // Compiled out: both policy-construction calls below are skipped at build time.
+  TestRangePolicyConstruction< Kokkos::Qthreads >();
+  TestTeamPolicyConstruction< Kokkos::Qthreads >();
+#endif
+}
+
+TEST_F( qthreads, range_tag )
+{
+#if 0 // Compiled out: the whole Static/Dynamic schedule matrix below is not built.
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 ); // Argument 0 for both schedules first.
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 ); // Small arguments: 2 for Static, 3 for Dynamic.
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 ); // Larger arguments: 1000 for Static, 1001 for Dynamic.
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 ); // NOTE(review): 1000 here vs 1001 on the three Dynamic calls above — confirm intentional.
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, compiler_macros )
+{
+#if 0 // Compiled out: the assertion below is not built, so this case passes vacuously.
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthreads >() ) );
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthreads, memory_pool )
+{
+#if 0 // Compiled out: none of the three memory-pool calls below is built.
+  bool val = TestMemoryPool::test_mempool< Kokkos::Qthreads >( 128, 128000000 ); // Only this call's result is asserted on.
+  ASSERT_TRUE( val );
+
+  TestMemoryPool::test_mempool2< Kokkos::Qthreads >( 64, 4, 1000000, 2000000 );
+
+  TestMemoryPool::test_memory_exhaustion< Kokkos::Qthreads >();
+#endif
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_TASKDAG )
+
+TEST_F( qthreads, task_fib )
+{
+#if 0 // Compiled out, independently of the surrounding KOKKOS_ENABLE_TASKDAG guard.
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Qthreads >::run( i, ( i + 1 ) * ( i + 1 ) * 10000 ); // Second argument grows as (i+1)^2 * 10000; its meaning is defined in TestTaskScheduler.hpp.
+  }
+#endif
+}
+
+TEST_F( qthreads, task_depend )
+{
+#if 0 // Compiled out, independently of the surrounding KOKKOS_ENABLE_TASKDAG guard.
+  for ( int i = 0; i < 25; ++i ) { // Would call run() with i = 0..24.
+    TestTaskScheduler::TestTaskDependence< Kokkos::Qthreads >::run( i );
+  }
+#endif
+}
+
+TEST_F( qthreads, task_team )
+{
+#if 0 // Compiled out, independently of the surrounding KOKKOS_ENABLE_TASKDAG guard.
+  TestTaskScheduler::TestTaskTeam< Kokkos::Qthreads >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Qthreads >::run( 1000 ); // Put back after testing.
+#endif
+}
+
+#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
+
+TEST_F( qthreads, cxx11 )
+{
+#if 0 // Compiled out: the runtime check and the four assertions below are not built.
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Qthreads >::value ) { // Runtime guard on top of the enclosing KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS preprocessor guard.
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Qthreads >( 4 ) ) );
+  }
+#endif
+}
+
+#endif
+
+TEST_F( qthreads, tile_layout )
+{
+#if 0 // Compiled out: none of the tile-layout calls below is built.
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 1, 1 ); // Groups are ordered by the two integer template arguments: 1x1, 2x2, 2x4 / 4x2, 4x4, 8x8.
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Qthreads, 8, 8 >( 9, 11 );
+#endif
+}
+
+TEST_F( qthreads, dispatch )
+{
+#if 0 // Compiled out: the nested dispatch loop below is not built.
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) { // Outer loop only repeats the inner sweep; i is not used in the call.
+    for ( int j = 0; j < repeat; ++j ) { // Range [0, j) grows from empty (j == 0) up to 99 iterations.
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} ); // Functor body is empty, so each call is a no-op kernel launch.
+    }
+  }
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2470ac15c45431e852981a94f792bb2710535d7
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Reductions.cpp
@@ -0,0 +1,168 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, long_reduce ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduce< long, Kokkos::Qthreads > with arguments 0 and 1000000 is compiled out.
+  TestReduce< long, Kokkos::Qthreads >( 0 );
+  TestReduce< long, Kokkos::Qthreads >( 1000000 );
+#endif // 0
+}
+
+TEST_F( qthreads, double_reduce ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduce< double, Kokkos::Qthreads > with arguments 0 and 1000000 is compiled out.
+  TestReduce< double, Kokkos::Qthreads >( 0 );
+  TestReduce< double, Kokkos::Qthreads >( 1000000 );
+#endif // 0
+}
+
+TEST_F( qthreads, reducers ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReducers runs for int, size_t, double and Kokkos::complex<double> are compiled out.
+  TestReducers< int, Kokkos::Qthreads >::execute_integer();
+  TestReducers< size_t, Kokkos::Qthreads >::execute_integer();
+  TestReducers< double, Kokkos::Qthreads >::execute_float();
+  TestReducers< Kokkos::complex<double >, Kokkos::Qthreads>::execute_basic();
+#endif // 0
+}
+
+TEST_F( qthreads, long_reduce_dynamic ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduceDynamic< long, Kokkos::Qthreads > with arguments 0 and 1000000 is compiled out.
+  TestReduceDynamic< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< long, Kokkos::Qthreads >( 1000000 );
+#endif // 0
+}
+
+TEST_F( qthreads, double_reduce_dynamic ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduceDynamic< double, Kokkos::Qthreads > with arguments 0 and 1000000 is compiled out.
+  TestReduceDynamic< double, Kokkos::Qthreads >( 0 );
+  TestReduceDynamic< double, Kokkos::Qthreads >( 1000000 );
+#endif // 0
+}
+
+TEST_F( qthreads, long_reduce_dynamic_view ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduceDynamicView< long, Kokkos::Qthreads > with arguments 0 and 1000000 is compiled out.
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 0 );
+  TestReduceDynamicView< long, Kokkos::Qthreads >( 1000000 );
+#endif // 0
+}
+
+TEST_F( qthreads, scan ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the TestScan calls and the trailing Kokkos::Qthreads::fence() are compiled out.
+  TestScan< Kokkos::Qthreads >::test_range( 1, 1000 );
+  TestScan< Kokkos::Qthreads >( 0 );
+  TestScan< Kokkos::Qthreads >( 100000 );
+  TestScan< Kokkos::Qthreads >( 10000000 );
+  Kokkos::Qthreads::fence();
+#endif // 0
+}
+
+TEST_F( qthreads, scan_small ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the TestScan runs using QthreadsExecUseScanSmall and the trailing fence are compiled out.
+  typedef TestScan< Kokkos::Qthreads, Kokkos::Impl::QthreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 );
+  TestScanFunctor( 10000000 );
+
+  Kokkos::Qthreads::fence();
+#endif // 0
+}
+
+TEST_F( qthreads, team_scan ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestScanTeam with Static and Dynamic schedules (arguments 0, 10, 10000) is compiled out.
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+#endif // 0
+}
+
+TEST_F( qthreads, team_long_reduce ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduceTeam< long, ... > with Static and Dynamic schedules (arguments 0, 3, 100000) is compiled out.
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif // 0
+}
+
+TEST_F( qthreads, team_double_reduce ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: TestReduceTeam< double, ... > with Static and Dynamic schedules (arguments 0, 3, 100000) is compiled out.
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+#endif // 0
+}
+
+TEST_F( qthreads, reduction_deduction ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the TestCXX11::test_reduction_deduction< Kokkos::Qthreads >() call is compiled out.
+  TestCXX11::test_reduction_deduction< Kokkos::Qthreads >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab873359a748e6086533454f7a0842a5e8dee9e6
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_a.cpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_auto_1d_left ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_auto_1d with Kokkos::LayoutLeft is compiled out.
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_auto_1d_right ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_auto_1d with Kokkos::LayoutRight is compiled out.
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_auto_1d_stride ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_auto_1d with Kokkos::LayoutStride is compiled out.
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_assign_strided ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_1d_strided_assignment< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_left_0 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_left_0< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_left_0< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_left_1 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_left_1< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_left_1< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_left_2 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_left_2< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_left_2< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_left_3 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_left_3< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_left_3< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_right_0 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_right_0< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_right_0< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_right_1 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_right_1< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_right_1< Kokkos::Qthreads >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_right_3 ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_right_3< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_right_3< Kokkos::Qthreads >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..199c5c795557bb4da254c24d320a99240768e014
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_b.cpp
@@ -0,0 +1,66 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_layoutleft_to_layoutleft ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_layoutleft_to_layoutleft with default, Atomic and RandomAccess memory traits is compiled out.
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif // 0
+}
+
+TEST_F( qthreads, view_subview_layoutright_to_layoutright ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_layoutright_to_layoutright with default, Atomic and RandomAccess memory traits is compiled out.
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f44909f3daffd71b13a12eba33b4e8e142e946ad
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c01.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_1d_assign< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7bb936f8dd511034924d779362f34e10833b2668
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c02.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_atomic ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_1d_assign with Kokkos::MemoryTraits<Kokkos::Atomic> is compiled out.
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..27073dfa814683a77a0edc602e23f3c3aadcd0e2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c03.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_1d_assign_randomaccess ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_1d_assign with Kokkos::MemoryTraits<Kokkos::RandomAccess> is compiled out.
+  TestViewSubview::test_1d_assign< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1b3cf488521b6ed84aa7eda62084ba737d485abf
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c04.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_2d_subview_3d< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34dda63e64da0cb39b1a7d977ff08477aa8bbfec
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c05.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_atomic ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_2d_subview_3d with Kokkos::MemoryTraits<Kokkos::Atomic> is compiled out.
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5a4ee50fb2f6b41ddfc504192a3815d4a1775f5e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c06.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_2d_from_3d_randomaccess ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: test_2d_subview_3d with Kokkos::MemoryTraits<Kokkos::RandomAccess> is compiled out.
+  TestViewSubview::test_2d_subview_3d< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe386e34a8083a8bc2084b6957f57124a78d41c3
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c07.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left ) // Currently an empty test: the whole body sits under #if 0.
+{
+#if 0 // Disabled: the test_3d_subview_5d_left< Kokkos::Qthreads >() call is compiled out.
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads >();
+#endif // 0
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a3e0ab25291334f291adf3ba743c822eea552380
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c08.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_atomic )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >(); // Same helper as the plain variant, with Atomic memory traits.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df1f570e9dce927b75c11695a11124564e39d567
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c09.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_left_randomaccess )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); // Same helper as the plain variant, with RandomAccess memory traits.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cc3c80d10d7b3fd544ed7b49fa56b9f2f4e8b5a7
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c10.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads >(); // Call that would run on Kokkos::Qthreads if enabled.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..14b331a4585efeb912c0ec7001cf0195657c60de
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c11.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_atomic )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::Atomic> >(); // Same helper as the plain variant, with Atomic memory traits.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..571382e66f52d5a6c8294af1d117ebaeb6fe25f5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c12.cpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, view_subview_3d_from_5d_right_randomaccess )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Qthreads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >(); // Same helper as the plain variant, with RandomAccess memory traits.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab984c5f30e05958c0c601256ada3c13a70ee68d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp
@@ -0,0 +1,12 @@
+#include <qthreads/TestQthreads_SubView_c01.cpp>
+#include <qthreads/TestQthreads_SubView_c02.cpp>
+#include <qthreads/TestQthreads_SubView_c03.cpp>
+#include <qthreads/TestQthreads_SubView_c04.cpp>
+#include <qthreads/TestQthreads_SubView_c05.cpp>
+#include <qthreads/TestQthreads_SubView_c06.cpp>
+#include <qthreads/TestQthreads_SubView_c07.cpp>
+#include <qthreads/TestQthreads_SubView_c08.cpp>
+#include <qthreads/TestQthreads_SubView_c09.cpp>
+#include <qthreads/TestQthreads_SubView_c10.cpp>
+#include <qthreads/TestQthreads_SubView_c11.cpp>
+#include <qthreads/TestQthreads_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e7b81283fbf27e97427defbf1b0894793cc44ed2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_Team.cpp
@@ -0,0 +1,143 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, team_tag )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 ); // First group: argument 0, Static and Dynamic schedules.
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 ); // Second group: same four calls with argument 2.
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
+
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 ); // Third group: same four calls with argument 1000.
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
+#endif
+}
+
+TEST_F( qthreads, team_shared_request )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >(); // Static schedule variant.
+  TestSharedTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >(); // Dynamic schedule variant.
+#endif
+}
+
+TEST_F( qthreads, team_scratch_request )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >(); // Static schedule variant.
+  TestScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >(); // Dynamic schedule variant.
+#endif
+}
+
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA ) // Test is only declared when this macro is defined.
+TEST_F( qthreads, team_lambda_shared_request )
+{
+#if 0 // Body is compiled out: even when declared, the test is an empty placeholder.
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >(); // Static schedule variant.
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >(); // Dynamic schedule variant.
+#endif
+}
+#endif
+
+TEST_F( qthreads, shmem_size )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestShmemSize< Kokkos::Qthreads >(); // Call that would run on Kokkos::Qthreads if enabled.
+#endif
+}
+
+TEST_F( qthreads, multi_level_scratch )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Static> >(); // Static schedule variant.
+  TestMultiLevelScratchTeam< Kokkos::Qthreads, Kokkos::Schedule<Kokkos::Dynamic> >(); // Dynamic schedule variant.
+#endif
+}
+
+TEST_F( qthreads, team_vector )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 0 ) ) ); // One assertion per argument value 0 through 10.
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthreads >( 10 ) ) );
+#endif
+}
+
+#ifdef KOKKOS_COMPILER_GNU
+#if ( KOKKOS_COMPILER_GNU == 472 ) // SKIP_TEST is defined only for this value (presumably GCC 4.7.2 -- confirm).
+#define SKIP_TEST
+#endif
+#endif
+
+#ifndef SKIP_TEST // The test below is not even declared when SKIP_TEST is defined.
+TEST_F( qthreads, triple_nested_parallelism )
+{
+#if 0 // Body is compiled out: even when declared, the test is an empty placeholder.
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 32 ); // Three runs differing only in the last two arguments.
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Qthreads >( 8192, 2048, 16, 16 );
+#endif
+}
+#endif
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd876a36bfa457f3c5f895d604f38be27fa4e986
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_a.cpp
@@ -0,0 +1,56 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_view_mapping_a )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  test_view_mapping< Kokkos::Qthreads >(); // Both helpers would be instantiated for Kokkos::Qthreads if enabled.
+  test_view_mapping_operator< Kokkos::Qthreads >();
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adf048b61360b1aa9d49d9ce0f93453d580eb1a4
--- /dev/null
+++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_ViewAPI_b.cpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <qthreads/TestQthreads.hpp>
+
+namespace Test {
+
+TEST_F( qthreads, impl_shared_alloc )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Qthreads >(); // Would pair HostSpace memory with the Qthreads execution space.
+#endif
+}
+
+TEST_F( qthreads, impl_view_mapping_b )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  test_view_mapping_subview< Kokkos::Qthreads >(); // Both helpers would be instantiated for Kokkos::Qthreads if enabled.
+  TestViewMappingAtomic< Kokkos::Qthreads >::run();
+#endif
+}
+
+TEST_F( qthreads, view_api )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewAPI< double, Kokkos::Qthreads >(); // Would instantiate the helper with double on Kokkos::Qthreads.
+#endif
+}
+
+TEST_F( qthreads, view_nested_view )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  ::Test::view_nested_view< Kokkos::Qthreads >(); // Fully qualified so the helper, not this test's own name, is selected.
+#endif
+}
+
+TEST_F( qthreads, view_remap )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 }; // Extents shared by all three view types below.
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Qthreads > output_type; // Destination: double elements, LayoutRight.
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > input_type; // Source: int elements, LayoutLeft, two runtime extents.
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Qthreads > diff_type; // Second extent is N0 rather than N1, so its shape differs from input_type.
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 ); // Only referenced by the commented-out deep_copy below.
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value; // Fill with 1, 2, 3, ... with i0 varying fastest.
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input ); // Copy across differing element type (int -> double) and layout (Left -> Right).
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) ); // Same traversal order as the fill: each index must hold the value written to input.
+  }
+#endif
+}
+
+TEST_F( qthreads, view_aggregate )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestViewAggregate< Kokkos::Qthreads >(); // Call that would run on Kokkos::Qthreads if enabled.
+#endif
+}
+
+TEST_F( qthreads, template_meta_functions )
+{
+#if 0 // Body is compiled out: this test is currently an empty placeholder.
+  TestTemplateMetaFunctions< int, Kokkos::Qthreads >(); // Would instantiate the helper with int on Kokkos::Qthreads.
+#endif
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial.hpp b/lib/kokkos/core/unit_test/serial/TestSerial.hpp
index c0ffa6afb1843f7fe61693a778d9389e4c20fccb..03da07e065e371e636f1d2c59ba99a2832dd574c 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial.hpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_SERIAL_HPP
 #define KOKKOS_TEST_SERIAL_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,21 +56,14 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
 #include <TestAtomic.hpp>
 #include <TestAtomicOperations.hpp>
-
 #include <TestAtomicViews.hpp>
-
 #include <TestRange.hpp>
 #include <TestTeam.hpp>
 #include <TestReduce.hpp>
@@ -76,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -92,14 +84,16 @@ namespace Test {
 class serial : public ::testing::Test {
 protected:
   static void SetUpTestCase()
-    {
-      Kokkos::HostSpace::execution_space::initialize();
-    }
+  {
+    Kokkos::HostSpace::execution_space::initialize(); // Bring up HostSpace's execution space in the fixture's static set-up hook.
+  }
+
   static void TearDownTestCase()
-    {
-      Kokkos::HostSpace::execution_space::finalize();
-    }
+  {
+    Kokkos::HostSpace::execution_space::finalize(); // Matching shutdown for the initialize() call in SetUpTestCase().
+  }
 };
 
-}
+} // namespace Test
+
 #endif
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
index 729a76556dc4f3ff8110ba62b02dfc57ec878590..81ba532a3d45322ca561498585763d413256be3c 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Atomics.cpp
@@ -40,165 +40,165 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , atomics )
+TEST_F( serial, atomics )
 {
-  const int loop_count = 1e6 ;
+  const int loop_count = 1e6;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Serial >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Serial >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Serial >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Serial>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Serial >( 100, 3 ) ) );
 }
 
-TEST_F( serial , atomic_operations )
+TEST_F( serial, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Serial>(start, end-i, 12) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Serial>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Serial>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Serial>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Serial >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Serial >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Serial >( start, end - i, 4 ) ) );
   }
-
 }
 
 
-TEST_F( serial , atomic_views_integral )
+TEST_F( serial, atomic_views_integral )
 {
   const long length = 1000000;
-  {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Serial>(length, 8 ) ) );
 
+  {
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Serial >( length, 8 ) ) );
   }
 }
 
-TEST_F( serial , atomic_views_nonintegral )
+TEST_F( serial, atomic_views_nonintegral )
 {
   const long length = 1000000;
-  {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Serial>(length, 4 ) ) );
 
+  {
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Serial >( length, 4 ) ) );
   }
 }
 
-TEST_F( serial , atomic_view_api )
+TEST_F( serial, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Serial>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
index 43fc4c358745f3f01032723d029796a78bcf76a1..b40ed3f4afc5b4176f02c2ad7d16a5ce19f2614b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Other.cpp
@@ -40,50 +40,61 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , md_range ) {
-  TestMDRange_2D< Kokkos::Serial >::test_for2(100,100);
+TEST_F( serial, mdrange_for )
+{
+  TestMDRange_2D< Kokkos::Serial >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Serial >::test_for3( 100, 10, 100 );
+  TestMDRange_4D< Kokkos::Serial >::test_for4( 100, 10, 10, 10 );
+  TestMDRange_5D< Kokkos::Serial >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Serial >::test_for6( 10, 10, 10, 10, 5, 5 );
+}
 
-  TestMDRange_3D< Kokkos::Serial >::test_for3(100,100,100);
+TEST_F( serial, mdrange_reduce )
+{
+  TestMDRange_2D< Kokkos::Serial >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::Serial >::test_reduce3( 100, 10, 100 );
 }
 
-TEST_F( serial, policy_construction) {
+TEST_F( serial, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::Serial >();
   TestTeamPolicyConstruction< Kokkos::Serial >();
 }
 
-TEST_F( serial , range_tag )
+TEST_F( serial, range_tag )
 {
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( serial , compiler_macros )
+TEST_F( serial, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Serial >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( serial , memory_pool )
+TEST_F( serial, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Serial >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -97,24 +108,24 @@ TEST_F( serial , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 
-TEST_F( serial , task_fib )
+TEST_F( serial, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Serial >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Serial >::run( i );
   }
 }
 
-TEST_F( serial , task_depend )
+TEST_F( serial, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Serial >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Serial >::run( i );
   }
 }
 
-TEST_F( serial , task_team )
+TEST_F( serial, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Serial >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Serial >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Serial >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Serial >::run( 1000 ); // Put back after testing.
 }
 
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -122,44 +133,40 @@ TEST_F( serial , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
-TEST_F( serial , cxx11 )
+TEST_F( serial, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Serial >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Serial >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( serial, tile_layout )
 {
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Serial , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Serial , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Serial, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Serial, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Serial, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Serial, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Serial, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Serial, 8, 8 >( 9, 11 );
 }
 
-
-
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
index 25b5ac6d16a8d101dd1e7d940007a107d1c814fc..8a3d518cfbea93b97d9a885ac061a79494676362 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Reductions.cpp
@@ -40,83 +40,90 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, long_reduce) {
-  TestReduce< long ,   Kokkos::Serial >( 0 );
-  TestReduce< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce )
+{
+  TestReduce< long, Kokkos::Serial >( 0 );
+  TestReduce< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, double_reduce) {
-  TestReduce< double ,   Kokkos::Serial >( 0 );
-  TestReduce< double ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, double_reduce )
+{
+  TestReduce< double, Kokkos::Serial >( 0 );
+  TestReduce< double, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial , reducers )
+TEST_F( serial, reducers )
 {
-  TestReducers<int, Kokkos::Serial>::execute_integer();
-  TestReducers<size_t, Kokkos::Serial>::execute_integer();
-  TestReducers<double, Kokkos::Serial>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Serial>::execute_basic();
+  TestReducers< int, Kokkos::Serial >::execute_integer();
+  TestReducers< size_t, Kokkos::Serial >::execute_integer();
+  TestReducers< double, Kokkos::Serial >::execute_float();
+  TestReducers< Kokkos::complex<double >, Kokkos::Serial>::execute_basic();
 }
 
-TEST_F( serial, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Serial >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::Serial >( 0 );
+  TestReduceDynamic< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Serial >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::Serial >( 0 );
+  TestReduceDynamic< double, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Serial >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Serial >( 1000000 );
+TEST_F( serial, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::Serial >( 0 );
+  TestReduceDynamicView< long, Kokkos::Serial >( 1000000 );
 }
 
-TEST_F( serial , scan )
+TEST_F( serial, scan )
 {
-  TestScan< Kokkos::Serial >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Serial >::test_range( 1, 1000 );
   TestScan< Kokkos::Serial >( 0 );
   TestScan< Kokkos::Serial >( 10 );
   TestScan< Kokkos::Serial >( 10000 );
 }
 
-TEST_F( serial  , team_scan )
+TEST_F( serial, team_scan )
 {
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( serial , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( serial, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( serial , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( serial, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( serial , reduction_deduction )
+TEST_F( serial, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
index bc838ccde4b36cf964d0da97500fdbd921a85aa0..3dc3e2019d9fd3927f422c689bfbd65fc45a997b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Serial >();
+TEST_F( serial, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_assign_strided ) {
+TEST_F( serial, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_0 ) {
+TEST_F( serial, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_1 ) {
+TEST_F( serial, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_2 ) {
+TEST_F( serial, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_left_3 ) {
+TEST_F( serial, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_0 ) {
+TEST_F( serial, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_1 ) {
+TEST_F( serial, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::Serial >();
 }
 
-TEST_F( serial, view_subview_right_3 ) {
+TEST_F( serial, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
index e6a5b56d3ed48ac2301e56b944e4924dcb79451e..536c3bf1979a5b3b9bc33cd8768a86ca3367a8c7 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_layoutleft_to_layoutleft) {
+TEST_F( serial, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( serial, view_subview_layoutright_to_layoutright) {
+TEST_F( serial, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
index 0b7a0d3bfa6fa514195a4fd6241fc262f0ad884d..579a12bf782a34c4739c9e4a30685878dc55900e 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign ) {
+TEST_F( serial, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
index 8ca7285c1f8331cb6992411d6b35d7bc054945a3..ff009fef27715a8b366e848267eaa4c6c10bc2d7 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
index 1d156c741524315d2fb66fdc5e852329d846d3ae..a20478433cd2b87f0e07a0e793143c4f6f2ddf40 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
index ebf0e5c99155afe17dea3807981d712e1d67c601..a34b26d9f79317b90dd0bfaf06385ad638d4757f 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d ) {
+TEST_F( serial, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
index 74acb92f1b9e632a980b7d0141a54200aebbfd15..6d1882cf04e3d384773d384215cd0244ebd8cfcd 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
index 8075d46e0fe15c4c15a47e80f6172d4990fd6ce5..12fb883b63e12812c947facc4b070c0577d09783 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
index 9ce8222643a5d3a183fad578013945a67efd6847..8aae20c0239d5a6272879887c7626f0e1a0e2f2a 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left ) {
+TEST_F( serial, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
index c8a5c8f33fdc70a2408aade42f21b3c451753b4c..e75db8d52dc1250b582d62c7e51b6bda8ce00b9b 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
index b66f15f17da1b7f0bcb24459678965dacee04f9b..b9cea2ce89c6f2bb311299ee6463ac34185245d8 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
index 5e5e3cf3d1af0f0755ab8fa3f8be9f846ff554e9..e5dbcead376ebdcb37a4bb79dfdfe1916b3e2d0d 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right ) {
+TEST_F( serial, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
index 55a353bcafef5e852ec33c80d9084f7c2236efcc..3005030f934551a0f8ea5d6be7772cfefa605a98 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( serial, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
index a168e1e232ff5f71cce593be776496cbd7dd6c25..fee8cb7af2a20cdebafa9270932cda2457363602 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( serial, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Serial, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
index a489b0fcb585aa0e12310f09a0701188b8814045..24dc6b5061412c04998f734cab9f1367a9b7d4fe 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp
@@ -1,12 +1,12 @@
-#include<serial/TestSerial_SubView_c01.cpp>
-#include<serial/TestSerial_SubView_c02.cpp>
-#include<serial/TestSerial_SubView_c03.cpp>
-#include<serial/TestSerial_SubView_c04.cpp>
-#include<serial/TestSerial_SubView_c05.cpp>
-#include<serial/TestSerial_SubView_c06.cpp>
-#include<serial/TestSerial_SubView_c07.cpp>
-#include<serial/TestSerial_SubView_c08.cpp>
-#include<serial/TestSerial_SubView_c09.cpp>
-#include<serial/TestSerial_SubView_c10.cpp>
-#include<serial/TestSerial_SubView_c11.cpp>
-#include<serial/TestSerial_SubView_c12.cpp>
+#include <serial/TestSerial_SubView_c01.cpp>
+#include <serial/TestSerial_SubView_c02.cpp>
+#include <serial/TestSerial_SubView_c03.cpp>
+#include <serial/TestSerial_SubView_c04.cpp>
+#include <serial/TestSerial_SubView_c05.cpp>
+#include <serial/TestSerial_SubView_c06.cpp>
+#include <serial/TestSerial_SubView_c07.cpp>
+#include <serial/TestSerial_SubView_c08.cpp>
+#include <serial/TestSerial_SubView_c09.cpp>
+#include <serial/TestSerial_SubView_c10.cpp>
+#include <serial/TestSerial_SubView_c11.cpp>
+#include <serial/TestSerial_SubView_c12.cpp>
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
index df400b4cb51587b76992c26ff28419b334b5d2d6..f13b2ce1b4bd20e92509fc9dc1801352ff3bb289 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_Team.cpp
@@ -40,62 +40,68 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , team_tag )
+TEST_F( serial, team_tag )
 {
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( serial , team_shared_request) {
-  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( serial, team_scratch_request) {
-  TestScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( serial , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( serial, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( serial, shmem_size) {
+TEST_F( serial, shmem_size )
+{
   TestShmemSize< Kokkos::Serial >();
 }
 
-TEST_F( serial, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Serial , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( serial, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Serial, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( serial , team_vector )
+TEST_F( serial, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -107,11 +113,10 @@ TEST_F( serial , team_vector )
 #ifndef SKIP_TEST
 TEST_F( serial, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Serial >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
index 4c655fe770f26fd8d6b239251c5d6301140faa09..2192159b8439a2b4fdd0fcc38b3be4d382973821 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , impl_view_mapping_a ) {
+TEST_F( serial, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::Serial >();
   test_view_mapping_operator< Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
index 4947f2eaaef607b04d680a7c9c64ae6f2d8e6087..8c48ad2ceda81ca46913e3d3206fac96e492950a 100644
--- a/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/serial/TestSerial_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <serial/TestSerial.hpp>
 
 namespace Test {
 
-TEST_F( serial , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >();
+TEST_F( serial, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Serial >();
 }
 
-TEST_F( serial , impl_view_mapping_b ) {
+TEST_F( serial, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::Serial >();
   TestViewMappingAtomic< Kokkos::Serial >::run();
 }
 
-TEST_F( serial, view_api) {
-  TestViewAPI< double , Kokkos::Serial >();
+TEST_F( serial, view_api )
+{
+  TestViewAPI< double, Kokkos::Serial >();
 }
 
-TEST_F( serial , view_nested_view )
+TEST_F( serial, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Serial >();
 }
 
-
-
-TEST_F( serial , view_remap )
+TEST_F( serial, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Serial > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Serial > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Serial > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Serial > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Serial > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Serial > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( serial , view_aggregate )
+TEST_F( serial, view_aggregate )
 {
   TestViewAggregate< Kokkos::Serial >();
 }
 
-TEST_F( serial , template_meta_functions )
+TEST_F( serial, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Serial >();
+  TestTemplateMetaFunctions< int, Kokkos::Serial >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads.hpp b/lib/kokkos/core/unit_test/threads/TestThreads.hpp
index 4f611cf99c7c0e4f3c4b26f0fada9c7c8469ddbe..0afd6772fefff3e2efd7d490d35f985346163fd6 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads.hpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads.hpp
@@ -40,11 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #ifndef KOKKOS_TEST_THREADS_HPP
 #define KOKKOS_TEST_THREADS_HPP
+
 #include <gtest/gtest.h>
 
 #include <Kokkos_Macros.hpp>
+
 #ifdef KOKKOS_LAMBDA
 #undef KOKKOS_LAMBDA
 #endif
@@ -53,13 +56,8 @@
 #include <Kokkos_Core.hpp>
 
 #include <TestTile.hpp>
-
-//----------------------------------------------------------------------------
-
 #include <TestSharedAlloc.hpp>
 #include <TestViewMapping.hpp>
-
-
 #include <TestViewAPI.hpp>
 #include <TestViewOfClass.hpp>
 #include <TestViewSubview.hpp>
@@ -74,15 +72,11 @@
 #include <TestCompilerMacros.hpp>
 #include <TestTaskScheduler.hpp>
 #include <TestMemoryPool.hpp>
-
-
 #include <TestCXX11.hpp>
 #include <TestCXX11Deduction.hpp>
 #include <TestTeamVector.hpp>
 #include <TestTemplateMetaFunctions.hpp>
-
 #include <TestPolicyConstruction.hpp>
-
 #include <TestMDRange.hpp>
 
 namespace Test {
@@ -95,13 +89,13 @@ protected:
     const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
     const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
 
-    unsigned threads_count = 0 ;
+    unsigned threads_count = 0;
 
-    threads_count = std::max( 1u , numa_count )
-                  * std::max( 2u , cores_per_numa * threads_per_core );
+    threads_count = std::max( 1u, numa_count )
+                  * std::max( 2u, cores_per_numa * threads_per_core );
 
     Kokkos::Threads::initialize( threads_count );
-    Kokkos::Threads::print_configuration( std::cout , true /* detailed */ );
+    Kokkos::print_configuration( std::cout, true /* detailed */ );
   }
 
   static void TearDownTestCase()
@@ -110,6 +104,6 @@ protected:
   }
 };
 
+} // namespace Test
 
-}
 #endif
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
index 6e24c4973ed7c37ff559a5ad023a69fabb607b29..d2a5ea5d6352acc79606082fd75c465b0b5b515e 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Atomics.cpp
@@ -40,165 +40,161 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , atomics )
+TEST_F( threads, atomics )
 {
-  const int loop_count = 1e4 ;
+  const int loop_count = 1e4;
 
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< unsigned long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< long long int, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< double, Kokkos::Threads >( loop_count, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< float, Kokkos::Threads >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, Kokkos::Threads >( 100, 3 ) ) );
 
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,1) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,2) ) );
-  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<4> ,Kokkos::Threads>(100,3) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 1 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 2 ) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, Kokkos::Threads >( 100, 3 ) ) );
 }
 
-TEST_F( threads , atomic_operations )
+TEST_F( threads, atomic_operations )
 {
-  const int start = 1; //Avoid zero for division
+  const int start = 1; // Avoid zero for division.
   const int end = 11;
-  for (int i = start; i < end; ++i)
+  for ( int i = start; i < end; ++i )
   {
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<unsigned long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 8 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 9 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 11 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType<long long int,Kokkos::Threads>(start, end-i, 12 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<double,Kokkos::Threads>(start, end-i, 4 ) ) );
-
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType<float,Kokkos::Threads>(start, end-i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< unsigned long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 8 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 9 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 11 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestIntegralType< long long int, Kokkos::Threads >( start, end - i, 12 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< double, Kokkos::Threads >( start, end - i, 4 ) ) );
+
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicOperations::AtomicOperationsTestNonIntegralType< float, Kokkos::Threads >( start, end - i, 4 ) ) );
   }
-
 }
 
-
-TEST_F( threads , atomic_views_integral )
+TEST_F( threads, atomic_views_integral )
 {
   const long length = 1000000;
   {
-    //Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 4 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 5 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 6 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 7 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType<long, Kokkos::Threads>(length, 8 ) ) );
-
+    // Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 4 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 5 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 6 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 7 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestIntegralType< long, Kokkos::Threads >( length, 8 ) ) );
   }
 }
 
-TEST_F( threads , atomic_views_nonintegral )
+TEST_F( threads, atomic_views_nonintegral )
 {
   const long length = 1000000;
   {
-    //Non-Integral Types
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 1 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 2 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 3 ) ) );
-    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType<double,Kokkos::Threads>(length, 4 ) ) );
-
+    // Non-Integral Types.
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 1 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 2 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 3 ) ) );
+    ASSERT_TRUE( ( TestAtomicViews::AtomicViewsTestNonIntegralType< double, Kokkos::Threads >( length, 4 ) ) );
   }
 }
 
-TEST_F( threads , atomic_view_api )
+TEST_F( threads, atomic_view_api )
 {
-  TestAtomicViews::TestAtomicViewAPI<int, Kokkos::Threads>();
+  TestAtomicViews::TestAtomicViewAPI< int, Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
index ac0356eeb4c9c15d5409c0e9d10a772941de57d0..7d268c14547e4680c1ad57d8e66e2b1a4bfaf501 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Other.cpp
@@ -40,65 +40,74 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , init ) {
+TEST_F( threads, init )
+{
   ;
 }
 
-TEST_F( threads , md_range ) {
-  TestMDRange_2D< Kokkos::Threads >::test_for2(100,100);
+TEST_F( threads , mdrange_for ) {
+  TestMDRange_2D< Kokkos::Threads >::test_for2( 100, 100 );
+  TestMDRange_3D< Kokkos::Threads >::test_for3( 100, 10, 100 );
+  TestMDRange_4D< Kokkos::Threads >::test_for4( 100, 10, 10, 10 );
+  TestMDRange_5D< Kokkos::Threads >::test_for5( 100, 10, 10, 10, 5 );
+  TestMDRange_6D< Kokkos::Threads >::test_for6( 10, 10, 10, 10, 5, 5 );
+}
 
-  TestMDRange_3D< Kokkos::Threads >::test_for3(100,100,100);
+TEST_F( threads , mdrange_reduce ) {
+  TestMDRange_2D< Kokkos::Threads >::test_reduce2( 100, 100 );
+  TestMDRange_3D< Kokkos::Threads >::test_reduce3( 100, 10, 100 );
 }
 
-TEST_F( threads, policy_construction) {
+TEST_F( threads, policy_construction )
+{
   TestRangePolicyConstruction< Kokkos::Threads >();
   TestTeamPolicyConstruction< Kokkos::Threads >();
 }
 
-TEST_F( threads , range_tag )
+TEST_F( threads, range_tag )
 {
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(0);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(0);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(2);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(3);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(3);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_scan(1000);
-
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_scan(1001);
-  TestRange< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy(1000);
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 0 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 0 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 2 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 3 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 3 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_scan( 1000 );
+
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_scan( 1001 );
+  TestRange< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_dynamic_policy( 1000 );
 }
 
-
 //----------------------------------------------------------------------------
 
-TEST_F( threads , compiler_macros )
+TEST_F( threads, compiler_macros )
 {
   ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Threads >() ) );
 }
 
 //----------------------------------------------------------------------------
 
-TEST_F( threads , memory_pool )
+TEST_F( threads, memory_pool )
 {
   bool val = TestMemoryPool::test_mempool< Kokkos::Threads >( 128, 128000000 );
   ASSERT_TRUE( val );
@@ -112,24 +121,24 @@ TEST_F( threads , memory_pool )
 
 #if defined( KOKKOS_ENABLE_TASKDAG )
 /*
-TEST_F( threads , task_fib )
+TEST_F( threads, task_fib )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestFib< Kokkos::Threads >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestFib< Kokkos::Threads >::run( i );
   }
 }
 
-TEST_F( threads , task_depend )
+TEST_F( threads, task_depend )
 {
-  for ( int i = 0 ; i < 25 ; ++i ) {
-    TestTaskScheduler::TestTaskDependence< Kokkos::Threads >::run(i);
+  for ( int i = 0; i < 25; ++i ) {
+    TestTaskScheduler::TestTaskDependence< Kokkos::Threads >::run( i );
   }
 }
 
-TEST_F( threads , task_team )
+TEST_F( threads, task_team )
 {
-  TestTaskScheduler::TestTaskTeam< Kokkos::Threads >::run(1000);
-  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Threads >::run(1000); //put back after testing
+  TestTaskScheduler::TestTaskTeam< Kokkos::Threads >::run( 1000 );
+  //TestTaskScheduler::TestTaskTeamValue< Kokkos::Threads >::run( 1000 ); // Put back after testing.
 }
 */
 #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
@@ -137,53 +146,51 @@ TEST_F( threads , task_team )
 //----------------------------------------------------------------------------
 
 #if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
-TEST_F( threads , cxx11 )
+TEST_F( threads, cxx11 )
 {
-  if ( std::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Threads >::value ) {
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(1) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(2) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(3) ) );
-    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(4) ) );
+  if ( std::is_same< Kokkos::DefaultExecutionSpace, Kokkos::Threads >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 1 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 2 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 3 ) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >( 4 ) ) );
   }
 }
 #endif
 
 TEST_F( threads, tile_layout )
 {
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 2 , 3 );
-  TestTile::test< Kokkos::Threads , 1 , 1 >( 9 , 10 );
-
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 2 , 3 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 2 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Threads , 2 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 4 , 2 >( 9 , 9 );
-
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 4 , 4 >( 9 , 11 );
-
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 1 , 1 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 4 , 4 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 9 , 9 );
-  TestTile::test< Kokkos::Threads , 8 , 8 >( 9 , 11 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 2, 3 );
+  TestTile::test< Kokkos::Threads, 1, 1 >( 9, 10 );
+
+  TestTile::test< Kokkos::Threads, 2, 2 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 2, 3 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 2, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Threads, 2, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 4, 2 >( 9, 9 );
+
+  TestTile::test< Kokkos::Threads, 4, 4 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 4, 4 >( 9, 11 );
+
+  TestTile::test< Kokkos::Threads, 8, 8 >( 1, 1 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 4, 4 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 9, 9 );
+  TestTile::test< Kokkos::Threads, 8, 8 >( 9, 11 );
 }
 
-
-TEST_F( threads , dispatch )
+TEST_F( threads, dispatch )
 {
-  const int repeat = 100 ;
-  for ( int i = 0 ; i < repeat ; ++i ) {
-  for ( int j = 0 ; j < repeat ; ++j ) {
-    Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >(0,j)
-                        , KOKKOS_LAMBDA( int ) {} );
-  }}
+  const int repeat = 100;
+  for ( int i = 0; i < repeat; ++i ) {
+    for ( int j = 0; j < repeat; ++j ) {
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0, j )
+                          , KOKKOS_LAMBDA( int ) {} );
+    }
+  }
 }
 
-
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
index a637d1e3ab654b402e49b7d3aec582e425d2592a..d2b75ca892b5abcf3f405aec37459f53c2a3aafc 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Reductions.cpp
@@ -40,46 +40,52 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, long_reduce) {
-  TestReduce< long ,   Kokkos::Threads >( 0 );
-  TestReduce< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce )
+{
+  TestReduce< long, Kokkos::Threads >( 0 );
+  TestReduce< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, double_reduce) {
-  TestReduce< double ,   Kokkos::Threads >( 0 );
-  TestReduce< double ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, double_reduce )
+{
+  TestReduce< double, Kokkos::Threads >( 0 );
+  TestReduce< double, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads , reducers )
+TEST_F( threads, reducers )
 {
-  TestReducers<int, Kokkos::Threads>::execute_integer();
-  TestReducers<size_t, Kokkos::Threads>::execute_integer();
-  TestReducers<double, Kokkos::Threads>::execute_float();
-  TestReducers<Kokkos::complex<double>, Kokkos::Threads>::execute_basic();
+  TestReducers< int, Kokkos::Threads >::execute_integer();
+  TestReducers< size_t, Kokkos::Threads >::execute_integer();
+  TestReducers< double, Kokkos::Threads >::execute_float();
+  TestReducers< Kokkos::complex<double>, Kokkos::Threads >::execute_basic();
 }
 
-TEST_F( threads, long_reduce_dynamic ) {
-  TestReduceDynamic< long ,   Kokkos::Threads >( 0 );
-  TestReduceDynamic< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce_dynamic )
+{
+  TestReduceDynamic< long, Kokkos::Threads >( 0 );
+  TestReduceDynamic< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, double_reduce_dynamic ) {
-  TestReduceDynamic< double ,   Kokkos::Threads >( 0 );
-  TestReduceDynamic< double ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, double_reduce_dynamic )
+{
+  TestReduceDynamic< double, Kokkos::Threads >( 0 );
+  TestReduceDynamic< double, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads, long_reduce_dynamic_view ) {
-  TestReduceDynamicView< long ,   Kokkos::Threads >( 0 );
-  TestReduceDynamicView< long ,   Kokkos::Threads >( 1000000 );
+TEST_F( threads, long_reduce_dynamic_view )
+{
+  TestReduceDynamicView< long, Kokkos::Threads >( 0 );
+  TestReduceDynamicView< long, Kokkos::Threads >( 1000000 );
 }
 
-TEST_F( threads , scan )
+TEST_F( threads, scan )
 {
-  TestScan< Kokkos::Threads >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Threads >::test_range( 1, 1000 );
   TestScan< Kokkos::Threads >( 0 );
   TestScan< Kokkos::Threads >( 100000 );
   TestScan< Kokkos::Threads >( 10000000 );
@@ -87,10 +93,11 @@ TEST_F( threads , scan )
 }
 
 #if 0
-TEST_F( threads , scan_small )
+TEST_F( threads, scan_small )
 {
-  typedef TestScan< Kokkos::Threads , Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor ;
-  for ( int i = 0 ; i < 1000 ; ++i ) {
+  typedef TestScan< Kokkos::Threads, Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor;
+
+  for ( int i = 0; i < 1000; ++i ) {
     TestScanFunctor( 10 );
     TestScanFunctor( 10000 );
   }
@@ -101,38 +108,39 @@ TEST_F( threads , scan_small )
 }
 #endif
 
-TEST_F( threads  , team_scan )
+TEST_F( threads, team_scan )
 {
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 10000 );
-  TestScanTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 10 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 10 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 10000 );
+  TestScanTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 10000 );
 }
 
-TEST_F( threads , team_long_reduce) {
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< long ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( threads, team_long_reduce )
+{
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< long, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( threads , team_double_reduce) {
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >( 100000 );
-  TestReduceTeam< double ,   Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
+TEST_F( threads, team_double_reduce )
+{
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 0 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 0 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 3 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 3 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >( 100000 );
+  TestReduceTeam< double, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >( 100000 );
 }
 
-TEST_F( threads , reduction_deduction )
+TEST_F( threads, reduction_deduction )
 {
   TestCXX11::test_reduction_deduction< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
index 2df9e19deb0130359d81b8c3cc001bb85ee7cb2f..68a9da6aedef550e94c037df93ff6dc741ff3589 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_a.cpp
@@ -40,53 +40,64 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_auto_1d_left ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_left )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_auto_1d_right ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_right )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_auto_1d_stride ) {
-  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Threads >();
+TEST_F( threads, view_subview_auto_1d_stride )
+{
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride, Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_assign_strided ) {
+TEST_F( threads, view_subview_assign_strided )
+{
   TestViewSubview::test_1d_strided_assignment< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_0 ) {
+TEST_F( threads, view_subview_left_0 )
+{
   TestViewSubview::test_left_0< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_1 ) {
+TEST_F( threads, view_subview_left_1 )
+{
   TestViewSubview::test_left_1< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_2 ) {
+TEST_F( threads, view_subview_left_2 )
+{
   TestViewSubview::test_left_2< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_left_3 ) {
+TEST_F( threads, view_subview_left_3 )
+{
   TestViewSubview::test_left_3< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_0 ) {
+TEST_F( threads, view_subview_right_0 )
+{
   TestViewSubview::test_right_0< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_1 ) {
+TEST_F( threads, view_subview_right_1 )
+{
   TestViewSubview::test_right_1< Kokkos::Threads >();
 }
 
-TEST_F( threads, view_subview_right_3 ) {
+TEST_F( threads, view_subview_right_3 )
+{
   TestViewSubview::test_right_3< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
index d57dbe97c0d38aaa6a2e48816eb9872a8585afb7..c5cf061e8289d9d8ac5ffea92d38c9cd91349922 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_b.cpp
@@ -40,21 +40,23 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_layoutleft_to_layoutleft) {
+TEST_F( threads, view_subview_layoutleft_to_layoutleft )
+{
   TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutleft_to_layoutleft< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-TEST_F( threads, view_subview_layoutright_to_layoutright) {
+TEST_F( threads, view_subview_layoutright_to_layoutright )
+{
   TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
-  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
+  TestViewSubview::test_layoutright_to_layoutright< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
index 67d998c0e86488df0023cc0138ffe022cdc52d94..9018c1f4f799c1f76ee082c57dedc644627c7a75 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c01.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign ) {
+TEST_F( threads, view_subview_1d_assign )
+{
   TestViewSubview::test_1d_assign< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
index e340240c48d6d28c9bc4c79b777a3e1a4a8c4ddc..9483abd9cc3f78430f2234c71708fe0315a949a9 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c02.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign_atomic ) {
-  TestViewSubview::test_1d_assign< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_1d_assign_atomic )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
index ad27fa0fa6cee9db3eb63c581a175eee0cdd6e4e..e252a26565bf6dad6387b87340c5c93cd2b3415f 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c03.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_1d_assign_randomaccess ) {
-  TestViewSubview::test_1d_assign< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_1d_assign_randomaccess )
+{
+  TestViewSubview::test_1d_assign< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
index 6fca47cc4ce41b56155fac8ce1d4b158d5e99c82..3e211b1a58542b6307a731c3765190e91132d4dd 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c04.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d ) {
+TEST_F( threads, view_subview_2d_from_3d )
+{
   TestViewSubview::test_2d_subview_3d< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
index c7dfca941582dee3d667f60152854ea30b393548..865d50b1a1b918b99fb36d2a3e5c889a7c93e5a7 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c05.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d_atomic ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_2d_from_3d_atomic )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
index 38e8394918614fdb528e9111d7fc1f54c7ff4d83..c5840073b6486226281942bfd0c0ad8e2052ff85 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c06.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_2d_from_3d_randomaccess ) {
-  TestViewSubview::test_2d_subview_3d< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_2d_from_3d_randomaccess )
+{
+  TestViewSubview::test_2d_subview_3d< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
index 1f01fe6b5e6104416bb1f2f680cafeab48cac1ad..7b8825ef628dbaa4449f7830abd4e227d842dccc 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c07.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left ) {
+TEST_F( threads, view_subview_3d_from_5d_left )
+{
   TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
index e9a1ccbe30edcf7f512a5c20462df83cf52c3ac4..7bc16a5827a602193db55f7ffa044b38babef77d 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c08.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left_atomic ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_3d_from_5d_left_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
index c8b6c8743dd25a97db5f00e5bc7157c9f040c5d9..57b87b6098bdd818c8e215ffb1d5938043746494 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c09.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_left_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_3d_from_5d_left_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_left< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
index 7cef6fa07be88859c063470857d775964c74f2fa..1875a883d485e1620430cadc59c09554dfc00ac1 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c10.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right ) {
+TEST_F( threads, view_subview_3d_from_5d_right )
+{
   TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
index d67bf3157e337fef0af36dbba934f8bc22d74d0c..cf6428b18e333d66f4637fc92a45dc7f51052cc6 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c11.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right_atomic ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::Atomic> >();
+TEST_F( threads, view_subview_3d_from_5d_right_atomic )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::Atomic> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
index e8a2c825cf3a9474d149d81a225cbadb16338cd7..7060fdb273c928d7346686c54d0a374188c47257 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c12.cpp
@@ -40,13 +40,14 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads, view_subview_3d_from_5d_right_randomaccess ) {
-  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads , Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
+TEST_F( threads, view_subview_3d_from_5d_right_randomaccess )
+{
+  TestViewSubview::test_3d_subview_5d_right< Kokkos::Threads, Kokkos::MemoryTraits<Kokkos::RandomAccess> >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
index 4690be4d3a75d8e5a7b66676ecf6b0482952d116..d802d658309b4ecfbd28a5ec4ce6d17edc4a5f4a 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_Team.cpp
@@ -40,67 +40,73 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , team_tag )
+TEST_F( threads, team_tag )
 {
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(0);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(0);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 0 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 0 );
 
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(2);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(2);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 2 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 2 );
 
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >::test_reduce(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_for(1000);
-  TestTeamPolicy< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce(1000);
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >::test_reduce( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >::test_reduce( 1000 );
 }
 
-TEST_F( threads , team_shared_request) {
-  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestSharedTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, team_shared_request )
+{
+  TestSharedTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestSharedTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( threads, team_scratch_request) {
-  TestScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, team_scratch_request )
+{
+  TestScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
-TEST_F( threads , team_lambda_shared_request) {
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
+TEST_F( threads, team_lambda_shared_request )
+{
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestLambdaSharedTeam< Kokkos::HostSpace, Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 #endif
 
-TEST_F( threads, shmem_size) {
+TEST_F( threads, shmem_size )
+{
   TestShmemSize< Kokkos::Threads >();
 }
 
-TEST_F( threads, multi_level_scratch) {
-  TestMultiLevelScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Static> >();
-  TestMultiLevelScratchTeam< Kokkos::Threads , Kokkos::Schedule<Kokkos::Dynamic> >();
+TEST_F( threads, multi_level_scratch )
+{
+  TestMultiLevelScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Static> >();
+  TestMultiLevelScratchTeam< Kokkos::Threads, Kokkos::Schedule<Kokkos::Dynamic> >();
 }
 
-TEST_F( threads , team_vector )
+TEST_F( threads, team_vector )
 {
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(0) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(1) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(2) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(3) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(4) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(5) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(6) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(7) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(8) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(9) ) );
-  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 0 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 1 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 2 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 3 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 4 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 5 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 6 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 7 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 8 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 9 ) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >( 10 ) ) );
 }
 
 #ifdef KOKKOS_COMPILER_GNU
@@ -112,11 +118,10 @@ TEST_F( threads , team_vector )
 #ifndef SKIP_TEST
 TEST_F( threads, triple_nested_parallelism )
 {
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 32 , 32 );
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 32 , 16 );
-  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048 , 16 , 16 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 32, 32 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 32, 16 );
+  TestTripleNestedReduce< double, Kokkos::Threads >( 8192, 2048, 16, 16 );
 }
 #endif
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
index 46a576b027fb2149302239ba31d6e53bd001e3ce..36eae287936ad9854dd030fc304506c3d3745c03 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_a.cpp
@@ -40,14 +40,15 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , impl_view_mapping_a ) {
+TEST_F( threads, impl_view_mapping_a )
+{
   test_view_mapping< Kokkos::Threads >();
   test_view_mapping_operator< Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
index b5d6ac843d8177149d53fe1cb52528c6ef760f3d..8c78d094435b3f524668cb1bffa44b5144749063 100644
--- a/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
+++ b/lib/kokkos/core/unit_test/threads/TestThreads_ViewAPI_b.cpp
@@ -40,82 +40,85 @@
 // ************************************************************************
 //@HEADER
 */
+
 #include <threads/TestThreads.hpp>
 
 namespace Test {
 
-TEST_F( threads , impl_shared_alloc ) {
-  test_shared_alloc< Kokkos::HostSpace , Kokkos::Threads >();
+TEST_F( threads, impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::HostSpace, Kokkos::Threads >();
 }
 
-TEST_F( threads , impl_view_mapping_b ) {
+TEST_F( threads, impl_view_mapping_b )
+{
   test_view_mapping_subview< Kokkos::Threads >();
   TestViewMappingAtomic< Kokkos::Threads >::run();
 }
 
-TEST_F( threads, view_api) {
-  TestViewAPI< double , Kokkos::Threads >();
+TEST_F( threads, view_api )
+{
+  TestViewAPI< double, Kokkos::Threads >();
 }
 
-TEST_F( threads , view_nested_view )
+TEST_F( threads, view_nested_view )
 {
   ::Test::view_nested_view< Kokkos::Threads >();
 }
 
-
-
-TEST_F( threads , view_remap )
+TEST_F( threads, view_remap )
 {
-  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
-
-  typedef Kokkos::View< double*[N1][N2][N3] ,
-                             Kokkos::LayoutRight ,
-                             Kokkos::Threads > output_type ;
-
-  typedef Kokkos::View< int**[N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Threads > input_type ;
-
-  typedef Kokkos::View< int*[N0][N2][N3] ,
-                             Kokkos::LayoutLeft ,
-                             Kokkos::Threads > diff_type ;
-
-  output_type output( "output" , N0 );
-  input_type  input ( "input" , N0 , N1 );
-  diff_type   diff  ( "diff" , N0 );
-
-  int value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    input(i0,i1,i2,i3) = ++value ;
-  }}}}
-
-  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
-  Kokkos::deep_copy( output , input );
-
-  value = 0 ;
-  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
-  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
-  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
-  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
-    ++value ;
-    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
-  }}}}
+  enum { N0 = 3, N1 = 2, N2 = 8, N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3],
+                        Kokkos::LayoutRight,
+                        Kokkos::Threads > output_type;
+
+  typedef Kokkos::View< int**[N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Threads > input_type;
+
+  typedef Kokkos::View< int*[N0][N2][N3],
+                        Kokkos::LayoutLeft,
+                        Kokkos::Threads > diff_type;
+
+  output_type output( "output", N0 );
+  input_type  input ( "input", N0, N1 );
+  diff_type   diff  ( "diff", N0 );
+
+  int value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    input( i0, i1, i2, i3 ) = ++value;
+  }
+
+  // Kokkos::deep_copy( diff, input ); // Throw with incompatible shape.
+  Kokkos::deep_copy( output, input );
+
+  value = 0;
+
+  for ( size_t i3 = 0; i3 < N3; ++i3 )
+  for ( size_t i2 = 0; i2 < N2; ++i2 )
+  for ( size_t i1 = 0; i1 < N1; ++i1 )
+  for ( size_t i0 = 0; i0 < N0; ++i0 )
+  {
+    ++value;
+    ASSERT_EQ( value, ( (int) output( i0, i1, i2, i3 ) ) );
+  }
 }
 
-//----------------------------------------------------------------------------
-
-TEST_F( threads , view_aggregate )
+TEST_F( threads, view_aggregate )
 {
   TestViewAggregate< Kokkos::Threads >();
 }
 
-TEST_F( threads , template_meta_functions )
+TEST_F( threads, template_meta_functions )
 {
-  TestTemplateMetaFunctions<int, Kokkos::Threads >();
+  TestTemplateMetaFunctions< int, Kokkos::Threads >();
 }
 
-} // namespace test
-
+} // namespace Test
diff --git a/lib/kokkos/doc/design_notes_space_instances.md b/lib/kokkos/doc/design_notes_space_instances.md
index 487fa25bcb32875ed3ba90821aba006a13cd506e..0124dfbc873285255fa92ff171dc5873056495ab 100644
--- a/lib/kokkos/doc/design_notes_space_instances.md
+++ b/lib/kokkos/doc/design_notes_space_instances.md
@@ -1,35 +1,41 @@
 # Design Notes for Execution and Memory Space Instances
 
+## Objective
 
-## Execution Spaces
+ * Enable Kokkos interoperability with coarse-grain tasking models
+ 
+## Requirements
 
-  *  Work is *dispatched* to an execution space instance
+ * Backward compatible with the existing Kokkos API
+ * Support existing Host execution spaces (Serial, Threads, OpenMP, maybe Qthreads)
+ * Support DARMA threading model (may require a new Host execution space)
+ * Support Uintah threading model, i.e. independent worker thread pools working off of shared task queues
+ 
+  
+## Execution Space
 
+  * Parallel work is *dispatched* on an execution space instance
+  
+  * Execution space instances are conceptually disjoint/independent from each other
+  
 
-
-## Host Associated Execution Space Instances
-
-Vocabulary and examples assuming C++11 Threads Support Library
+## Host Execution Space Instances
 
   *  A host-side *control* thread dispatches work to an instance
 
-  * `this_thread` is the control thread
-
   * `main` is the initial control thread
 
-  *  An execution space instance is a pool of threads
+  *  A host execution space instance is an organized thread pool
 
-  *  All instances are disjoint thread pools
+  *  All instances are disjoint, i.e. hardware resources are not shared between instances
 
   *  Exactly one control thread is associated with
      an instance and only that control thread may
      dispatch work to to that instance
 
-  *  A control thread may be a member of an instance,
-     if so then it is also the control thread associated
-     with that instance
+  *  The control thread is a member of the instance
 
-  *  The pool of threads associated with an instances is not mutatable
+  *  The pool of threads associated with an instance is not mutable during that instance's existence
 
   *  The pool of threads associated with an instance may be masked
 
@@ -37,130 +43,89 @@ Vocabulary and examples assuming C++11 Threads Support Library
 
     -  Example: only one hyperthread per core of the instance
 
-    -  When a mask is applied to an instance that mask
-       remains until cleared or another mask is applied
-
-    -  Masking is portable by defining it as using a fraction
-       of the available resources (threads)
-
-  *  Instances are shared (referenced counted) objects,
-     just like `Kokkos::View`
-
-```
-struct StdThread {
-  void mask( float fraction );
-  void unmask() { mask( 1.0 ); }
-};
-```
-
-
-
-### Requesting an Execution Space Instance
-
-  *  `Space::request(` *who* `,` *what* `,` *control-opt* `)`
-
-  *  *who* is an identifier for subsquent queries regarding
-    who requested each instance
-
-  *  *what* is the number of threads and how they should be placed
-
-    -  Placement within locality-topology hierarchy; e.g., HWLOC
-
-    -  Compact within a level of hierarchy, or striped across that level;
-       e.g., socket or NUMA region
-
-    -  Granularity of request is core
-
-  *  *control-opt*  optionally specifies whether the instance
-     has a new control thread
-
-    -  *control-opt* includes a control function / closure
-
-    -  The new control thread is a member of the instance
-
-    -  The control function is called by the new control thread
-       and is passed a `const` instance
-
-    -  The instance is **not** returned to the creating control thread
-
-  *  `std::thread` that is not a member of an instance is
-     *hard blocked* on a `std::mutex`
-
-    -  One global mutex or one mutex per thread?
-
-  *  `std::thread` that is a member of an instance is
-     *spinning* waiting for work, or are working
-
-```
-struct StdThread {
-
-  struct Resource ;
-
-  static StdThread request(); // default
+    -  A mask can be applied during the policy creation of a parallel algorithm
+ 
+    -  Masking is portable by defining it as the ceiling of a fraction in [0.0, 1.0]
+       of the available resources
 
-  static StdThread request( const std::string & , const Resource & );
-
-  // If the instance can be reserved then
-  // allocate a copy of ControlClosure and invoke
-  //   ControlClosure::operator()( const StdThread intance ) const
-  template< class ControlClosure >
-  static bool request( const std::string & , const Resource &
-                     , const ControlClosure & );
-};
 ```
-
-### Relinquishing an Execution Space Instance
-
-  *  De-referencing the last reference-counted instance
-     relinquishes the pool of threads
-
-  *  If a control thread was created for the instance then
-     it is relinquished when that control thread returns
-     from the control function
-
-    -  Requires the reference count to be zero, an error if not
-
-  *  No *forced* relinquish
-
-
-
-## CUDA Associated Execution Space Instances
-
-  *  Only a signle CUDA architecture
-
-  *  An instance is a device + stream
-
-  *  A stream is exclusive to an instance
-
-  *  Only a host-side control thread can dispatch work to an instance
-
-  *  Finite number of streams per device
-
-  *  ISSUE:  How to use CUDA `const` memory with multiple streams?
-
-  *  Masking can be mapped to restricting the number of CUDA blocks
-     to the fraction of available resources; e.g., maximum resident blocks
-
-
-### Requesting an Execution Space Instance
-
-  *  `Space::request(` *who* `,` *what* `)`
-
-  *  *who* is an identifier for subsquent queries regarding
-    who requested each instance
-
-  *  *what* is which device, the stream is a requested/relinquished resource
-
+class ExecutionSpace {
+public:
+  using execution_space = ExecutionSpace;
+  using memory_space = ...;
+  using device_type = Kokkos::Device<execution_space, memory_space>;
+  using array_layout = ...;
+  using size_type = ...;
+  using scratch_memory_space = ...;
+  
+  
+  class Instance
+  {
+    int thread_pool_size( int depth = 0 );
+    ...
+  };
+  
+  class InstanceRequest
+  {
+  public:
+    using Control = std::function< void( Instance * )>;
+    
+    InstanceRequest( Control control
+                   , unsigned thread_count
+                   , unsigned use_numa_count = 0
+                   , unsigned use_cores_per_numa = 0
+                   );    
+  
+  };
+  
+  static bool in_parallel();
+  
+  static bool sleep();
+  static bool wake();
+  
+  static void fence();
+  
+  static void print_configuration( std::ostream &, const bool detailed = false );
+  
+  static void initialize( unsigned thread_count = 0
+                        , unsigned use_numa_count = 0
+                        , unsigned use_cores_per_numa = 0
+                        );
+  
+  // Partition the current instance into the requested instances
+  // and run the given functions on the corresponding instances.
+  // Blocks until all the partitioned instances complete, after
+  // which the original instance is restored.
+  //
+  // Requires that the space has already been initialized
+  // Requires that the request can be satisfied by the current instance
+  //   i.e. the sum of number of requested threads must be less than the 
+  //   max_hardware_threads
+  //
+  // Each control functor will accept a handle to its new default instance
+  // Each instance must be independent of all other instances
+  //   i.e. no assumption on scheduling between instances
+  // The user is responsible for checking the return code for errors
+  static int run_instances( std::vector< InstanceRequest> const& requests );
+  
+  static void finalize();
+
+  static int is_initialized();
+  
+  static int concurrency();
+  
+  static int thread_pool_size( int depth = 0 );
+  
+  static int thread_pool_rank();
+  
+  static int max_hardware_threads();
+  
+  static int hardware_thread_id();
+                        
+ };
 
 ```
-struct Cuda {
+ 
 
-  struct Resource ;
-
-  static Cuda request();
-
-  static Cuda request( const std::string & , const Resource & );
-};
-```
 
 
diff --git a/lib/kokkos/example/md_skeleton/types.h b/lib/kokkos/example/md_skeleton/types.h
index 7f92b7cd0f8089d93c1e18e5dff3ad1508316867..c9689188a1c289c67e08dbe07707a51a0f8bff28 100644
--- a/lib/kokkos/example/md_skeleton/types.h
+++ b/lib/kokkos/example/md_skeleton/types.h
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -50,7 +50,7 @@
 
 typedef Kokkos::DefaultExecutionSpace execution_space ;
 
-#if ! defined( KOKKOS_HAVE_CUDA )
+#if ! defined( KOKKOS_ENABLE_CUDA )
   struct double2 {
     double x, y;
     KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
index 326d064105ecf2da945cf346cbaa9abbe27eab20..249d44ab559682ce2622842048b47af4613ec16f 100644
--- a/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
+++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/hello_world_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -100,7 +100,7 @@ int main (int argc, char* argv[]) {
   // order.  Parallel for loops may execute in any order.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_for (15, KOKKOS_LAMBDA (const int i) {
       // printf works in a CUDA parallel kernel; std::ostream does not.
       printf ("Hello from i = %i\n", i);
diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
index 70eea4324022b8bcfd7e1266f5c47ef08380d8c9..f7f467ad2d1dbd866ad185776cea5d45a9abce3c 100644
--- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
+++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/simple_reduce_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -69,7 +69,7 @@ int main (int argc, char* argv[]) {
   // It also handles any other syntax needed for CUDA.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_reduce (n, KOKKOS_LAMBDA (const int i, int& lsum) {
       lsum += i*i;
     }, sum);
@@ -85,7 +85,7 @@ int main (int argc, char* argv[]) {
   printf ("Sum of squares of integers from 0 to %i, "
           "computed sequentially, is %i\n", n - 1, seqSum);
   Kokkos::finalize ();
-#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   return (sum == seqSum) ? 0 : -1;
 #else
   return 0;
diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
index dd0641be54087a76d45505d0e6777a4ebe1fd9d1..3450ad1bb468095a9d821a1c8e0560b256607166 100644
--- a/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
+++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/simple_view_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -99,7 +99,7 @@ int main (int argc, char* argv[]) {
   // ask for one.
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   Kokkos::parallel_for (10, KOKKOS_LAMBDA (const int i) {
     // Acesss the View just like a Fortran array.  The layout depends
     // on the View's memory space, so don't rely on the View's
diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
index 216db7f125d16ed7150f2f2049506a723e9dcc79..9ea5e8b70711942cb61ef29f38144b52f81137e0 100644
--- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
+++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/thread_teams_lambda.cpp
@@ -1,13 +1,13 @@
 /*
 //@HEADER
 // ************************************************************************
-// 
+//
 //                        Kokkos v. 2.0
 //              Copyright (2014) Sandia Corporation
-// 
+//
 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
 // the U.S. Government retains certain rights in this software.
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
@@ -36,7 +36,7 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //
 // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
-// 
+//
 // ************************************************************************
 //@HEADER
 */
@@ -79,7 +79,7 @@ int main (int narg, char* args[]) {
   int sum = 0;
   // We also need to protect the usage of a lambda against compiling
   // with a backend which doesn't support it (i.e. Cuda 6.5/7.0).
-  #if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+  #if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
   parallel_reduce (policy, KOKKOS_LAMBDA (const team_member& thread, int& lsum) {
       lsum += 1;
       // TeamPolicy<>::member_type provides functions to query the
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
index e7bd9da36b4c1eaf60125e6c38f5e3bf7d33bf5d..e671293ff11ad8120766ed014128b25fb39089bc 100755
--- a/lib/kokkos/generate_makefile.bash
+++ b/lib/kokkos/generate_makefile.bash
@@ -5,153 +5,166 @@ MAKE_J_OPTION="32"
 
 while [[ $# > 0 ]]
 do
-key="$1"
+  key="$1"
 
-case $key in
+  case $key in
     --kokkos-path*)
-    KOKKOS_PATH="${key#*=}"
-    ;;
+      KOKKOS_PATH="${key#*=}"
+      ;;
+    --qthreads-path*)
+      QTHREADS_PATH="${key#*=}"
+      ;;
     --prefix*)
-    PREFIX="${key#*=}"
-    ;;
+      PREFIX="${key#*=}"
+      ;;
     --with-cuda)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
-    CUDA_PATH_NVCC=`which nvcc`
-    CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH_NVCC=`which nvcc`
+      CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
+      ;;
     # Catch this before '--with-cuda*'
     --with-cuda-options*)
-    KOKKOS_CUDA_OPT="${key#*=}"
-    ;;
+      KOKKOS_CUDA_OPT="${key#*=}"
+      ;;
     --with-cuda*)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
-    CUDA_PATH="${key#*=}"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+      CUDA_PATH="${key#*=}"
+      ;;
     --with-openmp)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
+      ;;
     --with-pthread)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
+      ;;
     --with-serial)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
-    ;;
-    --with-qthread*)
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthread"
-    QTHREAD_PATH="${key#*=}"
-    ;;
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
+      ;;
+    --with-qthreads*)
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},Qthreads"
+      if [ -z "$QTHREADS_PATH" ] && [[ "$key" == *=* ]]; then  # A bare --with-qthreads carries no path; don't store the option text as one.
+        QTHREADS_PATH="${key#*=}"
+      fi
+      ;;
     --with-devices*)
-    DEVICES="${key#*=}"
-    KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
-    ;;
+      DEVICES="${key#*=}"
+      KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
+      ;;
     --with-gtest*)
-    GTEST_PATH="${key#*=}"
-    ;;
+      GTEST_PATH="${key#*=}"
+      ;;
     --with-hwloc*)
-    HWLOC_PATH="${key#*=}"
-    ;;
+      HWLOC_PATH="${key#*=}"
+      ;;
     --arch*)
-    KOKKOS_ARCH="${key#*=}"
-    ;;
+      KOKKOS_ARCH="${key#*=}"
+      ;;
     --cxxflags*)
-    CXXFLAGS="${key#*=}"
-    ;;
+      CXXFLAGS="${key#*=}"
+      ;;
     --ldflags*)
-    LDFLAGS="${key#*=}"
-    ;;
+      LDFLAGS="${key#*=}"
+      ;;
     --debug|-dbg)
-    KOKKOS_DEBUG=yes
-    ;;
+      KOKKOS_DEBUG=yes
+      ;;
     --make-j*)
-    MAKE_J_OPTION="${key#*=}"
-    ;;
+      MAKE_J_OPTION="${key#*=}"
+      ;;
     --compiler*)
-    COMPILER="${key#*=}"
-    CNUM=`which ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l`
-    if [ ${CNUM} -gt 0 ]; then
-      echo "Invalid compiler by --compiler command: '${COMPILER}'"
-      exit
-    fi
-    if [[ ! -n  ${COMPILER} ]]; then
-      echo "Empty compiler specified by --compiler command."
-      exit
-    fi
-    CNUM=`which ${COMPILER} | grep ${COMPILER} | wc -l`
-    if [ ${CNUM} -eq 0 ]; then
-      echo "Invalid compiler by --compiler command: '${COMPILER}'"
-      exit
-    fi 
-    ;;
-    --with-options*)
-    KOKKOS_OPT="${key#*=}"
-    ;;
+      COMPILER="${key#*=}"
+      CNUM=`which ${COMPILER} 2>&1 >/dev/null | grep "no ${COMPILER}" | wc -l`
+      if [ ${CNUM} -gt 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit 1  # Non-zero status so calling scripts can detect the failed configure.
+      fi
+      if [[ ! -n  ${COMPILER} ]]; then
+        echo "Empty compiler specified by --compiler command."
+        exit 1
+      fi
+      CNUM=`which ${COMPILER} | grep ${COMPILER} | wc -l`
+      if [ ${CNUM} -eq 0 ]; then
+        echo "Invalid compiler by --compiler command: '${COMPILER}'"
+        exit 1
+      fi
+      ;;
+    --with-options*)  # Pattern aligned with the other case arms.
+      KOKKOS_OPT="${key#*=}"
+      ;;
     --help)
-    echo "Kokkos configure options:"
-    echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
-    echo "--prefix=/Install/Path:        Path to where the Kokkos library should be installed"
-    echo ""
-    echo "--with-cuda[=/Path/To/Cuda]:      enable Cuda and set path to Cuda Toolkit"
-    echo "--with-openmp:                    enable OpenMP backend"
-    echo "--with-pthread:                   enable Pthreads backend"
-    echo "--with-serial:                    enable Serial backend"
-    echo "--with-qthread=/Path/To/Qthread:  enable Qthread backend"
-    echo "--with-devices:                   explicitly add a set of backends"
-    echo ""
-    echo "--arch=[OPTIONS]:            set target architectures. Options are:"
-    echo "                               ARMv80         = ARMv8.0 Compatible CPU"
-    echo "                               ARMv81         = ARMv8.1 Compatible CPU"
-    echo "                               ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU"
-    echo "                               SNB            = Intel Sandy/Ivy Bridge CPUs"
-    echo "                               HSW            = Intel Haswell CPUs"
-    echo "                               BDW            = Intel Broadwell Xeon E-class CPUs"
-    echo "                               SKX            = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
-    echo "                               KNC            = Intel Knights Corner Xeon Phi"
-    echo "                               KNL            = Intel Knights Landing Xeon Phi"
-    echo "                               Kepler30       = NVIDIA Kepler generation CC 3.0"
-    echo "                               Kepler35       = NVIDIA Kepler generation CC 3.5"
-    echo "                               Kepler37       = NVIDIA Kepler generation CC 3.7"
-    echo "                               Pascal60       = NVIDIA Pascal generation CC 6.0"
-    echo "                               Pascal61       = NVIDIA Pascal generation CC 6.1"
-    echo "                               Maxwell50      = NVIDIA Maxwell generation CC 5.0"
-    echo "                               Power8         = IBM POWER8 CPUs"
-    echo "                               Power9         = IBM POWER9 CPUs"
-    echo ""
-    echo "--compiler=/Path/To/Compiler set the compiler"
-    echo "--debug,-dbg:                enable Debugging"
-    echo "--cxxflags=[FLAGS]           overwrite CXXFLAGS for library build and test build"
-    echo "                               This will still set certain required flags via"
-    echo "                               KOKKOS_CXXFLAGS (such as -fopenmp, --std=c++11, etc.)"
-    echo "--ldflags=[FLAGS]            overwrite LDFLAGS for library build and test build"
-    echo "                               This will still set certain required flags via"
-    echo "                               KOKKOS_LDFLAGS (such as -fopenmp, -lpthread, etc.)"
-    echo "--with-gtest=/Path/To/Gtest: set path to gtest (used in unit and performance tests"
-    echo "--with-hwloc=/Path/To/Hwloc: set path to hwloc"
-    echo "--with-options=[OPTIONS]:    additional options to Kokkos:"
-    echo "                               aggressive_vectorization = add ivdep on loops"
-    echo "--with-cuda-options=[OPT]:   additional options to CUDA:"
-    echo "                               force_uvm, use_ldg, enable_lambda, rdc"
-    echo "--make-j=[NUM]:              set -j flag used during build."
-    exit 0
-    ;;
+      echo "Kokkos configure options:"
+      echo "--kokkos-path=/Path/To/Kokkos:        Path to the Kokkos root directory."
+      echo "--qthreads-path=/Path/To/Qthreads:    Path to Qthreads install directory."
+      echo "                                        Overrides path given by --with-qthreads."
+      echo "--prefix=/Install/Path:               Path to install the Kokkos library."
+      echo ""
+      echo "--with-cuda[=/Path/To/Cuda]:          Enable Cuda and set path to Cuda Toolkit."
+      echo "--with-openmp:                        Enable OpenMP backend."
+      echo "--with-pthread:                       Enable Pthreads backend."
+      echo "--with-serial:                        Enable Serial backend."
+      echo "--with-qthreads[=/Path/To/Qthreads]:  Enable Qthreads backend."
+      echo "--with-devices:                       Explicitly add a set of backends."
+      echo ""
+      echo "--arch=[OPT]:  Set target architectures. Options are:"
+      echo "                 ARMv80         = ARMv8.0 Compatible CPU"
+      echo "                 ARMv81         = ARMv8.1 Compatible CPU"
+      echo "                 ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU"
+      echo "                 SNB            = Intel Sandy/Ivy Bridge CPUs"
+      echo "                 HSW            = Intel Haswell CPUs"
+      echo "                 BDW            = Intel Broadwell Xeon E-class CPUs"
+      echo "                 SKX            = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)"
+      echo "                 KNC            = Intel Knights Corner Xeon Phi"
+      echo "                 KNL            = Intel Knights Landing Xeon Phi"
+      echo "                 Kepler30       = NVIDIA Kepler generation CC 3.0"
+      echo "                 Kepler35       = NVIDIA Kepler generation CC 3.5"
+      echo "                 Kepler37       = NVIDIA Kepler generation CC 3.7"
+      echo "                 Pascal60       = NVIDIA Pascal generation CC 6.0"
+      echo "                 Pascal61       = NVIDIA Pascal generation CC 6.1"
+      echo "                 Maxwell50      = NVIDIA Maxwell generation CC 5.0"
+      echo "                 Power8         = IBM POWER8 CPUs"
+      echo "                 Power9         = IBM POWER9 CPUs"
+      echo ""
+      echo "--compiler=/Path/To/Compiler  Set the compiler."
+      echo "--debug,-dbg:                 Enable Debugging."
+      echo "--cxxflags=[FLAGS]            Overwrite CXXFLAGS for library build and test"
+      echo "                                build.  This will still set certain required"
+      echo "                                flags via KOKKOS_CXXFLAGS (such as -fopenmp,"
+      echo "                                --std=c++11, etc.)."
+      echo "--ldflags=[FLAGS]             Overwrite LDFLAGS for library build and test"
+      echo "                                build. This will still set certain required"
+      echo "                                flags via KOKKOS_LDFLAGS (such as -fopenmp,"
+      echo "                                -lpthread, etc.)."
+      echo "--with-gtest=/Path/To/Gtest:  Set path to gtest.  (Used in unit and performance"
+      echo "                                tests.)"
+      echo "--with-hwloc=/Path/To/Hwloc:  Set path to hwloc."
+      echo "--with-options=[OPT]:         Additional options to Kokkos:"
+      echo "                                aggressive_vectorization = add ivdep on loops"
+      echo "--with-cuda-options=[OPT]:    Additional options to CUDA:"
+      echo "                                force_uvm, use_ldg, enable_lambda, rdc"
+      echo "--make-j=[NUM]:               Set -j flag used during build."
+      exit 0
+      ;;
     *)
-    echo "warning: ignoring unknown option $key"
-    ;;
-esac
-shift
+      echo "warning: ignoring unknown option $key"
+      ;;
+  esac
+
+  shift
 done
 
-# If KOKKOS_PATH undefined, assume parent dir of this
-# script is the KOKKOS_PATH
+# Remove leading ',' from KOKKOS_DEVICES.
+KOKKOS_DEVICES=$(echo $KOKKOS_DEVICES | sed 's/^,//')
+
+# If KOKKOS_PATH undefined, assume parent dir of this script is the KOKKOS_PATH.
 if [ -z "$KOKKOS_PATH" ]; then
-    KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+  KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
 else
-    # Ensure KOKKOS_PATH is abs path
-    KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+  # Ensure KOKKOS_PATH is abs path
+  KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
 fi
 
 if [ "${KOKKOS_PATH}"  = "${PWD}" ] || [ "${KOKKOS_PATH}"  = "${PWD}/" ]; then
-echo "Running generate_makefile.sh in the Kokkos root directory is not allowed"
-exit 
+  echo "Running generate_makefile.bash in the Kokkos root directory is not allowed"
+  exit 1  # Report the refusal as a failure instead of echo's zero status.
 fi
 
 KOKKOS_SRC_PATH=${KOKKOS_PATH}
@@ -160,52 +173,63 @@ KOKKOS_SETTINGS="KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH}"
 #KOKKOS_SETTINGS="KOKKOS_PATH=${KOKKOS_PATH}"
 
 if [ ${#COMPILER} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXX=${COMPILER}"
 fi
+
 if [ ${#KOKKOS_DEVICES} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
 fi
+
 if [ ${#KOKKOS_ARCH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_ARCH=${KOKKOS_ARCH}"
 fi
+
 if [ ${#KOKKOS_DEBUG} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
 fi
+
 if [ ${#CUDA_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CUDA_PATH=${CUDA_PATH}"
 fi
+
 if [ ${#CXXFLAGS} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} CXXFLAGS=\"${CXXFLAGS}\""
 fi
+
 if [ ${#LDFLAGS} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
 fi
+
 if [ ${#GTEST_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
 else
-GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
+  GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
 fi
+
 if [ ${#HWLOC_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
 fi
-if [ ${#QTHREAD_PATH} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} QTHREAD_PATH=${QTHREAD_PATH}"
+
+if [ ${#QTHREADS_PATH} -gt 0 ]; then
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} QTHREADS_PATH=${QTHREADS_PATH}"
 fi
+
 if [ ${#KOKKOS_OPT} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_OPTIONS=${KOKKOS_OPT}"
 fi
+
 if [ ${#KOKKOS_CUDA_OPT} -gt 0 ]; then
-KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} KOKKOS_CUDA_OPTIONS=${KOKKOS_CUDA_OPT}"
 fi
 
 KOKKOS_SETTINGS_NO_KOKKOS_PATH="${KOKKOS_SETTINGS}"
 
 KOKKOS_TEST_INSTALL_PATH="${PWD}/install"
 if [ ${#PREFIX} -gt 0 ]; then
-KOKKOS_INSTALL_PATH="${PREFIX}"
+  KOKKOS_INSTALL_PATH="${PREFIX}"
 else
-KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH}
+  KOKKOS_INSTALL_PATH=${KOKKOS_TEST_INSTALL_PATH}
 fi
 
 
@@ -229,7 +253,7 @@ mkdir example/fenl
 mkdir example/tutorial
 
 if [ ${#KOKKOS_ENABLE_EXAMPLE_ICHOL} -gt 0 ]; then
-mkdir example/ichol
+  mkdir example/ichol
 fi
 
 KOKKOS_SETTINGS="${KOKKOS_SETTINGS_NO_KOKKOS_PATH} KOKKOS_PATH=${KOKKOS_PATH}"
diff --git a/lib/linalg/Install.py b/lib/linalg/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7076ca52f28dd63d09dae244f8491110d4a7891
--- /dev/null
+++ b/lib/linalg/Install.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+
+# Install.py tool to build the linear algebra library
+# used to automate the steps described in the README file in this dir
+
+import sys,commands,os
+
+# help message
+
+help = """
+Syntax: python Install.py -m machine
+  -m = peform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+"""
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# make the library
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.%s clean; make -f Makefile.%s" % (machine,machine)
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
diff --git a/lib/linalg/README b/lib/linalg/README
index 20f3ff094d1c9a0f5a27a039d1a6b524427e8af9..725df86c4cc363b32436287efeb182390d39e683 100644
--- a/lib/linalg/README
+++ b/lib/linalg/README
@@ -3,11 +3,16 @@ USER-AWPMD packages, and possibly by other packages in the future.
 
 Note that this is an *incomplete* subset of full BLAS/LAPACK.
 
-You should only need to build and use the resulting library in this
-directory if you want to build LAMMPS with the USER-ATC and/or
-USER-AWPMD packages AND you do not have any other suitable BLAS and
-LAPACK libraries installed on your system.  E.g. ATLAS, GOTO-BLAS,
-OpenBLAS, ACML, or MKL.
+You should only need to build and use the library in this directory if
+you want to build LAMMPS with the USER-ATC and/or USER-AWPMD packages
+AND you do not have any other suitable BLAS and LAPACK libraries
+installed on your system.  E.g. ATLAS, GOTO-BLAS, OpenBLAS, ACML, or
+MKL.
+
+You can type "make lib-linalg" from the src directory to see help on
+how to build this library via make commands, or you can do the same
+thing by typing "python Install.py" from within this directory, or you
+can do it manually by following the instructions below.
 
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
@@ -20,4 +25,5 @@ directory:
 liblinalg.a		the library LAMMPS will link against
 
 You can then include this library and its path in the Makefile.lammps
-file of any packages that need it, e.g. in lib/atc/Makefile.lammps.
+file of any packages that need it.  As an example, see the
+lib/atc/Makefile.lammps.linalg file.
diff --git a/lib/meam/Install.py b/lib/meam/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/meam/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# install.py tool to do a generic build of a library
+# soft linked to by many of the lib/Install.py files
+# used to automate the steps described in the corresponding lib/README
+
+import sys,commands,os
+
+# help message
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = peform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/meam/README b/lib/meam/README
index 436259ee8121238041fb956f981e20feb67fd47f..b3111c131745e4275fc819fd014f3ce40eb194fd 100644
--- a/lib/meam/README
+++ b/lib/meam/README
@@ -15,6 +15,11 @@ links against when using the MEAM package.
 This library must be built with a F90 compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-meam" from the src directory to see help on how
+to build this library via make commands, or you can do the same thing
+by typing "python Install.py" from within this directory, or you can
+do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
diff --git a/lib/molfile/Makefile.lammps b/lib/molfile/Makefile.lammps
index 08118991a0cfb0e0f130bbd8cebf06bcf5f64026..a181f48aec088c89a8c8118db76ff9452dacadb7 100644
--- a/lib/molfile/Makefile.lammps
+++ b/lib/molfile/Makefile.lammps
@@ -6,6 +6,9 @@
 # When you build LAMMPS with the USER-MOLFILE package installed, it will
 # use the 3 settings in this file.  They should be set as follows.
 # 
+# The molfile_SYSINC setting points to the folder with the VMD plugin
+# headers.  By default it points to the bundled headers in this folder.
+# 
 # The molfile_SYSLIB setting is for a system dynamic loading library
 # that will be used to load the molfile plugins.  It contains functions
 # like dlopen(), dlsym() and so on for dynamic linking of executable
@@ -24,7 +27,10 @@
 
 # Settings that the LAMMPS build will import when this package is installed
 
-molfile_SYSINC =
+# change this to -I/path/to/your/lib/vmd/plugins/include if the bundled
+# header files are incompatible with your VMD plugins
+molfile_SYSINC =-I../../lib/molfile
+#
 ifneq ($(LIBOBJDIR),/Obj_mingw32)
 ifneq ($(LIBOBJDIR),/Obj_mingw64)
 ifneq ($(LIBOBJDIR),/Obj_mingw32-mpi)
diff --git a/lib/molfile/README b/lib/molfile/README
index 09ea3cc5c6fbc2f0849413e4df5abdd45b36a1ea..9e8260c202f2a426050eb0496f84770e6de30cb1 100644
--- a/lib/molfile/README
+++ b/lib/molfile/README
@@ -6,17 +6,30 @@ and write_dump commands in a LAMMPS input script.
 More information about the VMD molfile plugins can be found at
 http://www.ks.uiuc.edu/Research/vmd/plugins/molfile.
 
-More specifically, to be able to dynamically load and execute the
-plugins from inside LAMMPS, you need to link with a system library
-containing functions like dlopen(), dlsym() and so on for dynamic
-linking of executable code into an executable.  This library is
-defined by setting the molfile_SYSLIB variable in the Makefile.lammps
-file in this dir.
+NOTE: while the programming interface (API) of the VMD molfile plugins
+is backward compatible (i.e. you can expect to be able to compile this
+package for plugins from newer VMD packages), the binary interface
+(ABI) is not.  So it is necessary to compile this package with the
+VMD molfile plugin header files (vmdplugin.h and molfile_plugin.h) 
+matching the VMD installation that the (binary) plugin files are taken from.
+These header files can be found inside the VMD installation tree under
+"plugins/include". For convenience, this package includes a set of
+header files that is compatible with VMD 1.9.3 (the current version
+in April 2017). You need to adjust the molfile_SYSINC variable in the
+Makefile.lammps file in this directory, in case you want to use VMD
+molfile plugins from a different version. The interface is compatible
+with plugins starting from VMD version 1.8.4.
+
+In order to be able to dynamically load and execute the plugins from
+inside LAMMPS, you need to link with a system library containing functions
+like dlopen(), dlsym() and so on for dynamic linking of executable code
+into an executable.  This library is defined by setting the molfile_SYSLIB
+variable in the Makefile.lammps file in this dir.
 
 For Linux and most current unix-like operating systems, this can be
 kept at the default setting of "-ldl" (on some platforms this library
 is called "-ldld").  For compilation on Windows, a slightly different
 mechanism is used that is part of the Windows programming environment
-and this library is not needed.
+and this kind of library is not needed.
 
 See the header of Makefile.lammps for more info.
diff --git a/src/USER-MOLFILE/molfile_plugin.h b/lib/molfile/molfile_plugin.h
similarity index 92%
rename from src/USER-MOLFILE/molfile_plugin.h
rename to lib/molfile/molfile_plugin.h
index 7a2d7ca42e6f35293e8d609a2c1c469b02c145cb..c79e7a5abf43516ea271d65a21a205f2819a2ca8 100644
--- a/src/USER-MOLFILE/molfile_plugin.h
+++ b/lib/molfile/molfile_plugin.h
@@ -11,7 +11,7 @@
  *
  *      $RCSfile: molfile_plugin.h,v $
  *      $Author: johns $       $Locker:  $             $State: Exp $
- *      $Revision: 1.103 $       $Date: 2011/03/05 03:56:11 $
+ *      $Revision: 1.108 $       $Date: 2016/02/26 03:17:01 $
  *
  ***************************************************************************/
 
@@ -60,6 +60,21 @@ typedef ssize_t molfile_ssize_t;      /**< for frame counts */
 #define MOLFILE_MAXWAVEPERTS     25   /**< maximum number of wavefunctions
                                        *   per timestep */
 
+/**
+ * Hard-coded direct-I/O page size constants for use by both VMD
+ * and the plugins that want to use direct, unbuffered I/O for high
+ * performance with SSDs etc.  We use two constants to define the
+ * range of hardware page sizes that we can support, so that we can
+ * add support for larger 8KB or 16KB page sizes in the future
+ * as they become more prevalent in high-end storage systems.
+ *
+ * At present, VMD uses a hard-coded 4KB page size to reduce memory
+ * fragmentation, but these constants will make it easier to enable the
+ * use of larger page sizes in the future if it becomes necessary.
+ */
+#define MOLFILE_DIRECTIO_MIN_BLOCK_SIZE 4096
+#define MOLFILE_DIRECTIO_MAX_BLOCK_SIZE 4096
+
 
 /**
  * File level comments, origin information, and annotations.
@@ -96,8 +111,17 @@ typedef struct {
   char resname[8];    /**< required residue name string          */
   int resid;          /**< required integer residue ID           */
   char segid[8];      /**< required segment name string, or ""   */
+#if 0 && vmdplugin_ABIVERSION > 17
+  /* The new PDB file formats allows for much larger structures, */
+  /* which can therefore require longer chain ID strings.  The   */
+  /* new PDBx/mmCIF file formats do not have length limits on    */
+  /* fields, so PDB chains could be arbitrarily long strings     */
+  /* in such files.  At present, we know we need at least 3-char */
+  /* chains for existing PDBx/mmCIF files.                       */
+  char chain[4];      /**< required chain name, or ""            */
+#else
   char chain[2];      /**< required chain name, or ""            */
-
+#endif
   /* rest are optional; use optflags to specify what's present   */
   char altloc[2];     /**< optional PDB alternate location code  */
   char insertion[2];  /**< optional PDB insertion code           */
@@ -107,6 +131,23 @@ typedef struct {
   float charge;       /**< optional charge value                 */
   float radius;       /**< optional radius value                 */
   int atomicnumber;   /**< optional element atomic number        */
+
+#if 0
+  char complex[16];
+  char assembly[16];
+  int qmregion;
+  int qmregionlink;
+  int qmlayer;
+  int qmlayerlink;
+  int qmfrag;
+  int qmfraglink;
+  string qmecp;
+  int qmadapt;
+  int qmect;          /**< boolean */
+  int qmparam;
+  int autoparam;
+#endif
+
 #if defined(DESRES_CTNUMBER)
   int ctnumber;       /**< mae ct block, 0-based, including meta */
 #endif
@@ -140,23 +181,19 @@ typedef struct {
 #define MOLFILE_QMTS_SCFITER       0x0002
 /*@}*/
 
-#if vmdplugin_ABIVERSION > 10
 typedef struct molfile_timestep_metadata {
   unsigned int count;                  /**< total # timesteps; -1 if unknown */
   unsigned int avg_bytes_per_timestep; /** bytes per timestep                */
   int has_velocities;                  /**< if timesteps have velocities     */
 } molfile_timestep_metadata_t;
-#endif
 
 /*
  * Per-timestep atom coordinates and periodic cell information
  */
 typedef struct {
   float *coords;        /**< coordinates of all atoms, arranged xyzxyzxyz   */
-#if vmdplugin_ABIVERSION > 10
   float *velocities;    /**< space for velocities of all atoms; same layout */
                         /**< NULL unless has_velocities is set              */
-#endif
 
   /*@{*/
   /**
@@ -169,9 +206,7 @@ typedef struct {
   float A, B, C, alpha, beta, gamma;
   /*@}*/
 
-#if vmdplugin_ABIVERSION > 10
   double physical_time; /**< physical time point associated with this frame */
-#endif
 
 #if defined(DESRES_READ_TIMESTEP2)
   /* HACK to support generic trajectory information */
@@ -213,14 +248,33 @@ typedef struct {
    * physical size of the box, this is the number of voxels in each
    * direction, independent of the shape of the volume set.
    */
-  int xsize;            /**< number of grid cells along the X axis          */
-  int ysize;            /**< number of grid cells along the Y axis          */
-  int zsize;            /**< number of grid cells along the Z axis          */
-
-  int has_color;        /**< flag indicating presence of voxel color data   */
+  int xsize;            /**< number of grid cells along the X axis           */
+  int ysize;            /**< number of grid cells along the Y axis           */
+  int zsize;            /**< number of grid cells along the Z axis           */
+
+#if vmdplugin_ABIVERSION > 16
+  int has_scalar;       /**< flag indicating presence of scalar volume       */
+  int has_gradient;     /**< flag indicating presence of vector volume       */
+  int has_variance;     /**< flag indicating presence of variance map        */
+#endif
+  int has_color;        /**< flag indicating presence of voxel color data    */
 } molfile_volumetric_t;
 
 
+#if vmdplugin_ABIVERSION > 16
+/**
+ * Volumetric dataset read/write structure with both flag/parameter sets
+ * and VMD-allocated pointers for fields to be used by the plugin.
+ */
+typedef struct {
+  int setidx;           /**< volumetric dataset index to load/save */
+  float *scalar;        /**< scalar density/potential field data   */
+  float *gradient;      /**< gradient vector field                 */
+  float *variance;      /**< variance map indicating signal/noise  */
+  float *rgb3f;         /**< RGB floating point color texture map  */
+  unsigned char *rgb3u; /**< RGB unsigned byte color texture map   */
+} molfile_volumetric_readwrite_t;
+#endif
 
 
 /**************************************************************
@@ -231,9 +285,6 @@ typedef struct {
  **************************************************************
  **************************************************************/
 
-#if vmdplugin_ABIVERSION > 9
-
-
 /* macros for the convergence status of a QM calculation. */
 #define MOLFILE_QMSTATUS_UNKNOWN       -1 /* don't know yet */
 #define MOLFILE_QMSTATUS_OPT_CONV       0 /* optimization converged */
@@ -485,8 +536,6 @@ typedef struct {
 } molfile_qm_timestep_t;
 
 
-#endif
-
 /**************************************************************
  **************************************************************/
 
@@ -609,12 +658,8 @@ typedef struct {
    * This function can be called only after read_structure().
    * Return MOLFILE_SUCCESS if no errors occur.
    */
-#if vmdplugin_ABIVERSION > 14
   int (*read_bonds)(void *, int *nbonds, int **from, int **to, float **bondorder,
                     int **bondtype, int *nbondtypes, char ***bondtypename);
-#else
-  int (*read_bonds)(void *, int *nbonds, int **from, int **to, float **bondorder);
-#endif
 
   /**
    * XXX this function will be augmented and possibly superceded by a
@@ -684,6 +729,9 @@ typedef struct {
    */
   int (* read_volumetric_data)(void *, int set, float *datablock,
         float *colorblock);
+#if vmdplugin_ABIVERSION > 16
+  int (* read_volumetric_data_ex)(void *, molfile_volumetric_readwrite_t *v);
+#endif
 
   /**
    * Read raw graphics data stored in this file.   Return the number of data
@@ -723,14 +771,9 @@ typedef struct {
    * bondtypenames can only be used of bondtypes is also given.
    * Return MOLFILE_SUCCESS if no errors occur.
    */
-#if vmdplugin_ABIVERSION > 14
   int (* write_bonds)(void *, int nbonds, int *from, int *to, float *bondorder,
                      int *bondtype, int nbondtypes, char **bondtypename);
-#else
-  int (* write_bonds)(void *, int nbonds, int *from, int *to, float *bondorder);
-#endif
 
-#if vmdplugin_ABIVERSION > 9
   /**
    * Write the specified volumetric data set into the space pointed to by
    * datablock.  The * allocated for the datablock must be equal to
@@ -740,8 +783,11 @@ typedef struct {
    */
   int (* write_volumetric_data)(void *, molfile_volumetric_t *metadata,
                                 float *datablock, float *colorblock);
+#if vmdplugin_ABIVERSION > 16
+  int (* write_volumetric_data_ex)(void *, molfile_volumetric_t *metadata,
+                                   molfile_volumetric_readwrite_t *v);
+#endif
 
-#if vmdplugin_ABIVERSION > 15
   /**
    * Read in Angles, Dihedrals, Impropers, and Cross Terms and optionally types.
    * (Cross terms pertain to the CHARMM/NAMD CMAP feature)
@@ -764,33 +810,6 @@ typedef struct {
                        const int *impropers, const int *impropertypes, int numimpropertypes,
                        const char **impropertypenames, int numcterms,  const int *cterms,
                        int ctermcols, int ctermrows);
-#else
-  /**
-   * Read in Angles, Dihedrals, Impropers, and Cross Terms
-   * Forces are in Kcal/mol
-   * (Cross terms pertain to the CHARMM/NAMD CMAP feature, forces are given
-   *  as a 2-D matrix)
-   */
-  int (* read_angles)(void *,
-                int *numangles,    int **angles,    double **angleforces,
-                int *numdihedrals, int **dihedrals, double **dihedralforces,
-                int *numimpropers, int **impropers, double **improperforces,
-                int *numcterms,    int **cterms,
-                int *ctermcols,    int *ctermrows,  double **ctermforces);
-
-  /**
-   * Write out Angles, Dihedrals, Impropers, and Cross Terms
-   * Forces are in Kcal/mol
-   * (Cross terms pertain to the CHARMM/NAMD CMAP feature, forces are given
-   *  as a 2-D matrix)
-   */
-  int (* write_angles)(void *,
-        int numangles,    const int *angles,    const double *angleforces,
-        int numdihedrals, const int *dihedrals, const double *dihedralforces,
-        int numimpropers, const int *impropers, const double *improperforces,
-        int numcterms,   const int *cterms,
-        int ctermcols, int ctermrows, const double *ctermforces);
-#endif
 
 
   /**
@@ -839,14 +858,9 @@ typedef struct {
    */
   int (* read_timestep)(void *, int natoms, molfile_timestep_t *,
                         molfile_qm_metadata_t *, molfile_qm_timestep_t *);
-#endif
 
-#if vmdplugin_ABIVERSION > 10
   int (* read_timestep_metadata)(void *, molfile_timestep_metadata_t *);
-#endif
-#if vmdplugin_ABIVERSION > 11
   int (* read_qm_timestep_metadata)(void *, molfile_qm_timestep_metadata_t *);
-#endif
 
 #if defined(DESRES_READ_TIMESTEP2)
   /**
@@ -864,7 +878,6 @@ typedef struct {
                                   double * times );
 #endif
 
-#if vmdplugin_ABIVERSION > 13
   /**
    *  Console output, READ-ONLY function pointer.
    *  Function pointer that plugins can use for printing to the host
@@ -883,8 +896,8 @@ typedef struct {
    *      application-provided services
    */
   int (* cons_fputs)(const int, const char*);
-#endif
 
 } molfile_plugin_t;
 
 #endif
+
diff --git a/src/USER-MOLFILE/vmdplugin.h b/lib/molfile/vmdplugin.h
similarity index 98%
rename from src/USER-MOLFILE/vmdplugin.h
rename to lib/molfile/vmdplugin.h
index 37299408fefe51a4ce9b80a613562349c402cd89..842d1e431c67238c5849ff5026a2f3c28ec5dd97 100644
--- a/src/USER-MOLFILE/vmdplugin.h
+++ b/lib/molfile/vmdplugin.h
@@ -11,7 +11,7 @@
  *
  *      $RCSfile: vmdplugin.h,v $
  *      $Author: johns $       $Locker:  $             $State: Exp $
- *      $Revision: 1.32 $       $Date: 2009/02/24 05:12:35 $
+ *      $Revision: 1.33 $       $Date: 2015/10/29 05:10:54 $
  *
  ***************************************************************************/
 
@@ -144,7 +144,7 @@ typedef struct {
 /**
  * Use this macro to initialize the abiversion member of each plugin
  */
-#define vmdplugin_ABIVERSION  16
+#define vmdplugin_ABIVERSION  17
 
 /*@{*/
 /** Use this macro to indicate a plugin's thread-safety at registration time */
diff --git a/lib/mscg/Install.py b/lib/mscg/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..e54723261452d1af2ddf1c837985089fd71aa2cd
--- /dev/null
+++ b/lib/mscg/Install.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+
+# Install.py tool to download, unpack, build, and link to the MS-CG library
+# used to automate the steps described in the README file in this dir
+
+import sys,os,re,commands
+
+# help message
+
+help = """
+Syntax: python Install.py -h hpath hdir -g -b [suffix] -l
+  specify one or more options, order does not matter
+  -h = set home dir of MS-CG to be hpath/hdir
+       hpath can be full path, contain '~' or '.' chars
+       default hpath = . = lib/mscg
+       default hdir = MSCG-release-master = what GitHub zipfile unpacks to
+  -g = grab (download) zipfile from MS-CG GitHub website
+       unpack it to hpath/hdir
+       hpath must already exist
+       if hdir already exists, it will be deleted before unpack
+  -b = build MS-CG library in its src dir
+       optional suffix specifies which src/Make/Makefile.suffix to use
+       default suffix = g++_simple
+  -l = create 2 softlinks (includelink,liblink) in lib/mscg to MS-CG src dir
+"""
+
+# settings
+
+url = "https://github.com/uchicago-voth/MSCG-release/archive/master.zip"
+zipfile = "MS-CG-master.zip"
+zipdir = "MSCG-release-master"
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# expand to full path name
+# process leading '~' or relative path
+  
+def fullpath(path):
+  return os.path.abspath(os.path.expanduser(path))
+  
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+homepath = "."
+homedir = zipdir
+
+grabflag = 0
+buildflag = 0
+msuffix = "g++_simple"
+linkflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-h":
+    if iarg+3 > nargs: error()
+    homepath = args[iarg+1]
+    homedir = args[iarg+2]
+    iarg += 3
+  elif args[iarg] == "-g":
+    grabflag = 1
+    iarg += 1
+  elif args[iarg] == "-b":
+    buildflag = 1
+    if iarg+1 < nargs and args[iarg+1][0] != '-':
+      msuffix = args[iarg+1]
+      iarg += 1
+    iarg += 1
+  elif args[iarg] == "-l":
+    linkflag = 1
+    iarg += 1
+  else: error()
+
+homepath = fullpath(homepath)
+if not os.path.isdir(homepath): error("MS-CG path does not exist")
+homedir = "%s/%s" % (homepath,homedir)
+
+# download and unpack MS-CG zipfile
+
+if grabflag:
+  print "Downloading MS-CG ..."
+  cmd = "curl -L %s > %s/%s" % (url,homepath,zipfile)
+  print cmd
+  print commands.getoutput(cmd)
+
+  print "Unpacking MS-CG zipfile ..."
+  if os.path.exists("%s/%s" % (homepath,zipdir)):
+    commands.getoutput("rm -rf %s/%s" % (homepath,zipdir))
+  cmd = "cd %s; unzip %s" % (homepath,zipfile)
+  commands.getoutput(cmd)
+  if os.path.basename(homedir) != zipdir:
+    if os.path.exists(homedir): commands.getoutput("rm -rf %s" % homedir)
+    os.rename("%s/%s" % (homepath,zipdir),homedir)
+
+# build MS-CG
+
+if buildflag:
+  print "Building MS-CG ..."
+  cmd = "cd %s/src; cp Make/Makefile.%s .; make -f Makefile.%s" % \
+      (homedir,msuffix,msuffix)
+  txt = commands.getoutput(cmd)
+  print txt
+
+# create 2 links in lib/mscg to MS-CG src dir
+
+if linkflag:
+  print "Creating links to MS-CG include and lib files"
+  if os.path.isfile("includelink") or os.path.islink("includelink"):
+    os.remove("includelink")
+  if os.path.isfile("liblink") or os.path.islink("liblink"):
+    os.remove("liblink")
+  cmd = "ln -s %s/src includelink" % homedir
+  commands.getoutput(cmd)
+  cmd = "ln -s %s/src liblink" % homedir
+  commands.getoutput(cmd)
diff --git a/lib/mscg/Makefile.lammps b/lib/mscg/Makefile.lammps
index 0aa55b087dfc2ccd0fd22de84a82ee422f2b9178..f0d9a9b8a071b887115b46793514017fd11edf6f 100644
--- a/lib/mscg/Makefile.lammps
+++ b/lib/mscg/Makefile.lammps
@@ -1,5 +1,5 @@
 # Settings that the LAMMPS build will import when this package library is used
 
-mscg_SYSINC = 
-mscg_SYSLIB = -lm -lgsl -llapack -lcblas
+mscg_SYSINC = -std=c++11
+mscg_SYSLIB = -lm -lgsl -llapack -lgslcblas
 mscg_SYSPATH = 
diff --git a/lib/mscg/README b/lib/mscg/README
index cc4fc9a667bf1244d1975a4a2520fadbf7a388b3..b73c8563cd1bc4d76051fcff05acb6c9d4fbaee2 100755
--- a/lib/mscg/README
+++ b/lib/mscg/README
@@ -6,6 +6,15 @@ The MS-CG library is available at
 https://github.com/uchicago-voth/MSCG-release and was developed by
 Jacob Wagner in Greg Voth's group at the University of Chicago.
 
+This library requires a compiler with C++11 support (e.g., g++ v4.9+),
+LAPACK, and the GNU scientific library (GSL v 2.1+).
+
+You can type "make lib-mscg" from the src directory to see help on how
+to download and build this library via make commands, or you can do
+the same thing by typing "python Install.py" from within this
+directory, or you can do it manually by following the instructions
+below.
+
 -----------------
 
 You must perform the following steps yourself.
@@ -14,16 +23,21 @@ You must perform the following steps yourself.
     either as a tarball or via SVN, and unpack the tarball either in
     this /lib/mscg directory or somewhere else on your system.
 
-2.  Compile MS-CG from within its home directory using your makefile choice:
+2.  Ensure that you have LAPACK and GSL (or Intel MKL) as well as a compiler
+    with support for C++11.
+    
+3.  Compile MS-CG from within its home directory using your makefile of choice:
     % make -f Makefile."name" libmscg.a
+    It is recommended that you start with Makefile.g++_simple
+    for most machines.
 
-3.  There is no need to install MS-CG if you only wish 
+4.  There is no need to install MS-CG if you only wish 
     to use it from LAMMPS.
 
-4.  Create two soft links in this dir (lib/mscg) to the MS-CG src
+5.  Create two soft links in this dir (lib/mscg) to the MS-CG src
     directory.  E.g if you built MS-CG in this dir:
-      % ln -s mscgfm-master/src includelink
-      % ln -s mscgfm-master/src liblink
+      % ln -s src includelink
+      % ln -s src liblink
     These links could instead be set to the include and lib
     directories created by a MS-CG install, e.g.
       % ln -s /usr/local/include includelink
@@ -46,8 +60,8 @@ somewhere else, you will also need to repeat steps 1,2,3.
 
 The Makefile.lammps file in this directory is there for compatibility
 with the way other libraries under the lib dir are linked with by
-LAMMPS.  MS-CG requires the GSL, LAPACK, and BLAS libraries as listed
-in Makefile.lammps.  If they are not in default locations where your
+LAMMPS.  MS-CG requires the GSL and LAPACK libraries as listed in
+Makefile.lammps.  If they are not in default locations where your
 LD_LIBRARY_PATH environment settings can find them, then you should
 add the approrpriate -L paths to the mscg_SYSPATH variable in
 Makefile.lammps.
diff --git a/lib/netcdf/README b/lib/netcdf/README
index 00db8df0012c2eec5d425db197cbb34f0c6c577e..b18ea1d276fdaf98a50bc882df31f7e006d04a6d 100644
--- a/lib/netcdf/README
+++ b/lib/netcdf/README
@@ -1,6 +1,9 @@
 The Makefile.lammps file in this directory is used when building
 LAMMPS with packages that make use of the NetCDF library or its
-parallel version.  The file has several settings needed to compile
+parallel version.  For example, the USER-NETCDF package adds the
+dump netcdf and dump netcdf/mpiio commands.
+
+The file has several settings needed to compile
 and link LAMMPS with the NetCDF and parallel NetCDF support.
 For any regular NetCDF installation, all required flags should be
 autodetected. Please note that parallel NetCDF support is
diff --git a/lib/poems/Install.py b/lib/poems/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/poems/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# install.py tool to do a generic build of a library
+# soft linked to by many of the lib/Install.py files
+# used to automate the steps described in the corresponding lib/README
+
+import sys,commands,os
+
+# help message
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = peform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/poems/README b/lib/poems/README
index 836595bdd1b06401e9d90ca735cbb1653403d2db..e0ded85e463ebb6528f79dfb7223f64900545458 100644
--- a/lib/poems/README
+++ b/lib/poems/README
@@ -40,6 +40,11 @@ links against when using the POEMA package.
 This library must be built with a C++ compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-poems" from the src directory to see help on
+how to build this library via make commands, or you can do the same
+thing by typing "python Install.py" from within this directory, or you
+can do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
diff --git a/lib/qmmm/Install.py b/lib/qmmm/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/qmmm/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# install.py tool to do a generic build of a library
+# soft linked to by many of the lib/Install.py files
+# used to automate the steps described in the corresponding lib/README
+
+import sys,commands,os
+
+# help message
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = peform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/qmmm/README b/lib/qmmm/README
index b50f25ed695f5d52305253896f3b6f4976890cc7..2746c9e86e291e1c71253d1753326e6d7bfa7c06 100644
--- a/lib/qmmm/README
+++ b/lib/qmmm/README
@@ -18,6 +18,15 @@ the only option. Adding support for a different QM code will require
 to write a new version of the top-level wrapper code, pwqmmm.c, and
 also an interface layer into the QM code similar to the one in QE.
 
+You can type "make lib-qmmm" from the src directory to see help on how
+to build this library (steps 1 and 2 below) via make commands, or you
+can do the same thing by typing "python Install.py" from within this
+directory, or you can do it manually by following the instructions
+below.
+
+However you perform steps 1 and 2, you will need to perform steps 3
+and 4 manually, as outlined below.
+
 -------------------------------------------------
 
 WARNING: This is experimental code under developement and is provided
diff --git a/lib/reax/Install.py b/lib/reax/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b426f9282819ce196b6cf030aef477e3769d66
--- /dev/null
+++ b/lib/reax/Install.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python
+
+# install.py tool to do a generic build of a library
+# soft linked to by many of the lib/Install.py files
+# used to automate the steps described in the corresponding lib/README
+
+import sys,commands,os
+
+# help message
+
+help = """
+Syntax: python Install.py -m machine -e suffix
+  specify -m and optionally -e, order does not matter
+  -m = peform a clean followed by "make -f Makefile.machine"
+       machine = suffix of a lib/Makefile.* file
+  -e = set EXTRAMAKE variable in Makefile.machine to Makefile.lammps.suffix
+       does not alter existing Makefile.machine
+"""
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+machine = None
+extraflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-m":
+    if iarg+2 > nargs: error()
+    machine = args[iarg+1]
+    iarg += 2  
+  elif args[iarg] == "-e":
+    if iarg+2 > nargs: error()
+    extraflag = 1
+    suffix = args[iarg+1]
+    iarg += 2  
+  else: error()
+
+# set lib from working dir
+
+cwd = os.getcwd()
+lib = os.path.basename(cwd)
+
+# create Makefile.auto as copy of Makefile.machine
+# reset EXTRAMAKE if requested
+  
+if not os.path.exists("Makefile.%s" % machine):
+  error("lib/%s/Makefile.%s does not exist" % (lib,machine))
+
+lines = open("Makefile.%s" % machine,'r').readlines()
+fp = open("Makefile.auto",'w')
+
+for line in lines:
+  words = line.split()
+  if len(words) == 3 and extraflag and \
+        words[0] == "EXTRAMAKE" and words[1] == '=':
+    line = line.replace(words[2],"Makefile.lammps.%s" % suffix)
+  print >>fp,line,
+
+fp.close()
+
+# make the library via Makefile.auto
+
+print "Building lib%s.a ..." % lib
+cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
+txt = commands.getoutput(cmd)
+print txt
+
+if os.path.exists("lib%s.a" % lib): print "Build was successful"
+else: error("Build of lib/%s/lib%s.a was NOT successful" % (lib,lib))
+if not os.path.exists("Makefile.lammps"):
+  print "lib/%s/Makefile.lammps was NOT created" % lib
diff --git a/lib/reax/README b/lib/reax/README
index 2840a242a5fc5850154825dda8304e8a41aa8076..f21a47061827aab3c4c26fc8b3ccfbb5133b8fe4 100644
--- a/lib/reax/README
+++ b/lib/reax/README
@@ -17,6 +17,11 @@ links against when using the REAX package.
 This library must be built with a F90 compiler, before LAMMPS is
 built, so LAMMPS can link against it.
 
+You can type "make lib-reax" from the src directory to see help on how
+to build this library via make commands, or you can do the same thing
+by typing "python Install.py" from within this directory, or you can
+do it manually by following the instructions below.
+
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system.  For example:
 
diff --git a/lib/smd/Install.py b/lib/smd/Install.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc0a3187ce546e81176d780297588a91b04dccce
--- /dev/null
+++ b/lib/smd/Install.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python
+
+# Install.py tool to download, unpack, and point to the Eigen library
+# used to automate the steps described in the README file in this dir
+
+import sys,os,re,glob,commands
+
+# help message
+
+help = """
+Syntax: python Install.py -h hpath hdir -g -l
+  specify one or more options, order does not matter
+  -h = set home dir of Eigen to be hpath/hdir
+       hpath can be full path, contain '~' or '.' chars
+       default hpath = . = lib/smd
+       default hdir = "ee" = what tarball unpacks to (eigen-eigen-*)
+  -g = grab (download) tarball from http://eigen.tuxfamily.org website
+       unpack it to hpath/hdir
+       hpath must already exist
+       if hdir already exists, it will be deleted before unpack
+  -l = create softlink (includelink) in lib/smd to Eigen src dir
+"""
+
+# settings
+
+url = "http://bitbucket.org/eigen/eigen/get/3.3.3.tar.gz"
+tarball = "eigen.tar.gz"
+
+# print error message or help
+
+def error(str=None):
+  if not str: print help
+  else: print "ERROR",str
+  sys.exit()
+
+# expand to full path name
+# process leading '~' or relative path
+  
+def fullpath(path):
+  return os.path.abspath(os.path.expanduser(path))
+  
+# parse args
+
+args = sys.argv[1:]
+nargs = len(args)
+if nargs == 0: error()
+
+homepath = "."
+homedir = "ee"
+
+grabflag = 0
+linkflag = 0
+
+iarg = 0
+while iarg < nargs:
+  if args[iarg] == "-h":
+    if iarg+3 > nargs: error()
+    homepath = args[iarg+1]
+    homedir = args[iarg+2]
+    iarg += 3
+  elif args[iarg] == "-g":
+    grabflag = 1
+    iarg += 1
+  elif args[iarg] == "-l":
+    linkflag = 1
+    iarg += 1
+  else: error()
+
+homepath = fullpath(homepath)
+if not os.path.isdir(homepath): error("Eigen path does not exist")
+
+# download and unpack Eigen tarball
+# glob to find name of dir it unpacks to
+
+if grabflag:
+  print "Downloading Eigen ..."
+  cmd = "curl -L %s > %s/%s" % (url,homepath,tarball)
+  print cmd
+  print commands.getoutput(cmd)
+
+  print "Unpacking Eigen tarball ..."
+  edir = glob.glob("%s/eigen-eigen-*" % homepath)
+  for one in edir:
+    if os.path.isdir(one): commands.getoutput("rm -rf %s" % one)
+  cmd = "cd %s; tar zxvf %s" % (homepath,tarball)
+  commands.getoutput(cmd)
+  if homedir != "ee":
+    if os.path.exists(homedir): commands.getoutput("rm -rf %s" % homedir)
+    edir = glob.glob("%s/eigen-eigen-*" % homepath)
+    os.rename(edir[0],"%s/%s" % (homepath,homedir))
+
+# create link in lib/smd to Eigen src dir
+
+if linkflag:
+  print "Creating link to Eigen files"
+  if os.path.isfile("includelink") or os.path.islink("includelink"):
+    os.remove("includelink")
+  if homedir == "ee":
+    edir = glob.glob("%s/eigen-eigen-*" % homepath)
+    linkdir = edir[0]
+  else: linkdir = "%s/%s" % (homepath,homedir)
+  cmd = "ln -s %s includelink" % linkdir
+  commands.getoutput(cmd)
diff --git a/lib/smd/README b/lib/smd/README
index 846c440dae3c5fbb978e2edfbc8c6a88512509b7..1bd5902a1f4738e1e4caf1d886b1f39d8ef55761 100644
--- a/lib/smd/README
+++ b/lib/smd/README
@@ -4,9 +4,12 @@ to use the USER-SMD package in a LAMMPS input script.
 The Eigen library is available at http://eigen.tuxfamily.org.  It's
 a general C++ template library for linear algebra.
 
-You must perform the following steps yourself, or you can use the
-install.py Python script to automate any or all steps of the process.
-Type "python install.py" for instructions.
+You can type "make lib-smd" from the src directory to see help on how
+to download and build this library via make commands, or you can do the
+same thing by typing "python Install.py" from within this directory,
+or you can do it manually by following the instructions below.
+
+Instructions:
 
 1.  Download the Eigen tarball at http://eigen.tuxfamily.org and
     unpack the tarball either in this /lib/smd directory or somewhere
diff --git a/lib/voronoi/Install.py b/lib/voronoi/Install.py
index 8ae08917e5bbde5c923617df136cd11e799ffbbc..7d847183b38fb3e0733df56f33a4ccffcf479bbc 100644
--- a/lib/voronoi/Install.py
+++ b/lib/voronoi/Install.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 
-# install.py tool to download, unpack, build, and link to the Voro++ library
+# Install.py tool to download, unpack, build, and link to the Voro++ library
 # used to automate the steps described in the README file in this dir
 
 import sys,os,re,urllib,commands
@@ -8,19 +8,20 @@ import sys,os,re,urllib,commands
 # help message
 
 help = """
-Syntax: install.py -v version -g gdir [gname] -b bdir -l ldir
+Syntax: python Install.py -v version -h hpath hdir -g -b -l
   specify one or more options, order does not matter
-  gdir,bdir,ldir can be paths relative to lib/latte, full paths, or contain ~
   -v = version of Voro++ to download and build
-       default = voro++-0.4.6 (current as of Jan 2015)
-  -g = grab (download) from math.lbl.gov/voro++ website
-       unpack tarfile in gdir to produce version dir (e.g. voro++-0.4.6)
-       if optional gname specified, rename version dir to gname within gdir
-  -b = build Voro++, bdir = Voro++ home directory
-       note that bdir must include the version suffix unless renamed
-  -l = create 2 softlinks (includelink,liblink)
-         in lib/voronoi to src dir of ldir = Voro++ home directory
-       note that ldir must include the version suffix unless renamed
+       default version = voro++-0.4.6 (current as of Jan 2015)
+  -h = set home dir of Voro++ to be hpath/hdir
+       hpath can be full path, contain '~' or '.' chars
+       default hpath = . = lib/voronoi
+       default hdir = voro++-0.4.6 = what tarball unpacks to
+  -g = grab (download) tarball from math.lbl.gov/voro++ website
+       unpack it to hpath/hdir
+       hpath must already exist
+       if hdir already exists, it will be deleted before unpack
+  -b = build Voro++ library in its src dir
+  -l = create 2 softlinks (includelink,liblink) in lib/voronoi to Voro++ src dir
 """
 
 # settings
@@ -47,6 +48,9 @@ args = sys.argv[1:]
 nargs = len(args)
 if nargs == 0: error()
 
+homepath = "."
+homedir = version
+
 grabflag = 0
 buildflag = 0
 linkflag = 0
@@ -56,49 +60,47 @@ while iarg < nargs:
   if args[iarg] == "-v":
     if iarg+2 > nargs: error()
     version = args[iarg+1]
-    iarg += 2  
+    iarg += 2
+  elif args[iarg] == "-h":
+    if iarg+3 > nargs: error()
+    homepath = args[iarg+1]
+    homedir = args[iarg+2]
+    iarg += 3
   elif args[iarg] == "-g":
-    if iarg+2 > nargs: error()
     grabflag = 1
-    grabdir = args[iarg+1]
-    grabname = None
-    if iarg+2 < nargs and args[iarg+2][0] != '-':
-      grabname = args[iarg+2]
-      iarg += 1
-    iarg += 2
+    iarg += 1
   elif args[iarg] == "-b":
-    if iarg+2 > nargs: error()
     buildflag = 1
-    builddir = args[iarg+1]
-    iarg += 2
+    iarg += 1
   elif args[iarg] == "-l":
-    if iarg+2 > nargs: error()
     linkflag = 1
-    linkdir = args[iarg+1]
-    iarg += 2
+    iarg += 1
   else: error()
 
+homepath = fullpath(homepath)
+if not os.path.isdir(homepath): error("Voro++ path does not exist")
+homedir = "%s/%s" % (homepath,homedir)
+
 # download and unpack Voro++ tarball
 
 if grabflag:
   print "Downloading Voro++ ..."
-  grabdir = fullpath(grabdir)
-  if not os.path.isdir(grabdir): error("Grab directory does not exist")
-  urllib.urlretrieve(url,"%s/%s.tar.gz" % (grabdir,version))
+  urllib.urlretrieve(url,"%s/%s.tar.gz" % (homepath,version))
   
   print "Unpacking Voro++ tarball ..."
-  tardir = "%s/%s" % (grabdir,version)
-  if os.path.exists(tardir): commands.getoutput("rm -rf %s" % tardir)
-  cmd = "cd %s; tar zxvf %s.tar.gz" % (grabdir,version)
-  txt = commands.getoutput(cmd)
-  print tardir,grabdir,grabname
-  if grabname: os.rename(tardir,"%s/%s" % (grabdir,grabname))
+  if os.path.exists("%s/%s" % (homepath,version)):
+    commands.getoutput("rm -rf %s/%s" % (homepath,version))
+  cmd = "cd %s; tar zxvf %s.tar.gz" % (homepath,version)
+  commands.getoutput(cmd)
+  if os.path.basename(homedir) != version:
+    if os.path.exists(homedir): commands.getoutput("rm -rf %s" % homedir)
+    os.rename("%s/%s" % (homepath,version),homedir)
 
 # build Voro++
 
 if buildflag:
   print "Building Voro++ ..."
-  cmd = "cd %s; make" % builddir
+  cmd = "cd %s; make" % homedir
   txt = commands.getoutput(cmd)
   print txt
 
@@ -110,7 +112,7 @@ if linkflag:
     os.remove("includelink")
   if os.path.isfile("liblink") or os.path.islink("liblink"):
     os.remove("liblink")
-  cmd = "ln -s %s/src includelink" % linkdir
+  cmd = "ln -s %s/src includelink" % homedir
   commands.getoutput(cmd)
-  cmd = "ln -s %s/src liblink" % linkdir
+  cmd = "ln -s %s/src liblink" % homedir
   commands.getoutput(cmd)
diff --git a/lib/voronoi/README b/lib/voronoi/README
index 2507a9bae40d363380ce99404e12291e7986c933..9863632be0d1ea7e3f5c4158197291a5cd43ab30 100644
--- a/lib/voronoi/README
+++ b/lib/voronoi/README
@@ -6,11 +6,15 @@ The Voro++ library is available at http://math.lbl.gov/voro++ and was
 developed by Chris H. Rycroft while at UC Berkeley / Lawrence Berkeley
 Laboratory.
 
+You can type "make lib-voronoi" from the src directory to see help on
+how to download and build this library via make commands, or you can
+do the same thing by typing "python Install.py" from within this
+directory, or you can do it manually by following the instructions
+below.
+
 -----------------
 
-You must perform the following steps yourself, or you can use the
-Install.py Python script to automate any or all steps of the process.
-Type "python Install.py" for instructions.
+Instructions:
 
 1.  Download Voro++ at http://math.lbl.gov/voro++/download
     either as a tarball or via SVN, and unpack the
diff --git a/lib/vtk/Makefile.lammps b/lib/vtk/Makefile.lammps
index e3b28ed92828aaf5002b7147363914b2435751f9..b86856a9c615e552e0b41d27fa88bd85c8e52dd6 100644
--- a/lib/vtk/Makefile.lammps
+++ b/lib/vtk/Makefile.lammps
@@ -1,13 +1,12 @@
 # Settings that the LAMMPS build will import when this package library is used
-#
+
 # settings for VTK-5.8.0 on RHEL/CentOS 6.x
  vtk_SYSINC = -I/usr/include/vtk
  vtk_SYSLIB = -lvtkCommon -lvtkIO 
  vtk_SYSPATH = -L/usr/lib64/vtk
-#
+
 # settings for VTK 6.2.0 on Fedora 23
 #vtk_SYSINC = -I/usr/include/vtk
 #vtk_SYSLIB = -lvtkCommonCore -lvtkIOCore -lvtkCommonDataModel -lvtkIOXML -lvtkIOLegacy -lvtkIOParallelXML
 #vtk_SYSPATH = -L/usr/lib64/vtk
-#
 
diff --git a/lib/vtk/README b/lib/vtk/README
index 11add94f52bf7ba00c732a9c5897eecafc7645a8..61e2a40c23d0e56c94c9c2d5767138d2387c6d43 100644
--- a/lib/vtk/README
+++ b/lib/vtk/README
@@ -1,14 +1,15 @@
-The Makefile.lammps file in this directory is used when building LAMMPS with
-its USER-VTK package installed.  The file has several settings needed to
-compile and link LAMMPS with the VTK library.  You should choose a
-Makefile.lammps.* file compatible with your system and your version of VTK, and
-copy it to Makefile.lammps before building LAMMPS itself.  You may need to edit
-one of the provided files to match your system.
+The Makefile.lammps file in this directory is used when building
+LAMMPS with its USER-VTK package installed.  The file has several
+settings needed to compile and link LAMMPS with the VTK library.  You
+should choose a Makefile.lammps.* file compatible with your system and
+your version of VTK, and copy it to Makefile.lammps before building
+LAMMPS itself.  You may need to edit one of the provided files to
+match your system.
 
-If you create a new Makefile.lammps file suitable for some version of VTK on
-some system, that is not a match to one of the provided Makefile.lammps.*
-files, you can send it to the developers, and we can include it in the
-distribution for others to use.
+If you create a new Makefile.lammps file suitable for some version of
+VTK on some system, that is not a match to one of the provided
+Makefile.lammps.* files, you can send it to the developers, and we can
+include it in the distribution for others to use.
 
 To illustrate, these are example settings from the
 Makefile.lammps.ubuntu14.04_vtk6 file:
@@ -19,10 +20,11 @@ vtk_SYSPATH =
 
 vtk_SYSINC refers to the include directory of the installed VTK library
 
-vtk_SYSLIB refers to the libraries needed to link to from an application
-(LAMMPS in this case) to "embed" VTK in the application. VTK consists of
-multiple shared libraries which are needed when using the USER-VTK package.
+vtk_SYSLIB refers to the libraries needed to link to from an
+application (LAMMPS in this case) to "embed" VTK in the
+application. VTK consists of multiple shared libraries which are
+needed when using the USER-VTK package.
 
-vtk_SYSPATH = refers to the path (e.g. -L/usr/local/lib) where the VTK library
-can be found.  You may not need this setting if the path is already included in
-your LD_LIBRARY_PATH environment variable.
+vtk_SYSPATH refers to the path (e.g. -L/usr/local/lib) where the VTK
+library can be found.  You may not need this setting if the path is
+already included in your LD_LIBRARY_PATH environment variable.
diff --git a/src/.gitignore b/src/.gitignore
index bb6f0a392e587fdb36e459cbff34ca0851757399..1327704e4731a8ac1f9ee8ecbffb171e723a2fa8 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -47,8 +47,6 @@
 /dump_molfile.h
 /molfile_interface.cpp
 /molfile_interface.h
-/molfile_plugin.h
-/vmdplugin.h
 /type_detector.h
 
 /intel_buffers.cpp
@@ -76,8 +74,8 @@
 /pair_awpmd_cut.cpp
 /pair_awpmd_cut.h
 
-/dihedral_charmmfsh.cpp
-/dihedral_charmmfsh.h
+/dihedral_charmmfsw.cpp
+/dihedral_charmmfsw.h
 /pair_lj_charmmfsw_coul_charmmfsh.cpp
 /pair_lj_charmmfsw_coul_charmmfsh.h
 /pair_lj_charmmfsw_coul_long.cpp
@@ -163,6 +161,8 @@
 /bond_nonlinear.h
 /bond_oxdna_fene.cpp
 /bond_oxdna_fene.h
+/bond_oxdna2_fene.cpp
+/bond_oxdna2_fene.h
 /bond_quartic.cpp
 /bond_quartic.h
 /bond_table.cpp
@@ -770,6 +770,8 @@
 /pair_nm_cut_coul_long.h
 /pair_oxdna_*.cpp
 /pair_oxdna_*.h
+/pair_oxdna2_*.cpp
+/pair_oxdna2_*.h
 /mf_oxdna.h
 /pair_peri_eps.cpp
 /pair_peri_eps.h
diff --git a/src/Depend.sh b/src/Depend.sh
index 5a48a7c1631eeac683fb0a4380789638d32cf2b7..520d9ae2bffd9f25ce364f80618592c0079006bc 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -109,7 +109,7 @@ if (test $1 = "RIGID") then
   depend USER-OMP
 fi
 
-if (test $1 = "USER-CG-CMM") then
+if (test $1 = "USER-CGSDK") then
   depend GPU
   depend KOKKOS
   depend USER-OMP
diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
index 0b8d0f3b31b57113346d45dc57e65fb50163c16b..77c0dc06601887f0d3451c9c3cd49480bb11e218 100644
--- a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
+++ b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp
@@ -48,7 +48,7 @@ using namespace LAMMPS_NS;
 
 // External functions from cuda library for atom decomposition
 
-int cmml_gpu_init(const int ntypes, double **cutsq, int **lj_type,
+int sdkl_gpu_init(const int ntypes, double **cutsq, int **lj_type,
                   double **host_lj1, double **host_lj2, double **host_lj3,
                   double **host_lj4, double **offset, double *special_lj,
                   const int nlocal, const int nall, const int max_nbors,
@@ -56,8 +56,8 @@ int cmml_gpu_init(const int ntypes, double **cutsq, int **lj_type,
                   FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
                   double *host_special_coul, const double qqrd2e,
                   const double g_ewald);
-void cmml_gpu_clear();
-int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall,
+void sdkl_gpu_clear();
+int ** sdkl_gpu_compute_n(const int ago, const int inum, const int nall,
                           double **host_x, int *host_type, double *sublo,
                           double *subhi, tagint *tag, int **nspecial,
                           tagint **special, const bool eflag, const bool vflag,
@@ -65,13 +65,13 @@ int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall,
                           int **ilist, int **jnum, const double cpu_time,
                           bool &success, double *host_q, double *boxlo,
                           double *prd);
-void cmml_gpu_compute(const int ago, const int inum, const int nall,
+void sdkl_gpu_compute(const int ago, const int inum, const int nall,
                       double **host_x, int *host_type, int *ilist, int *numj,
                       int **firstneigh, const bool eflag, const bool vflag,
                       const bool eatom, const bool vatom, int &host_start,
                       const double cpu_time, bool &success, double *host_q,
                       const int nlocal, double *boxlo, double *prd);
-double cmml_gpu_bytes();
+double sdkl_gpu_bytes();
 
 #include "lj_sdk_common.h"
 
@@ -95,7 +95,7 @@ PairLJSDKCoulLongGPU::PairLJSDKCoulLongGPU(LAMMPS *lmp) :
 
 PairLJSDKCoulLongGPU::~PairLJSDKCoulLongGPU()
 {
-  cmml_gpu_clear();
+  sdkl_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -112,7 +112,7 @@ void PairLJSDKCoulLongGPU::compute(int eflag, int vflag)
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
-    firstneigh = cmml_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+    firstneigh = sdkl_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
                                     atom->type, domain->sublo, domain->subhi,
                                     atom->tag, atom->nspecial, atom->special,
                                     eflag, vflag, eflag_atom, vflag_atom,
@@ -124,7 +124,7 @@ void PairLJSDKCoulLongGPU::compute(int eflag, int vflag)
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
-    cmml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+    sdkl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
                      ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
                      vflag_atom, host_start, cpu_time, success, atom->q,
                      atom->nlocal, domain->boxlo, domain->prd);
@@ -185,7 +185,7 @@ void PairLJSDKCoulLongGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  int success = cmml_gpu_init(atom->ntypes+1, cutsq, lj_type, lj1, lj2, lj3,
+  int success = sdkl_gpu_init(atom->ntypes+1, cutsq, lj_type, lj1, lj2, lj3,
                               lj4, offset, force->special_lj, atom->nlocal,
                               atom->nlocal+atom->nghost, 300, maxspecial,
                               cell_size, gpu_mode, screen, cut_ljsq,
@@ -205,7 +205,7 @@ void PairLJSDKCoulLongGPU::init_style()
 double PairLJSDKCoulLongGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
-  return bytes + cmml_gpu_bytes();
+  return bytes + sdkl_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.h b/src/GPU/pair_lj_sdk_coul_long_gpu.h
index 61de272979a525c5b34e147c29fb8abd77bfb0da..3248e949779b11cd98f2395b51efd906f9916d41 100644
--- a/src/GPU/pair_lj_sdk_coul_long_gpu.h
+++ b/src/GPU/pair_lj_sdk_coul_long_gpu.h
@@ -14,7 +14,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/coul/long/gpu,PairLJSDKCoulLongGPU)
-PairStyle(cg/cmm/coul/long/gpu,PairLJSDKCoulLongGPU)
 
 #else
 
diff --git a/src/GPU/pair_lj_sdk_gpu.cpp b/src/GPU/pair_lj_sdk_gpu.cpp
index e7e9b690f3e05fd5727f922c4cca44de660e874c..67103181d53a852325a7c5ee8411fbb037695f52 100644
--- a/src/GPU/pair_lj_sdk_gpu.cpp
+++ b/src/GPU/pair_lj_sdk_gpu.cpp
@@ -39,26 +39,26 @@ using namespace LAMMPS_NS;
 
 // External functions from cuda library for atom decomposition
 
-int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
+int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types,
                  double **host_lj1, double **host_lj2, double **host_lj3,
                  double **host_lj4, double **offset, double *special_lj,
                  const int nlocal, const int nall, const int max_nbors,
                  const int maxspecial, const double cell_size, int &gpu_mode,
                  FILE *screen);
-void cmm_gpu_clear();
-int ** cmm_gpu_compute_n(const int ago, const int inum, const int nall,
+void sdk_gpu_clear();
+int ** sdk_gpu_compute_n(const int ago, const int inum, const int nall,
                          double **host_x, int *host_type, double *sublo,
                          double *subhi, tagint *tag, int **nspecial,
                          tagint **special, const bool eflag, const bool vflag,
                          const bool eatom, const bool vatom, int &host_start,
                          int **ilist, int **jnum,
                          const double cpu_time, bool &success);
-void cmm_gpu_compute(const int ago, const int inum, const int nall,
+void sdk_gpu_compute(const int ago, const int inum, const int nall,
                      double **host_x, int *host_type, int *ilist, int *numj,
                      int **firstneigh, const bool eflag, const bool vflag,
                      const bool eatom, const bool vatom, int &host_start,
                      const double cpu_time, bool &success);
-double cmm_gpu_bytes();
+double sdk_gpu_bytes();
 
 #include "lj_sdk_common.h"
 
@@ -80,7 +80,7 @@ PairLJSDKGPU::PairLJSDKGPU(LAMMPS *lmp) : PairLJSDK(lmp), gpu_mode(GPU_FORCE)
 
 PairLJSDKGPU::~PairLJSDKGPU()
 {
-  cmm_gpu_clear();
+  sdk_gpu_clear();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -97,7 +97,7 @@ void PairLJSDKGPU::compute(int eflag, int vflag)
   int *ilist, *numneigh, **firstneigh;
   if (gpu_mode != GPU_FORCE) {
     inum = atom->nlocal;
-    firstneigh = cmm_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
+    firstneigh = sdk_gpu_compute_n(neighbor->ago, inum, nall, atom->x,
                                    atom->type, domain->sublo, domain->subhi,
                                    atom->tag, atom->nspecial, atom->special,
                                    eflag, vflag, eflag_atom, vflag_atom,
@@ -108,7 +108,7 @@ void PairLJSDKGPU::compute(int eflag, int vflag)
     ilist = list->ilist;
     numneigh = list->numneigh;
     firstneigh = list->firstneigh;
-    cmm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
+    sdk_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
                     ilist, numneigh, firstneigh, eflag, vflag, eflag_atom,
                     vflag_atom, host_start, cpu_time, success);
   }
@@ -154,7 +154,7 @@ void PairLJSDKGPU::init_style()
   int maxspecial=0;
   if (atom->molecular)
     maxspecial=atom->maxspecial;
-  int success = cmm_gpu_init(atom->ntypes+1,cutsq,lj_type,lj1,lj2,lj3,lj4,
+  int success = sdk_gpu_init(atom->ntypes+1,cutsq,lj_type,lj1,lj2,lj3,lj4,
                              offset, force->special_lj, atom->nlocal,
                              atom->nlocal+atom->nghost, 300, maxspecial,
                              cell_size, gpu_mode, screen);
@@ -172,7 +172,7 @@ void PairLJSDKGPU::init_style()
 double PairLJSDKGPU::memory_usage()
 {
   double bytes = Pair::memory_usage();
-  return bytes + cmm_gpu_bytes();
+  return bytes + sdk_gpu_bytes();
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/GPU/pair_lj_sdk_gpu.h b/src/GPU/pair_lj_sdk_gpu.h
index 610fb8b0e473d6c6799340e5b69464a0dc6bebf2..3865b3404692ffa185c7bf0889e75ca5925ad831 100644
--- a/src/GPU/pair_lj_sdk_gpu.h
+++ b/src/GPU/pair_lj_sdk_gpu.h
@@ -14,7 +14,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/gpu,PairLJSDKGPU)
-PairStyle(cg/cmm/gpu,PairLJSDKGPU)
 
 #else
 
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index bbebc36c12aad95e19ca28282b0b8122fafb8853..790b9224c2ba3670622cfc03f2e332b660b7d5c4 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -183,8 +183,8 @@ action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp
 action pair_lj_sdk_kokkos.h pair_lj_sdk.h
 action pair_morse_kokkos.cpp
 action pair_morse_kokkos.h
-action pair_reax_c_kokkos.cpp pair_reax_c.cpp
-action pair_reax_c_kokkos.h pair_reax_c.h
+action pair_reaxc_kokkos.cpp pair_reaxc.cpp
+action pair_reaxc_kokkos.h pair_reaxc.h
 action pair_sw_kokkos.cpp pair_sw.cpp
 action pair_sw_kokkos.h pair_sw.h
 action pair_vashishta_kokkos.cpp pair_vashishta.cpp
diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
index 48fc3a352cfa28f13db6fd2bdd12205b6107c0c8..34b868aadc1b51c176ec55f946a05d2305e19b87 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -80,22 +80,22 @@ void AtomVecAngleKokkos::grow(int n)
   memory->grow_kokkos(atomKK->k_molecule,atomKK->molecule,nmax,"atom:molecule");
   memory->grow_kokkos(atomKK->k_nspecial,atomKK->nspecial,nmax,3,"atom:nspecial");
   memory->grow_kokkos(atomKK->k_special,atomKK->special,nmax,atomKK->maxspecial,
-		      "atom:special");
+                      "atom:special");
   memory->grow_kokkos(atomKK->k_num_bond,atomKK->num_bond,nmax,"atom:num_bond");
   memory->grow_kokkos(atomKK->k_bond_type,atomKK->bond_type,nmax,atomKK->bond_per_atom,
-		      "atom:bond_type");
+                      "atom:bond_type");
   memory->grow_kokkos(atomKK->k_bond_atom,atomKK->bond_atom,nmax,atomKK->bond_per_atom,
-		      "atom:bond_atom");
+                      "atom:bond_atom");
 
   memory->grow_kokkos(atomKK->k_num_angle,atomKK->num_angle,nmax,"atom:num_angle");
   memory->grow_kokkos(atomKK->k_angle_type,atomKK->angle_type,nmax,atomKK->angle_per_atom,
-		      "atom:angle_type");
+                      "atom:angle_type");
   memory->grow_kokkos(atomKK->k_angle_atom1,atomKK->angle_atom1,nmax,atomKK->angle_per_atom,
-		      "atom:angle_atom1");
+                      "atom:angle_atom1");
   memory->grow_kokkos(atomKK->k_angle_atom2,atomKK->angle_atom2,nmax,atomKK->angle_per_atom,
-		      "atom:angle_atom2");
+                      "atom:angle_atom2");
   memory->grow_kokkos(atomKK->k_angle_atom3,atomKK->angle_atom3,nmax,atomKK->angle_per_atom,
-		      "atom:angle_atom3");
+                      "atom:angle_atom3");
 
   grow_reset();
   sync(Host,ALL_MASK);
@@ -241,7 +241,7 @@ struct AtomVecAngleKokkos_PackComm {
       _xprd(xprd),_yprd(yprd),_zprd(zprd),
       _xy(xy),_xz(xz),_yz(yz) {
         const size_t maxsend = (buf.view<DeviceType>().dimension_0()
-				*buf.view<DeviceType>().dimension_1())/3;
+                                *buf.view<DeviceType>().dimension_1())/3;
         const size_t elements = 3;
         buffer_view<DeviceType>(_buf,buf,maxsend,elements);
         _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
@@ -272,11 +272,11 @@ struct AtomVecAngleKokkos_PackComm {
 /* ---------------------------------------------------------------------- */
 
 int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
-					 const DAT::tdual_int_2d &list,
-					 const int & iswap,
-					 const DAT::tdual_xfloat_2d &buf,
-					 const int &pbc_flag,
-					 const int* const pbc)
+                                         const DAT::tdual_int_2d &list,
+                                         const int & iswap,
+                                         const DAT::tdual_xfloat_2d &buf,
+                                         const int &pbc_flag,
+                                         const int* const pbc)
 {
   // Check whether to always run forward communication on the host
   // Choose correct forward PackComm kernel
@@ -339,7 +339,7 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
     LMPDeviceType::fence();
   }
 
-	return n*size_forward;
+  return n*size_forward;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -714,18 +714,18 @@ struct AtomVecAngleKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       }
   }
 };
@@ -957,10 +957,10 @@ struct AtomVecAngleKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
-      _molecule(i+_first) = static_cast<tagint> (_buf(i,6));
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
+      _molecule(i+_first) = (tagint) d_ubuf(_buf(i,6)).i;
 
   }
 };
@@ -1165,28 +1165,28 @@ struct AtomVecAngleKokkos_PackExchangeFunctor {
     _buf(mysend,m++) = _v(i,0);
     _buf(mysend,m++) = _v(i,1);
     _buf(mysend,m++) = _v(i,2);
-    _buf(mysend,m++) = _tag(i);
-    _buf(mysend,m++) = _type(i);
-    _buf(mysend,m++) = _mask(i);
-    _buf(mysend,m++) = _image(i);
-    _buf(mysend,m++) = _molecule(i);
-    _buf(mysend,m++) = _num_bond(i);
+    _buf(mysend,m++) = d_ubuf(_tag(i)).d;
+    _buf(mysend,m++) = d_ubuf(_type(i)).d;
+    _buf(mysend,m++) = d_ubuf(_mask(i)).d;
+    _buf(mysend,m++) = d_ubuf(_image(i)).d;
+    _buf(mysend,m++) = d_ubuf(_molecule(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_bond(i)).d;
     for (k = 0; k < _num_bond(i); k++) {
-      _buf(mysend,m++) = _bond_type(i,k);
-      _buf(mysend,m++) = _bond_atom(i,k);
+      _buf(mysend,m++) = d_ubuf(_bond_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_bond_atom(i,k)).d;
     }
-    _buf(mysend,m++) = _num_angle(i);
+    _buf(mysend,m++) = d_ubuf(_num_angle(i)).d;
     for (k = 0; k < _num_angle(i); k++) {
-      _buf(mysend,m++) = _angle_type(i,k);
-      _buf(mysend,m++) = _angle_atom1(i,k);
-      _buf(mysend,m++) = _angle_atom2(i,k);
-      _buf(mysend,m++) = _angle_atom3(i,k);
+      _buf(mysend,m++) = d_ubuf(_angle_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom3(i,k)).d;
     }
-    _buf(mysend,m++) = _nspecial(i,0);
-    _buf(mysend,m++) = _nspecial(i,1);
-    _buf(mysend,m++) = _nspecial(i,2);
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,0)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,1)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,2)).d;
     for (k = 0; k < _nspecial(i,2); k++)
-      _buf(mysend,m++) = _special(i,k);
+      _buf(mysend,m++) = d_ubuf(_special(i,k)).d;
 
     const int j = _copylist(mysend);
 
@@ -1350,7 +1350,7 @@ struct AtomVecAngleKokkos_UnpackExchangeFunctor {
     _lo(lo),_hi(hi){
     elements =17+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom;
     const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*
-			     buf.template view<DeviceType>().dimension_1())/elements;
+                             buf.template view<DeviceType>().dimension_1())/elements;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
   }
 
@@ -1366,30 +1366,30 @@ struct AtomVecAngleKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,m++);
       _v(i,1) = _buf(myrecv,m++);
       _v(i,2) = _buf(myrecv,m++);
-      _tag(i) = _buf(myrecv,m++);
-      _type(i) = _buf(myrecv,m++);
-      _mask(i) = _buf(myrecv,m++);
-      _image(i) = _buf(myrecv,m++);
+      _tag(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _type(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _mask(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _image(i) = (imageint) d_ubuf(_buf(myrecv,m++)).i;
 
-      _molecule(i) = _buf(myrecv,m++);
-      _num_bond(i) = _buf(myrecv,m++);
+      _molecule(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _num_bond(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       int k;
       for (k = 0; k < _num_bond(i); k++) {
-        _bond_type(i,k) = _buf(myrecv,m++);
-        _bond_atom(i,k) = _buf(myrecv,m++);
+        _bond_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _bond_atom(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_angle(i) =  _buf(myrecv,m++);
+      _num_angle(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _num_angle(i); k++) {
-	_angle_type(i,k) = _buf(myrecv,m++);
-	_angle_atom1(i,k) = _buf(myrecv,m++);
-	_angle_atom2(i,k) = _buf(myrecv,m++);
-	_angle_atom3(i,k) = _buf(myrecv,m++);
+        _angle_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _nspecial(i,0) = _buf(myrecv,m++);
-      _nspecial(i,1) = _buf(myrecv,m++);
-      _nspecial(i,2) = _buf(myrecv,m++);
+      _nspecial(i,0) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,1) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,2) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _nspecial(i,2); k++)
-        _special(i,k) = _buf(myrecv,m++);
+        _special(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
     }
   }
 };
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index dc254e6a7ed238afa2b50f6f519dafea869f3079..d040bd35531320292d885279152df1fda372c2b7 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -619,16 +619,16 @@ struct AtomVecAtomicKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
       }
   }
 };
@@ -836,9 +836,9 @@ struct AtomVecAtomicKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
 //      printf("%i %i %lf %lf %lf %i BORDER\n",_tag(i+_first),i+_first,_x(i+_first,0),_x(i+_first,1),_x(i+_first,2),_type(i+_first));
   }
 };
@@ -977,10 +977,10 @@ struct AtomVecAtomicKokkos_PackExchangeFunctor {
     _buf(mysend,4) = _v(i,0);
     _buf(mysend,5) = _v(i,1);
     _buf(mysend,6) = _v(i,2);
-    _buf(mysend,7) = _tag[i];
-    _buf(mysend,8) = _type[i];
-    _buf(mysend,9) = _mask[i];
-    _buf(mysend,10) = _image[i];
+    _buf(mysend,7) = d_ubuf(_tag[i]).d;
+    _buf(mysend,8) = d_ubuf(_type[i]).d;
+    _buf(mysend,9) = d_ubuf(_mask[i]).d;
+    _buf(mysend,10) = d_ubuf(_image[i]).d;
     const int j = _copylist(mysend);
 
     if(j>-1) {
@@ -1091,10 +1091,10 @@ struct AtomVecAtomicKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,4);
       _v(i,1) = _buf(myrecv,5);
       _v(i,2) = _buf(myrecv,6);
-      _tag[i] = _buf(myrecv,7);
-      _type[i] = _buf(myrecv,8);
-      _mask[i] = _buf(myrecv,9);
-      _image[i] = _buf(myrecv,10);
+      _tag[i] = (tagint) d_ubuf(_buf(myrecv,7)).i;
+      _type[i] = (int) d_ubuf(_buf(myrecv,8)).i;
+      _mask[i] = (int) d_ubuf(_buf(myrecv,9)).i;
+      _image[i] = (imageint) d_ubuf(_buf(myrecv,10)).i;
     }
   }
 };
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index f10decac28e9f8a8e5845548ae44dca33f5b9767..c46c49cb293fad7fcec6f395f0aa206788cf7561 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -662,18 +662,18 @@ struct AtomVecBondKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       }
   }
 };
@@ -905,10 +905,10 @@ struct AtomVecBondKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
-      _molecule(i+_first) = static_cast<tagint> (_buf(i,6));
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
+      _molecule(i+_first) = (tagint) d_ubuf(_buf(i,6)).i;
 
   }
 };
@@ -1095,21 +1095,21 @@ struct AtomVecBondKokkos_PackExchangeFunctor {
     _buf(mysend,m++) = _v(i,0);
     _buf(mysend,m++) = _v(i,1);
     _buf(mysend,m++) = _v(i,2);
-    _buf(mysend,m++) = _tag(i);
-    _buf(mysend,m++) = _type(i);
-    _buf(mysend,m++) = _mask(i);
-    _buf(mysend,m++) = _image(i);
-    _buf(mysend,m++) = _molecule(i);
-    _buf(mysend,m++) = _num_bond(i);
+    _buf(mysend,m++) = d_ubuf(_tag(i)).d;
+    _buf(mysend,m++) = d_ubuf(_type(i)).d;
+    _buf(mysend,m++) = d_ubuf(_mask(i)).d;
+    _buf(mysend,m++) = d_ubuf(_image(i)).d;
+    _buf(mysend,m++) = d_ubuf(_molecule(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_bond(i)).d;
     for (k = 0; k < _num_bond(i); k++) {
-      _buf(mysend,m++) = _bond_type(i,k);
-      _buf(mysend,m++) = _bond_atom(i,k);
+      _buf(mysend,m++) = d_ubuf(_bond_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_bond_atom(i,k)).d;
     }
-    _buf(mysend,m++) = _nspecial(i,0);
-    _buf(mysend,m++) = _nspecial(i,1);
-    _buf(mysend,m++) = _nspecial(i,2);
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,0)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,1)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,2)).d;
     for (k = 0; k < _nspecial(i,2); k++)
-      _buf(mysend,m++) = _special(i,k);
+      _buf(mysend,m++) = d_ubuf(_special(i,k)).d;
 
     const int j = _copylist(mysend);
 
@@ -1267,23 +1267,23 @@ struct AtomVecBondKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,m++);
       _v(i,1) = _buf(myrecv,m++);
       _v(i,2) = _buf(myrecv,m++);
-      _tag(i) = _buf(myrecv,m++);
-      _type(i) = _buf(myrecv,m++);
-      _mask(i) = _buf(myrecv,m++);
-      _image(i) = _buf(myrecv,m++);
+      _tag(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _type(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _mask(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _image(i) = (imageint) d_ubuf(_buf(myrecv,m++)).i;
 
-      _molecule(i) = _buf(myrecv,m++);
-      _num_bond(i) = _buf(myrecv,m++);
+      _molecule(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _num_bond(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       int k;
       for (k = 0; k < _num_bond(i); k++) {
-        _bond_type(i,k) = _buf(myrecv,m++);
-        _bond_atom(i,k) = _buf(myrecv,m++);
+        _bond_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _bond_atom(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _nspecial(i,0) = _buf(myrecv,m++);
-      _nspecial(i,1) = _buf(myrecv,m++);
-      _nspecial(i,2) = _buf(myrecv,m++);
+      _nspecial(i,0) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,1) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,2) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _nspecial(i,2); k++)
-        _special(i,k) = _buf(myrecv,m++);
+        _special(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
     }
   }
 };
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index f6952f127ce8455ce15fd455271e45d462f80c26..856660d1e9cdecf8c7fab09b53ad20326329f422 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -323,7 +323,7 @@ struct AtomVecChargeKokkos_PackCommSelf {
 /* ---------------------------------------------------------------------- */
 
 int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
+                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
   if(commKK->forward_comm_on_host) {
     sync(Host,X_MASK);
     modified(Host,X_MASK);
@@ -631,17 +631,17 @@ struct AtomVecChargeKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
           _buf(i,6) = _q(j);
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
           _buf(i,6) = _q(j);
       }
   }
@@ -872,9 +872,9 @@ struct AtomVecChargeKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
       _q(i+_first) = _buf(i,6);
   }
 };
@@ -1039,10 +1039,10 @@ struct AtomVecChargeKokkos_PackExchangeFunctor {
     _buf(mysend,4) = _v(i,0);
     _buf(mysend,5) = _v(i,1);
     _buf(mysend,6) = _v(i,2);
-    _buf(mysend,7) = _tag[i];
-    _buf(mysend,8) = _type[i];
-    _buf(mysend,9) = _mask[i];
-    _buf(mysend,10) = _image[i];
+    _buf(mysend,7) = d_ubuf(_tag[i]).d;
+    _buf(mysend,8) = d_ubuf(_type[i]).d;
+    _buf(mysend,9) = d_ubuf(_mask[i]).d;
+    _buf(mysend,10) = d_ubuf(_image[i]).d;
     _buf(mysend,11) = _q[i];
     const int j = _copylist(mysend);
 
@@ -1163,10 +1163,10 @@ struct AtomVecChargeKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,4);
       _v(i,1) = _buf(myrecv,5);
       _v(i,2) = _buf(myrecv,6);
-      _tag[i] = _buf(myrecv,7);
-      _type[i] = _buf(myrecv,8);
-      _mask[i] = _buf(myrecv,9);
-      _image[i] = _buf(myrecv,10);
+      _tag[i] = (tagint) d_ubuf(_buf(myrecv,7)).i;
+      _type[i] = (int) d_ubuf(_buf(myrecv,8)).i;
+      _mask[i] = (int) d_ubuf(_buf(myrecv,9)).i;
+      _image[i] = (imageint) d_ubuf(_buf(myrecv,10)).i;
       _q[i] = _buf(myrecv,11);
     }
   }
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index 731168b6eacd2760f1cb6bb46ecde7c6147fb733..fa4cf18ae386c4bec29d064b32f8bb359e9fed72 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -761,17 +761,6 @@ void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
 
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecFullKokkos_PackBorder {
-  union ubuf {
-    double d;
-    int64_t i;
-    KOKKOS_INLINE_FUNCTION
-    ubuf(double arg) : d(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int64_t arg) : i(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int arg) : i(arg) {}
-  };
-
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
 
@@ -808,20 +797,20 @@ struct AtomVecFullKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = ubuf(_tag(j)).d;
-          _buf(i,4) = ubuf(_type(j)).d;
-          _buf(i,5) = ubuf(_mask(j)).d;
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
           _buf(i,6) = _q(j);
-          _buf(i,7) = ubuf(_molecule(j)).d;
+          _buf(i,7) = d_ubuf(_molecule(j)).d;
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = ubuf(_tag(j)).d;
-          _buf(i,4) = ubuf(_type(j)).d;
-          _buf(i,5) = ubuf(_mask(j)).d;
+          _buf(i,3) = d_ubuf(_tag(j)).d;
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
           _buf(i,6) = _q(j);
-          _buf(i,7) = ubuf(_molecule(j)).d;
+          _buf(i,7) = d_ubuf(_molecule(j)).d;
       }
   }
 };
@@ -1030,17 +1019,6 @@ int AtomVecFullKokkos::pack_border_hybrid(int n, int *list, double *buf)
 
 template<class DeviceType>
 struct AtomVecFullKokkos_UnpackBorder {
-  union ubuf {
-    double d;
-    int64_t i;
-    KOKKOS_INLINE_FUNCTION
-    ubuf(double arg) : d(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int64_t arg) : i(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int arg) : i(arg) {}
-  };
-
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
 
@@ -1072,11 +1050,11 @@ struct AtomVecFullKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = (tagint) ubuf(_buf(i,3)).i;
-      _type(i+_first) = (int) ubuf(_buf(i,4)).i;
-      _mask(i+_first) = (int) ubuf(_buf(i,5)).i;
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
       _q(i+_first) = _buf(i,6);
-      _molecule(i+_first) = (tagint) ubuf(_buf(i,7)).i;
+      _molecule(i+_first) = (tagint) d_ubuf(_buf(i,7)).i;
 
   }
 };
@@ -1178,18 +1156,6 @@ int AtomVecFullKokkos::unpack_border_hybrid(int n, int first, double *buf)
 
 template<class DeviceType>
 struct AtomVecFullKokkos_PackExchangeFunctor {
-
-  union ubuf {
-    double d;
-    int64_t i;
-    KOKKOS_INLINE_FUNCTION
-    ubuf(double arg) : d(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int64_t arg) : i(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int arg) : i(arg) {}
-  };
-
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   typename AT::t_x_array_randomread _x;
@@ -1328,7 +1294,7 @@ struct AtomVecFullKokkos_PackExchangeFunctor {
     elements = 20+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
       5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
     const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*
-			     buf.template view<DeviceType>().dimension_1())/elements;
+                             buf.template view<DeviceType>().dimension_1())/elements;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
   }
 
@@ -1344,46 +1310,46 @@ struct AtomVecFullKokkos_PackExchangeFunctor {
     _buf(mysend,m++) = _v(i,0);
     _buf(mysend,m++) = _v(i,1);
     _buf(mysend,m++) = _v(i,2);
-    _buf(mysend,m++) = ubuf(_tag(i)).d;
-    _buf(mysend,m++) = ubuf(_type(i)).d;
-    _buf(mysend,m++) = ubuf(_mask(i)).d;
-    _buf(mysend,m++) = ubuf(_image(i)).d;
+    _buf(mysend,m++) = d_ubuf(_tag(i)).d;
+    _buf(mysend,m++) = d_ubuf(_type(i)).d;
+    _buf(mysend,m++) = d_ubuf(_mask(i)).d;
+    _buf(mysend,m++) = d_ubuf(_image(i)).d;
     _buf(mysend,m++) = _q(i);
-    _buf(mysend,m++) = ubuf(_molecule(i)).d;
-    _buf(mysend,m++) = ubuf(_num_bond(i)).d;
+    _buf(mysend,m++) = d_ubuf(_molecule(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_bond(i)).d;
     for (k = 0; k < _num_bond(i); k++) {
-      _buf(mysend,m++) = ubuf(_bond_type(i,k)).d;
-      _buf(mysend,m++) = ubuf(_bond_atom(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_bond_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_bond_atom(i,k)).d;
     }
-    _buf(mysend,m++) = ubuf(_num_angle(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_angle(i)).d;
     for (k = 0; k < _num_angle(i); k++) {
-      _buf(mysend,m++) = ubuf(_angle_type(i,k)).d;
-      _buf(mysend,m++) = ubuf(_angle_atom1(i,k)).d;
-      _buf(mysend,m++) = ubuf(_angle_atom2(i,k)).d;
-      _buf(mysend,m++) = ubuf(_angle_atom3(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom3(i,k)).d;
     }
-    _buf(mysend,m++) = ubuf(_num_dihedral(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_dihedral(i)).d;
     for (k = 0; k < _num_dihedral(i); k++) {
-      _buf(mysend,m++) = ubuf(_dihedral_type(i,k)).d;
-      _buf(mysend,m++) = ubuf(_dihedral_atom1(i,k)).d;
-      _buf(mysend,m++) = ubuf(_dihedral_atom2(i,k)).d;
-      _buf(mysend,m++) = ubuf(_dihedral_atom3(i,k)).d;
-      _buf(mysend,m++) = ubuf(_dihedral_atom4(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom3(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom4(i,k)).d;
     }
-    _buf(mysend,m++) = ubuf(_num_improper(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_improper(i)).d;
     for (k = 0; k < _num_improper(i); k++) {
-      _buf(mysend,m++) = ubuf(_improper_type(i,k)).d;
-      _buf(mysend,m++) = ubuf(_improper_atom1(i,k)).d;
-      _buf(mysend,m++) = ubuf(_improper_atom2(i,k)).d;
-      _buf(mysend,m++) = ubuf(_improper_atom3(i,k)).d;
-      _buf(mysend,m++) = ubuf(_improper_atom4(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom3(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom4(i,k)).d;
     }
 
-    _buf(mysend,m++) = ubuf(_nspecial(i,0)).d;
-    _buf(mysend,m++) = ubuf(_nspecial(i,1)).d;
-    _buf(mysend,m++) = ubuf(_nspecial(i,2)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,0)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,1)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,2)).d;
     for (k = 0; k < _nspecial(i,2); k++)
-      _buf(mysend,m++) = ubuf(_special(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_special(i,k)).d;
 
     const int j = _copylist(mysend);
 
@@ -1531,18 +1497,6 @@ int AtomVecFullKokkos::pack_exchange(int i, double *buf)
 
 template<class DeviceType>
 struct AtomVecFullKokkos_UnpackExchangeFunctor {
-
-  union ubuf {
-    double d;
-    int64_t i;
-    KOKKOS_INLINE_FUNCTION
-    ubuf(double arg) : d(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int64_t arg) : i(arg) {}
-    KOKKOS_INLINE_FUNCTION
-    ubuf(int arg) : i(arg) {}
-  };
-
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
   typename AT::t_x_array _x;
@@ -1617,7 +1571,7 @@ struct AtomVecFullKokkos_UnpackExchangeFunctor {
     elements = 20+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
       5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
     const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*
-			     buf.template view<DeviceType>().dimension_1())/elements;
+                             buf.template view<DeviceType>().dimension_1())/elements;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
   }
 
@@ -1633,46 +1587,46 @@ struct AtomVecFullKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,m++);
       _v(i,1) = _buf(myrecv,m++);
       _v(i,2) = _buf(myrecv,m++);
-      _tag(i) = (tagint) ubuf(_buf(myrecv,m++)).i;
-      _type(i) = (int) ubuf(_buf(myrecv,m++)).i;
-      _mask(i) = (int) ubuf(_buf(myrecv,m++)).i;
-      _image(i) = (imageint) ubuf(_buf(myrecv,m++)).i;
+      _tag(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;  // .i reads the slot through the union's int64_t member (no numeric double->int conversion)
+      _type(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _mask(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _image(i) = (imageint) d_ubuf(_buf(myrecv,m++)).i;
       _q(i) = _buf(myrecv,m++);
-      _molecule(i) = (tagint) ubuf(_buf(myrecv,m++)).i;
-      _num_bond(i) = (int) ubuf(_buf(myrecv,m++)).i;
+      _molecule(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _num_bond(i) = (int) d_ubuf(_buf(myrecv,m++)).i;  // count is read first, then that many (type, atom) records follow
       int k;
       for (k = 0; k < _num_bond(i); k++) {
-        _bond_type(i,k) = (int) ubuf(_buf(myrecv,m++)).i;
-        _bond_atom(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
+        _bond_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _bond_atom(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_angle(i) = (int) ubuf(_buf(myrecv,m++)).i;
+      _num_angle(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _num_angle(i); k++) {
-        _angle_type(i,k) = (int) ubuf(_buf(myrecv,m++)).i;
-        _angle_atom1(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _angle_atom2(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _angle_atom3(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
+        _angle_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_dihedral(i) = (int) ubuf(_buf(myrecv,m++)).i;
+      _num_dihedral(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _num_dihedral(i); k++) {
-        _dihedral_type(i,k) = (int) ubuf(_buf(myrecv,m++)).i;
-        _dihedral_atom1(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _dihedral_atom2(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _dihedral_atom3(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _dihedral_atom4(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
+        _dihedral_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom4(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_improper(i) = (int) ubuf(_buf(myrecv,m++)).i;
+      _num_improper(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _num_improper(i); k++) {
-        _improper_type(i,k) = (int) ubuf(_buf(myrecv,m++)).i;
-        _improper_atom1(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _improper_atom2(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _improper_atom3(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
-        _improper_atom4(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
+        _improper_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom4(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _nspecial(i,0) = (int) ubuf(_buf(myrecv,m++)).i;
-      _nspecial(i,1) = (int) ubuf(_buf(myrecv,m++)).i;
-      _nspecial(i,2) = (int) ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,0) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,1) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,2) = (int) d_ubuf(_buf(myrecv,m++)).i;  // also the number of _special entries read by the loop below
       for (k = 0; k < _nspecial(i,2); k++)
-        _special(i,k) = (tagint) ubuf(_buf(myrecv,m++)).i;
+        _special(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
     }
   }
 };
diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h
index 7ac66f162696e4d4814676134b7ff8ffc626d4ae..7f593f235f6736d0dd966c6989feb40632705b8a 100644
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@@ -20,6 +20,17 @@
 
 namespace LAMMPS_NS {
 
+union d_ubuf {  // overlays a double and an int64_t so an integer can be carried unchanged inside a double-typed buffer slot
+  double d;  // the slot viewed as a double (what the buffers store)
+  int64_t i;  // the same storage viewed as a 64-bit integer
+  KOKKOS_INLINE_FUNCTION
+  d_ubuf(double arg) : d(arg) {}  // wrap a buffer value; callers then read .i to get the integer back
+  KOKKOS_INLINE_FUNCTION
+  d_ubuf(int64_t arg) : i(arg) {}  // wrap a 64-bit integer; callers then read .d to store it
+  KOKKOS_INLINE_FUNCTION
+  d_ubuf(int arg) : i(arg) {}  // a plain int is widened to int64_t on initialization
+};
+
 class AtomVecKokkos : public AtomVec {
  public:
   AtomVecKokkos(class LAMMPS *);
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
index 4fd8114376f23e17ad5c3a0d0a793402ebf3a30f..5c16ac15135ec642a2a3fa95b31c01cbf41c71fb 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@@ -786,18 +786,18 @@ struct AtomVecMolecularKokkos_PackBorder {
           _buf(i,0) = _x(j,0);
           _buf(i,1) = _x(j,1);
           _buf(i,2) = _x(j,2);
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;  // integer goes in through the union's int64_t member and is stored via .d, not converted to a double value
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       } else {
           _buf(i,0) = _x(j,0) + _dx;
           _buf(i,1) = _x(j,1) + _dy;
           _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = _tag(j);
-          _buf(i,4) = _type(j);
-          _buf(i,5) = _mask(j);
-          _buf(i,6) = _molecule(j);
+          _buf(i,3) = d_ubuf(_tag(j)).d;  // slots 3-6 are packed the same way in both branches; only the coordinates get the _dx/_dy/_dz shift
+          _buf(i,4) = d_ubuf(_type(j)).d;
+          _buf(i,5) = d_ubuf(_mask(j)).d;
+          _buf(i,6) = d_ubuf(_molecule(j)).d;
       }
   }
 };
@@ -1029,10 +1029,10 @@ struct AtomVecMolecularKokkos_UnpackBorder {
       _x(i+_first,0) = _buf(i,0);
       _x(i+_first,1) = _buf(i,1);
       _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
-      _type(i+_first) = static_cast<int>  (_buf(i,4));
-      _mask(i+_first) = static_cast<int>  (_buf(i,5));
-      _molecule(i+_first) = static_cast<tagint> (_buf(i,6));
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;  // read the slot through the union's int64_t member, matching the d_ubuf(...).d stores in PackBorder
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
+      _molecule(i+_first) = (tagint) d_ubuf(_buf(i,6)).i;
 
   }
 };
@@ -1263,7 +1263,7 @@ struct AtomVecMolecularKokkos_PackExchangeFunctor {
     elements = 19+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
       5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
     const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*
-			     buf.template view<DeviceType>().dimension_1())/elements;
+                             buf.template view<DeviceType>().dimension_1())/elements;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
   }
 
@@ -1279,45 +1279,45 @@ struct AtomVecMolecularKokkos_PackExchangeFunctor {
     _buf(mysend,m++) = _v(i,0);
     _buf(mysend,m++) = _v(i,1);
     _buf(mysend,m++) = _v(i,2);
-    _buf(mysend,m++) = _tag(i);
-    _buf(mysend,m++) = _type(i);
-    _buf(mysend,m++) = _mask(i);
-    _buf(mysend,m++) = _image(i);
-    _buf(mysend,m++) = _molecule(i);
-    _buf(mysend,m++) = _num_bond(i);
+    _buf(mysend,m++) = d_ubuf(_tag(i)).d;  // integer fields are stored through the union (.d), not converted to a double value
+    _buf(mysend,m++) = d_ubuf(_type(i)).d;
+    _buf(mysend,m++) = d_ubuf(_mask(i)).d;
+    _buf(mysend,m++) = d_ubuf(_image(i)).d;
+    _buf(mysend,m++) = d_ubuf(_molecule(i)).d;
+    _buf(mysend,m++) = d_ubuf(_num_bond(i)).d;  // each count is written before its per-item records
     for (k = 0; k < _num_bond(i); k++) {
-      _buf(mysend,m++) = _bond_type(i,k);
-      _buf(mysend,m++) = _bond_atom(i,k);
+      _buf(mysend,m++) = d_ubuf(_bond_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_bond_atom(i,k)).d;
     }
-    _buf(mysend,m++) = _num_angle(i);
+    _buf(mysend,m++) = d_ubuf(_num_angle(i)).d;
     for (k = 0; k < _num_angle(i); k++) {
-      _buf(mysend,m++) = _angle_type(i,k);
-      _buf(mysend,m++) = _angle_atom1(i,k);
-      _buf(mysend,m++) = _angle_atom2(i,k);
-      _buf(mysend,m++) = _angle_atom3(i,k);
+      _buf(mysend,m++) = d_ubuf(_angle_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_angle_atom3(i,k)).d;
     }
-    _buf(mysend,m++) = _num_dihedral(i);
+    _buf(mysend,m++) = d_ubuf(_num_dihedral(i)).d;
     for (k = 0; k < _num_dihedral(i); k++) {
-      _buf(mysend,m++) = _dihedral_type(i,k);
-      _buf(mysend,m++) = _dihedral_atom1(i,k);
-      _buf(mysend,m++) = _dihedral_atom2(i,k);
-      _buf(mysend,m++) = _dihedral_atom3(i,k);
-      _buf(mysend,m++) = _dihedral_atom4(i,k);
+      _buf(mysend,m++) = d_ubuf(_dihedral_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom3(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_dihedral_atom4(i,k)).d;
     }
-    _buf(mysend,m++) = _num_improper(i);
+    _buf(mysend,m++) = d_ubuf(_num_improper(i)).d;
     for (k = 0; k < _num_improper(i); k++) {
-      _buf(mysend,m++) = _improper_type(i,k);
-      _buf(mysend,m++) = _improper_atom1(i,k);
-      _buf(mysend,m++) = _improper_atom2(i,k);
-      _buf(mysend,m++) = _improper_atom3(i,k);
-      _buf(mysend,m++) = _improper_atom4(i,k);
+      _buf(mysend,m++) = d_ubuf(_improper_type(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom1(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom2(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom3(i,k)).d;
+      _buf(mysend,m++) = d_ubuf(_improper_atom4(i,k)).d;
     }
 
-    _buf(mysend,m++) = _nspecial(i,0);
-    _buf(mysend,m++) = _nspecial(i,1);
-    _buf(mysend,m++) = _nspecial(i,2);
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,0)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,1)).d;
+    _buf(mysend,m++) = d_ubuf(_nspecial(i,2)).d;  // _nspecial(i,2) also bounds the _special loop below
     for (k = 0; k < _nspecial(i,2); k++)
-      _buf(mysend,m++) = _special(i,k);
+      _buf(mysend,m++) = d_ubuf(_special(i,k)).d;
 
     const int j = _copylist(mysend);
 
@@ -1536,7 +1536,7 @@ struct AtomVecMolecularKokkos_UnpackExchangeFunctor {
     elements = 19+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
       5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
     const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*
-			     buf.template view<DeviceType>().dimension_1())/elements;
+                             buf.template view<DeviceType>().dimension_1())/elements;
     buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
   }
 
@@ -1552,46 +1552,46 @@ struct AtomVecMolecularKokkos_UnpackExchangeFunctor {
       _v(i,0) = _buf(myrecv,m++);
       _v(i,1) = _buf(myrecv,m++);
       _v(i,2) = _buf(myrecv,m++);
-      _tag(i) = _buf(myrecv,m++);
-      _type(i) = _buf(myrecv,m++);
-      _mask(i) = _buf(myrecv,m++);
-      _image(i) = _buf(myrecv,m++);
+      _tag(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;  // read each integer slot through the union's int64_t member, matching the d_ubuf(...).d stores in the pack functor
+      _type(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _mask(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _image(i) = (imageint) d_ubuf(_buf(myrecv,m++)).i;
 
-      _molecule(i) = _buf(myrecv,m++);
-      _num_bond(i) = _buf(myrecv,m++);
+      _molecule(i) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+      _num_bond(i) = (int) d_ubuf(_buf(myrecv,m++)).i;  // count is read first, then that many records follow
       int k;
       for (k = 0; k < _num_bond(i); k++) {
-        _bond_type(i,k) = _buf(myrecv,m++);
-        _bond_atom(i,k) = _buf(myrecv,m++);
+        _bond_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _bond_atom(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_angle(i) =  _buf(myrecv,m++);
+      _num_angle(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
       for (k = 0; k < _num_angle(i); k++) {
-        _angle_type(i,k) = _buf(myrecv,m++);
-        _angle_atom1(i,k) = _buf(myrecv,m++);
-        _angle_atom2(i,k) = _buf(myrecv,m++);
-        _angle_atom3(i,k) = _buf(myrecv,m++);
+        _angle_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _angle_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_dihedral(i) =  _buf(myrecv,m++);
+      _num_dihedral(i) = (int) d_ubuf(_buf(myrecv,m++)).i;  // explicit int64_t -> int narrowing, same as the other count fields
       for (k = 0; k < _num_dihedral(i); k++) {
-        _dihedral_type(i,k) = _buf(myrecv,m++);
-        _dihedral_atom1(i,k) = _buf(myrecv,m++);
-        _dihedral_atom2(i,k) = _buf(myrecv,m++);
-        _dihedral_atom3(i,k) = _buf(myrecv,m++);
-        _dihedral_atom4(i,k) = _buf(myrecv,m++);
+        _dihedral_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _dihedral_atom4(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _num_improper(i) =  _buf(myrecv,m++);
-      for (k = 0; k < _num_improper(i); k++) {
-        _improper_type(i,k) = _buf(myrecv,m++);
-        _improper_atom1(i,k) = _buf(myrecv,m++);
-        _improper_atom2(i,k) = _buf(myrecv,m++);
-        _improper_atom3(i,k) = _buf(myrecv,m++);
-        _improper_atom4(i,k) = _buf(myrecv,m++);
+      _num_improper(i) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      for (k = 0; k < _num_improper(i); k++) {
+        _improper_type(i,k) = (int) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom1(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom2(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom3(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
+        _improper_atom4(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
       }
-      _nspecial(i,0) = _buf(myrecv,m++);
-      _nspecial(i,1) = _buf(myrecv,m++);
-      _nspecial(i,2) = _buf(myrecv,m++);
+      _nspecial(i,0) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,1) = (int) d_ubuf(_buf(myrecv,m++)).i;
+      _nspecial(i,2) = (int) d_ubuf(_buf(myrecv,m++)).i;  // also the number of _special entries read by the loop below
       for (k = 0; k < _nspecial(i,2); k++)
-        _special(i,k) = _buf(myrecv,m++);
+        _special(i,k) = (tagint) d_ubuf(_buf(myrecv,m++)).i;
     }
   }
 };
diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index 3b8d5a85ea600db699b128f75c59d62a33aa64c2..2e46b85fd2e7800b1883d0925f1285f061edc12e 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -37,7 +37,7 @@
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
-#include "pair_reax_c_kokkos.h"
+#include "pair_reaxc_kokkos.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -50,7 +50,8 @@ using namespace FixConst;
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-FixQEqReaxKokkos<DeviceType>::FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
+FixQEqReaxKokkos<DeviceType>::
+FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
   FixQEqReax(lmp, narg, arg)
 {
   kokkosable = 1;
diff --git a/src/KOKKOS/fix_reaxc_bonds_kokkos.cpp b/src/KOKKOS/fix_reaxc_bonds_kokkos.cpp
index 7688d6745a06c64bf4b0dafc69f928b770a2bb0d..e4fb9385a5167d01fb29356b22dbe7fc922c6aca 100644
--- a/src/KOKKOS/fix_reaxc_bonds_kokkos.cpp
+++ b/src/KOKKOS/fix_reaxc_bonds_kokkos.cpp
@@ -21,7 +21,7 @@
 #include "fix_reaxc_bonds_kokkos.h"
 #include "atom.h"
 #include "update.h"
-#include "pair_reax_c_kokkos.h"
+#include "pair_reaxc_kokkos.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
diff --git a/src/KOKKOS/fix_reaxc_species_kokkos.cpp b/src/KOKKOS/fix_reaxc_species_kokkos.cpp
index 17b42174c6a2460920535da39bd331dd0c6c5b1b..ce84de30cb2eaf83cf8230e916f82ef19754de33 100644
--- a/src/KOKKOS/fix_reaxc_species_kokkos.cpp
+++ b/src/KOKKOS/fix_reaxc_species_kokkos.cpp
@@ -23,7 +23,7 @@
 #include "fix_reaxc_species_kokkos.h"
 #include "domain.h"
 #include "update.h"
-#include "pair_reax_c_kokkos.h"
+#include "pair_reaxc_kokkos.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
@@ -156,4 +156,4 @@ void FixReaxCSpeciesKokkos::FindMolecule()
     if (looptot >= 400*nprocs) break;
 
   }
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/modify_kokkos.cpp b/src/KOKKOS/modify_kokkos.cpp
index b4a89c8e39586c080abfd4ea99b9efd3368d29fb..c9242f21166a4a0745781336ab58a4586ca09ce5 100644
--- a/src/KOKKOS/modify_kokkos.cpp
+++ b/src/KOKKOS/modify_kokkos.cpp
@@ -44,17 +44,19 @@ void ModifyKokkos::setup(int vflag)
   if (update->whichflag == 1)
     for (int i = 0; i < nfix; i++) {
       atomKK->sync(fix[i]->execution_space,fix[i]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[i]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[i]->setup(vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[i]->execution_space,fix[i]->datamask_modify);
     }
   else if (update->whichflag == 2)
     for (int i = 0; i < nfix; i++) {
       atomKK->sync(fix[i]->execution_space,fix[i]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[i]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[i]->min_setup(vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[i]->execution_space,fix[i]->datamask_modify);
     }
 }
@@ -70,9 +72,10 @@ void ModifyKokkos::setup_pre_exchange()
     for (int i = 0; i < n_pre_exchange; i++) {
       atomKK->sync(fix[list_pre_exchange[i]]->execution_space,
                    fix[list_pre_exchange[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_pre_exchange[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_pre_exchange[i]]->setup_pre_exchange();
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_pre_exchange[i]]->execution_space,
                        fix[list_pre_exchange[i]]->datamask_modify);
     }
@@ -80,9 +83,10 @@ void ModifyKokkos::setup_pre_exchange()
     for (int i = 0; i < n_min_pre_exchange; i++) {
       atomKK->sync(fix[list_min_pre_exchange[i]]->execution_space,
                    fix[list_min_pre_exchange[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_min_pre_exchange[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_min_pre_exchange[i]]->setup_pre_exchange();
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_min_pre_exchange[i]]->execution_space,
                        fix[list_min_pre_exchange[i]]->datamask_modify);
     }
@@ -99,9 +103,10 @@ void ModifyKokkos::setup_pre_neighbor()
     for (int i = 0; i < n_pre_neighbor; i++) {
       atomKK->sync(fix[list_pre_neighbor[i]]->execution_space,
                    fix[list_pre_neighbor[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_pre_neighbor[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_pre_neighbor[i]]->setup_pre_neighbor();
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_pre_neighbor[i]]->execution_space,
                        fix[list_pre_neighbor[i]]->datamask_modify);
     }
@@ -109,9 +114,10 @@ void ModifyKokkos::setup_pre_neighbor()
     for (int i = 0; i < n_min_pre_neighbor; i++) {
       atomKK->sync(fix[list_min_pre_neighbor[i]]->execution_space,
                    fix[list_min_pre_neighbor[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_min_pre_neighbor[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_min_pre_neighbor[i]]->setup_pre_neighbor();
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_min_pre_neighbor[i]]->execution_space,
                        fix[list_min_pre_neighbor[i]]->datamask_modify);
     }
@@ -128,9 +134,10 @@ void ModifyKokkos::setup_pre_force(int vflag)
     for (int i = 0; i < n_pre_force; i++) {
       atomKK->sync(fix[list_pre_force[i]]->execution_space,
                    fix[list_pre_force[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_pre_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_pre_force[i]]->setup_pre_force(vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_pre_force[i]]->execution_space,
                        fix[list_pre_force[i]]->datamask_modify);
     }
@@ -138,9 +145,10 @@ void ModifyKokkos::setup_pre_force(int vflag)
     for (int i = 0; i < n_min_pre_force; i++) {
       atomKK->sync(fix[list_min_pre_force[i]]->execution_space,
                    fix[list_min_pre_force[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_min_pre_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_min_pre_force[i]]->setup_pre_force(vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_min_pre_force[i]]->execution_space,
                        fix[list_min_pre_force[i]]->datamask_modify);
     }
@@ -157,9 +165,10 @@ void ModifyKokkos::setup_pre_reverse(int eflag, int vflag)
     for (int i = 0; i < n_pre_reverse; i++) {
       atomKK->sync(fix[list_pre_reverse[i]]->execution_space,
                    fix[list_pre_reverse[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_pre_reverse[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_pre_reverse[i]]->setup_pre_reverse(eflag,vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_pre_reverse[i]]->execution_space,
                        fix[list_pre_reverse[i]]->datamask_modify);
     }
@@ -167,9 +176,10 @@ void ModifyKokkos::setup_pre_reverse(int eflag, int vflag)
     for (int i = 0; i < n_min_pre_reverse; i++) {
       atomKK->sync(fix[list_min_pre_reverse[i]]->execution_space,
                    fix[list_min_pre_reverse[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_min_pre_reverse[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_min_pre_reverse[i]]->setup_pre_reverse(eflag,vflag);
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_min_pre_reverse[i]]->execution_space,
                        fix[list_min_pre_reverse[i]]->datamask_modify);
     }
@@ -184,9 +194,10 @@ void ModifyKokkos::initial_integrate(int vflag)
   for (int i = 0; i < n_initial_integrate; i++) {
     atomKK->sync(fix[list_initial_integrate[i]]->execution_space,
                  fix[list_initial_integrate[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_initial_integrate[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_initial_integrate[i]]->initial_integrate(vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_initial_integrate[i]]->execution_space,
                      fix[list_initial_integrate[i]]->datamask_modify);
   }
@@ -201,9 +212,10 @@ void ModifyKokkos::post_integrate()
   for (int i = 0; i < n_post_integrate; i++) {
     atomKK->sync(fix[list_post_integrate[i]]->execution_space,
                  fix[list_post_integrate[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_post_integrate[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_post_integrate[i]]->post_integrate();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_post_integrate[i]]->execution_space,
                      fix[list_post_integrate[i]]->datamask_modify);
   }
@@ -218,9 +230,10 @@ void ModifyKokkos::pre_exchange()
   for (int i = 0; i < n_pre_exchange; i++) {
     atomKK->sync(fix[list_pre_exchange[i]]->execution_space,
                  fix[list_pre_exchange[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_exchange[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_exchange[i]]->pre_exchange();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_exchange[i]]->execution_space,
                      fix[list_pre_exchange[i]]->datamask_modify);
   }
@@ -235,9 +248,10 @@ void ModifyKokkos::pre_neighbor()
   for (int i = 0; i < n_pre_neighbor; i++) {
     atomKK->sync(fix[list_pre_neighbor[i]]->execution_space,
                  fix[list_pre_neighbor[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_neighbor[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_neighbor[i]]->pre_neighbor();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_neighbor[i]]->execution_space,
                      fix[list_pre_neighbor[i]]->datamask_modify);
   }
@@ -252,9 +266,10 @@ void ModifyKokkos::pre_force(int vflag)
   for (int i = 0; i < n_pre_force; i++) {
     atomKK->sync(fix[list_pre_force[i]]->execution_space,
                  fix[list_pre_force[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_force[i]]->pre_force(vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_force[i]]->execution_space,
                      fix[list_pre_force[i]]->datamask_modify);
   }
@@ -269,9 +284,10 @@ void ModifyKokkos::pre_reverse(int eflag, int vflag)
   for (int i = 0; i < n_pre_reverse; i++) {
     atomKK->sync(fix[list_pre_reverse[i]]->execution_space,
                  fix[list_pre_reverse[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_reverse[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_reverse[i]]->pre_reverse(eflag,vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_reverse[i]]->execution_space,
                      fix[list_pre_reverse[i]]->datamask_modify);
   }
@@ -286,9 +302,10 @@ void ModifyKokkos::post_force(int vflag)
   for (int i = 0; i < n_post_force; i++) {
     atomKK->sync(fix[list_post_force[i]]->execution_space,
                  fix[list_post_force[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_post_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_post_force[i]]->post_force(vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_post_force[i]]->execution_space,
                      fix[list_post_force[i]]->datamask_modify);
   }
@@ -303,9 +320,10 @@ void ModifyKokkos::final_integrate()
   for (int i = 0; i < n_final_integrate; i++) {
     atomKK->sync(fix[list_final_integrate[i]]->execution_space,
                  fix[list_final_integrate[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_final_integrate[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_final_integrate[i]]->final_integrate();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_final_integrate[i]]->execution_space,
                      fix[list_final_integrate[i]]->datamask_modify);
   }
@@ -322,9 +340,10 @@ void ModifyKokkos::end_of_step()
     if (update->ntimestep % end_of_step_every[i] == 0) {
       atomKK->sync(fix[list_end_of_step[i]]->execution_space,
                    fix[list_end_of_step[i]]->datamask_read);
+      int prev_auto_sync = lmp->kokkos->auto_sync;
       if (!fix[list_end_of_step[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
       fix[list_end_of_step[i]]->end_of_step();
-      lmp->kokkos->auto_sync = 0;
+      lmp->kokkos->auto_sync = prev_auto_sync;
       atomKK->modified(fix[list_end_of_step[i]]->execution_space,
                        fix[list_end_of_step[i]]->datamask_modify);
     }
@@ -342,9 +361,10 @@ double ModifyKokkos::thermo_energy()
   for (int i = 0; i < n_thermo_energy; i++) {
     atomKK->sync(fix[list_thermo_energy[i]]->execution_space,
                  fix[list_thermo_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_thermo_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     energy += fix[list_thermo_energy[i]]->compute_scalar();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_thermo_energy[i]]->execution_space,
                      fix[list_thermo_energy[i]]->datamask_modify);
   }
@@ -375,9 +395,10 @@ void ModifyKokkos::setup_pre_force_respa(int vflag, int ilevel)
   for (int i = 0; i < n_pre_force; i++) {
     atomKK->sync(fix[list_pre_force[i]]->execution_space,
                  fix[list_pre_force[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_force[i]]->setup_pre_force_respa(vflag,ilevel);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_force[i]]->execution_space,
                      fix[list_pre_force[i]]->datamask_modify);
   }
@@ -392,10 +413,11 @@ void ModifyKokkos::initial_integrate_respa(int vflag, int ilevel, int iloop)
   for (int i = 0; i < n_initial_integrate_respa; i++) {
     atomKK->sync(fix[list_initial_integrate_respa[i]]->execution_space,
                  fix[list_initial_integrate_respa[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_initial_integrate_respa[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_initial_integrate_respa[i]]->
       initial_integrate_respa(vflag,ilevel,iloop);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_initial_integrate_respa[i]]->execution_space,
                      fix[list_initial_integrate_respa[i]]->datamask_modify);
   }
@@ -410,9 +432,10 @@ void ModifyKokkos::post_integrate_respa(int ilevel, int iloop)
   for (int i = 0; i < n_post_integrate_respa; i++) {
     atomKK->sync(fix[list_post_integrate_respa[i]]->execution_space,
                  fix[list_post_integrate_respa[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_post_integrate_respa[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_post_integrate_respa[i]]->post_integrate_respa(ilevel,iloop);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_post_integrate_respa[i]]->execution_space,
                      fix[list_post_integrate_respa[i]]->datamask_modify);
   }
@@ -427,9 +450,10 @@ void ModifyKokkos::pre_force_respa(int vflag, int ilevel, int iloop)
   for (int i = 0; i < n_pre_force_respa; i++) {
     atomKK->sync(fix[list_pre_force_respa[i]]->execution_space,
                  fix[list_pre_force_respa[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_pre_force_respa[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_pre_force_respa[i]]->pre_force_respa(vflag,ilevel,iloop);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_pre_force_respa[i]]->execution_space,
                      fix[list_pre_force_respa[i]]->datamask_modify);
   }
@@ -444,9 +468,10 @@ void ModifyKokkos::post_force_respa(int vflag, int ilevel, int iloop)
   for (int i = 0; i < n_post_force_respa; i++) {
     atomKK->sync(fix[list_post_force_respa[i]]->execution_space,
                  fix[list_post_force_respa[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_post_force_respa[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_post_force_respa[i]]->post_force_respa(vflag,ilevel,iloop);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_post_force_respa[i]]->execution_space,
                      fix[list_post_force_respa[i]]->datamask_modify);
   }
@@ -461,9 +486,10 @@ void ModifyKokkos::final_integrate_respa(int ilevel, int iloop)
   for (int i = 0; i < n_final_integrate_respa; i++) {
     atomKK->sync(fix[list_final_integrate_respa[i]]->execution_space,
                  fix[list_final_integrate_respa[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_final_integrate_respa[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_final_integrate_respa[i]]->final_integrate_respa(ilevel,iloop);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_final_integrate_respa[i]]->execution_space,
                      fix[list_final_integrate_respa[i]]->datamask_modify);
   }
@@ -478,9 +504,10 @@ void ModifyKokkos::min_pre_exchange()
   for (int i = 0; i < n_min_pre_exchange; i++) {
     atomKK->sync(fix[list_min_pre_exchange[i]]->execution_space,
                  fix[list_min_pre_exchange[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_pre_exchange[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_pre_exchange[i]]->min_pre_exchange();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_pre_exchange[i]]->execution_space,
                      fix[list_min_pre_exchange[i]]->datamask_modify);
   }
@@ -495,9 +522,10 @@ void ModifyKokkos::min_pre_neighbor()
   for (int i = 0; i < n_min_pre_neighbor; i++) {
     atomKK->sync(fix[list_min_pre_neighbor[i]]->execution_space,
                  fix[list_min_pre_neighbor[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_pre_neighbor[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_pre_neighbor[i]]->min_pre_neighbor();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_pre_neighbor[i]]->execution_space,
                      fix[list_min_pre_neighbor[i]]->datamask_modify);
   }
@@ -512,9 +540,10 @@ void ModifyKokkos::min_pre_force(int vflag)
   for (int i = 0; i < n_min_pre_force; i++) {
     atomKK->sync(fix[list_min_pre_force[i]]->execution_space,
                  fix[list_min_pre_force[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_pre_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_pre_force[i]]->min_pre_force(vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_pre_force[i]]->execution_space,
                      fix[list_min_pre_force[i]]->datamask_modify);
   }
@@ -529,9 +558,10 @@ void ModifyKokkos::min_pre_reverse(int eflag, int vflag)
   for (int i = 0; i < n_min_pre_reverse; i++) {
     atomKK->sync(fix[list_min_pre_reverse[i]]->execution_space,
                  fix[list_min_pre_reverse[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_pre_reverse[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_pre_reverse[i]]->min_pre_reverse(eflag,vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_pre_reverse[i]]->execution_space,
                      fix[list_min_pre_reverse[i]]->datamask_modify);
   }
@@ -546,9 +576,10 @@ void ModifyKokkos::min_post_force(int vflag)
   for (int i = 0; i < n_min_post_force; i++) {
     atomKK->sync(fix[list_min_post_force[i]]->execution_space,
                  fix[list_min_post_force[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_post_force[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_post_force[i]]->min_post_force(vflag);
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_post_force[i]]->execution_space,
                      fix[list_min_post_force[i]]->datamask_modify);
   }
@@ -568,10 +599,11 @@ double ModifyKokkos::min_energy(double *fextra)
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[ifix]->kokkosable) lmp->kokkos->auto_sync = 1;
     eng += fix[ifix]->min_energy(&fextra[index]);
     index += fix[ifix]->min_dof();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify);
   }
   return eng;
@@ -586,9 +618,10 @@ void ModifyKokkos::min_store()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_energy[i]]->min_store();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
   }
@@ -603,9 +636,10 @@ void ModifyKokkos::min_clearstore()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_energy[i]]->min_clearstore();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
   }
@@ -616,9 +650,10 @@ void ModifyKokkos::min_pushstore()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_energy[i]]->min_pushstore();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
   }
@@ -629,9 +664,10 @@ void ModifyKokkos::min_popstore()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[list_min_energy[i]]->min_popstore();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
   }
@@ -649,10 +685,11 @@ void ModifyKokkos::min_step(double alpha, double *hextra)
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[ifix]->kokkosable) lmp->kokkos->auto_sync = 1;
     fix[ifix]->min_step(alpha,&hextra[index]);
     index += fix[ifix]->min_dof();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify);
   }
 }
@@ -670,11 +707,12 @@ double ModifyKokkos::max_alpha(double *hextra)
   for (int i = 0; i < n_min_energy; i++) {
     ifix = list_min_energy[i];
     atomKK->sync(fix[ifix]->execution_space,fix[ifix]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[ifix]->kokkosable) lmp->kokkos->auto_sync = 1;
     double alpha_one = fix[ifix]->max_alpha(&hextra[index]);
     alpha = MIN(alpha,alpha_one);
     index += fix[ifix]->min_dof();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[ifix]->execution_space,fix[ifix]->datamask_modify);
   }
   return alpha;
@@ -690,9 +728,10 @@ int ModifyKokkos::min_dof()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     ndof += fix[list_min_energy[i]]->min_dof();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
   }
@@ -710,9 +749,10 @@ int ModifyKokkos::min_reset_ref()
   for (int i = 0; i < n_min_energy; i++) {
     atomKK->sync(fix[list_min_energy[i]]->execution_space,
                  fix[list_min_energy[i]]->datamask_read);
+    int prev_auto_sync = lmp->kokkos->auto_sync;
     if (!fix[list_min_energy[i]]->kokkosable) lmp->kokkos->auto_sync = 1;
     itmp = fix[list_min_energy[i]]->min_reset_ref();
-    lmp->kokkos->auto_sync = 0;
+    lmp->kokkos->auto_sync = prev_auto_sync;
     if (itmp) itmpall = 1;
     atomKK->modified(fix[list_min_energy[i]]->execution_space,
                      fix[list_min_energy[i]]->datamask_modify);
diff --git a/src/KOKKOS/pair_reax_c_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp
similarity index 99%
rename from src/KOKKOS/pair_reax_c_kokkos.cpp
rename to src/KOKKOS/pair_reaxc_kokkos.cpp
index acf9c754cdc70367e4f7bc96311291e92d6fe46a..59369b5e082c3b383012670da52697f98e7a0163 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxc_kokkos.cpp
@@ -19,7 +19,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "pair_reax_c_kokkos.h"
+#include "pair_reaxc_kokkos.h"
 #include "kokkos.h"
 #include "atom_kokkos.h"
 #include "comm.h"
@@ -2294,12 +2294,12 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVF
   int numbonds = d_bo_num[i];
 
   e_lp = 0.0;
-  if (numbonds > 0)
+  if (numbonds > 0 || control->enobondsflag)
     e_lp = p_lp2 * d_Delta_lp[i] * inv_expvd2;
   const F_FLOAT dElp = p_lp2 * inv_expvd2 + 75.0 * p_lp2 * d_Delta_lp[i] * expvd2 * inv_expvd2*inv_expvd2;
   const F_FLOAT CElp = dElp * d_dDelta_lp[i];
 
-  if (numbonds > 0)
+  if (numbonds > 0 || control->enobondsflag)
     a_CdDelta[i] += CElp;
 
   if (eflag) ev.ereax[0] += e_lp;
@@ -2336,7 +2336,7 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVF
   const F_FLOAT inv_exp_ovun8 = 1.0 / (1.0 + exp_ovun8);
 
   e_un = 0;
-  if (numbonds > 0)
+  if (numbonds > 0 || control->enobondsflag)
     e_un = -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
   if (eflag) ev.ereax[2] += e_un;
@@ -2356,7 +2356,7 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeMulti2<NEIGHFLAG,EVF
   // multibody forces
 
   a_CdDelta[i] += CEover3;
-  if (numbonds > 0)
+  if (numbonds > 0 || control->enobondsflag)
     a_CdDelta[i] += CEunder3;
 
   const int j_start = d_bo_first[i];
diff --git a/src/KOKKOS/pair_reax_c_kokkos.h b/src/KOKKOS/pair_reaxc_kokkos.h
similarity index 99%
rename from src/KOKKOS/pair_reax_c_kokkos.h
rename to src/KOKKOS/pair_reaxc_kokkos.h
index 8a0c08b6609c4052bd4398056292fe325b0f630e..59c4d196d5fb22f1edc6c839d56a8cb7e639242b 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.h
+++ b/src/KOKKOS/pair_reaxc_kokkos.h
@@ -25,7 +25,7 @@ PairStyle(reax/c/kk/host,PairReaxCKokkos<LMPHostType>)
 
 #include <stdio.h>
 #include "pair_kokkos.h"
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "neigh_list_kokkos.h"
 #include "reaxc_types.h"
 
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index 53b40423762c0e2750a523fe5f06eb3f60b15b68..e4a3f857d3d24829055c6ce5dc6cfd1ff1a02d52 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -64,14 +64,17 @@ VerletKokkos::VerletKokkos(LAMMPS *lmp, int narg, char **arg) :
    setup before run
 ------------------------------------------------------------------------- */
 
-void VerletKokkos::setup()
+void VerletKokkos::setup(int flag)
 {
   if (comm->me == 0 && screen) {
     fprintf(screen,"Setting up Verlet run ...\n");
-    fprintf(screen,"  Unit style    : %s\n", update->unit_style);
-    fprintf(screen,"  Current step  : " BIGINT_FORMAT "\n", update->ntimestep);
-    fprintf(screen,"  Time step     : %g\n", update->dt);
-    timer->print_timeout(screen);
+    if (flag) {
+      fprintf(screen,"  Unit style    : %s\n", update->unit_style);
+      fprintf(screen,"  Current step  : " BIGINT_FORMAT "\n",
+              update->ntimestep);
+      fprintf(screen,"  Time step     : %g\n", update->dt);
+      timer->print_timeout(screen);
+    }
   }
 
   update->setupflag = 1;
@@ -169,7 +172,7 @@ void VerletKokkos::setup()
   if (force->newton) comm->reverse_comm();
 
   modify->setup(vflag);
-  output->setup();
+  output->setup(flag);
   lmp->kokkos->auto_sync = 1;
   update->setupflag = 1;
 }
diff --git a/src/KOKKOS/verlet_kokkos.h b/src/KOKKOS/verlet_kokkos.h
index 03a93833245dfcf41d36f069eb4b4839b5c156df..6455239204f87844b321aaa63216b7fe67c0a57a 100644
--- a/src/KOKKOS/verlet_kokkos.h
+++ b/src/KOKKOS/verlet_kokkos.h
@@ -29,7 +29,7 @@ class VerletKokkos : public Verlet {
  public:
   VerletKokkos(class LAMMPS *, int, char **);
   ~VerletKokkos() {}
-  void setup();
+  void setup(int flag=1);
   void setup_minimal(int);
   void run(int);
 
diff --git a/src/KSPACE/ewald_disp.cpp b/src/KSPACE/ewald_disp.cpp
index 467a748d08843e506517b79e92d5613e2293325e..85e3da921b68b19154d96dcf871aba5399b63e28 100644
--- a/src/KSPACE/ewald_disp.cpp
+++ b/src/KSPACE/ewald_disp.cpp
@@ -138,13 +138,14 @@ void EwaldDisp::init()
       nsums += n[k];
     }
 
-  if (!gewaldflag) g_ewald = 0.0;
+  if (!gewaldflag) g_ewald = g_ewald_6 = 1.0;
   pair->init();  // so B is defined
   init_coeffs();
   init_coeff_sums();
   if (function[0]) qsum_qsq();
   else qsqsum = qsum = 0.0;
   natoms_original = atom->natoms;
+  if (!gewaldflag) g_ewald = g_ewald_6 = 0.0;
 
   // turn off coulombic if no charge
 
@@ -218,8 +219,8 @@ void EwaldDisp::init()
   }
 
   if (!comm->me) {
-      if (screen) fprintf(screen, "  G vector = %g\n", g_ewald);
-      if (logfile) fprintf(logfile, "  G vector = %g\n", g_ewald);
+      if (screen) fprintf(screen, "  G vector = %g,   accuracy = %g\n", g_ewald,accuracy);
+      if (logfile) fprintf(logfile, "  G vector = %g,   accuracy = %g\n", g_ewald,accuracy);
   }
 
   g_ewald_6 = g_ewald;
diff --git a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
index 11c7a147e76f41b005986a1685f8140d168ea426..6e17a9bbd7b0cc7fde788379910fea98979d7fa0 100644
--- a/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
+++ b/src/KSPACE/pair_lj_charmmfsw_coul_long.cpp
@@ -57,6 +57,10 @@ PairLJCharmmfswCoulLong::PairLJCharmmfswCoulLong(LAMMPS *lmp) : Pair(lmp)
   implicit = 0;
   mix_flag = ARITHMETIC;
   writedata = 1;
+
+  // short-range/long-range flag accessed by DihedralCharmmfsw
+
+  dihedflag = 1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -669,10 +673,6 @@ void PairLJCharmmfswCoulLong::settings(int narg, char **arg)
   cut_lj = force->numeric(FLERR,arg[1]);
   if (narg == 2) cut_coul = cut_lj;
   else cut_coul = force->numeric(FLERR,arg[2]);
-
-  // indicates pair_style being used for dihedral_charmm
-
-  dihedflag = 1;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KSPACE/pppm_disp.cpp b/src/KSPACE/pppm_disp.cpp
index 5d6c2042beec5c318f6a713642e49eb30b9f251b..b31d42a815e2b66760ebbee2df44c0cfdee474e4 100644
--- a/src/KSPACE/pppm_disp.cpp
+++ b/src/KSPACE/pppm_disp.cpp
@@ -380,6 +380,12 @@ void PPPMDisp::init()
     alpha = qdist / (cos(0.5*theta) * blen);
   }
 
+  //if g_ewald and g_ewald_6 have not been specified, set some initial value
+  //  to avoid problems when calculating the energies!
+
+  if (!gewaldflag) g_ewald = 1;
+  if (!gewaldflag_6) g_ewald_6 = 1;
+
   // initialize the pair style to get the coefficients
 
   neighrequest_flag = 0;
@@ -387,12 +393,6 @@ void PPPMDisp::init()
   neighrequest_flag = 1;
   init_coeffs();
 
-  //if g_ewald and g_ewald_6 have not been specified, set some initial value
-  //  to avoid problems when calculating the energies!
-
-  if (!gewaldflag) g_ewald = 1;
-  if (!gewaldflag_6) g_ewald_6 = 1;
-
   // set accuracy (force units) from accuracy_relative or accuracy_absolute
 
   if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
diff --git a/src/MC/fix_gcmc.cpp b/src/MC/fix_gcmc.cpp
index cba5a0a176087a6125078cc8be950040e58a9454..73758e36285a5a89003ef5a47a0ffbb74d328d58 100644
--- a/src/MC/fix_gcmc.cpp
+++ b/src/MC/fix_gcmc.cpp
@@ -260,7 +260,7 @@ void FixGCMC::options(int narg, char **arg)
   grouptypebits = NULL;
   energy_intra = 0.0;
   tfac_insert = 1.0;
-  overlap_cutoff = 0.0;
+  overlap_cutoffsq = 0.0;
   overlap_flag = 0;
 
   int iarg = 0;
@@ -366,7 +366,8 @@ void FixGCMC::options(int narg, char **arg)
       iarg += 2;
     } else if (strcmp(arg[iarg],"overlap_cutoff") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal fix gcmc command");
-      overlap_cutoff = force->numeric(FLERR,arg[iarg+1]);
+      double rtmp = force->numeric(FLERR,arg[iarg+1]);
+      overlap_cutoffsq = rtmp*rtmp;
       overlap_flag = 1;
       iarg += 2;
     } else error->all(FLERR,"Illegal fix gcmc command");
@@ -2146,7 +2147,7 @@ double FixGCMC::energy(int i, int itype, tagint imolecule, double *coord)
     // if overlap check requested, if overlap,
     // return signal value for energy 
 
-    if (overlap_flag && rsq < overlap_cutoff)
+    if (overlap_flag && rsq < overlap_cutoffsq)
       return MAXENERGYSIGNAL;
     
     if (rsq < cutsq[itype][jtype])
@@ -2216,7 +2217,7 @@ double FixGCMC::energy_full()
         delz = x[i][2] - x[j][2];
         rsq = delx*delx + dely*dely + delz*delz;
       
-        if (rsq < overlap_cutoff) {
+        if (rsq < overlap_cutoffsq) {
           overlaptest = 1;
           break;
         }
diff --git a/src/MC/fix_gcmc.h b/src/MC/fix_gcmc.h
index 2519c0096534fd046805bafc76f007e7af84ff5c..8a5375eed70ad12bd72c12744bd8f55b5cd3c161 100644
--- a/src/MC/fix_gcmc.h
+++ b/src/MC/fix_gcmc.h
@@ -112,7 +112,7 @@ class FixGCMC : public Fix {
   double **cutsq;
   double **atom_coord;
   imageint imagezero;
-  double overlap_cutoff;
+  double overlap_cutoffsq; // squared distance cutoff for the overlap check (compared against rsq)
   int overlap_flag;
   
   double energy_intra;
diff --git a/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp b/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp
index c75da63caedcaf42d5fcc8b079cd2a0a457b57b6..af19f3eb3bedb8b7c137f410eadc4a440ad26e0c 100644
--- a/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp
+++ b/src/MOLECULE/pair_lj_charmmfsw_coul_charmmfsh.cpp
@@ -42,6 +42,10 @@ PairLJCharmmfswCoulCharmmfsh::PairLJCharmmfswCoulCharmmfsh(LAMMPS *lmp) :
   implicit = 0;
   mix_flag = ARITHMETIC;
   writedata = 1;
+
+  // short-range/long-range flag accessed by DihedralCharmmfsw
+
+  dihedflag = 0;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -235,10 +239,6 @@ void PairLJCharmmfswCoulCharmmfsh::settings(int narg, char **arg)
   } else {
     cut_coul = force->numeric(FLERR,arg[2]);
   }
-
-  // indicates pair_style being used for dihedral_charmm
-
-  dihedflag = 0;
 }
 
 /* ----------------------------------------------------------------------
@@ -535,7 +535,7 @@ void *PairLJCharmmfswCoulCharmmfsh::extract(const char *str, int &dim)
   dim = 0;
   if (strcmp(str,"implicit") == 0) return (void *) &implicit;
 
-  // info extracted by dihedral_charmmf
+  // info extracted by dihedral_charmmfsw
 
   if (strcmp(str,"cut_coul") == 0) return (void *) &cut_coul;
   if (strcmp(str,"cut_lj_inner") == 0) return (void *) &cut_lj_inner;
diff --git a/src/Makefile b/src/Makefile
index 59f95401485cfb23b0b6cb1302b09b9682f7bccc..32f9c3787c4836e585d205b386395531ca7ee41a 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -46,39 +46,37 @@ endif
 # PACKAGE    = standard packages
 # PACKUSER   = user packagse
 # PACKLIB    = all packages that require an additional lib
+#              should be PACKSYS + PACKINT + PACKEXT
 # PACKSYS    = subset that reqiure a common system library
+#              includes MPIIO and LB b/c they require full MPI, not just STUBS
 # PACKINT    = subset that require an internal (provided) library
 # PACKEXT    = subset that require an external (downloaded) library
-# PACKLIB    = PACKSYS + PACKING + PACKEXT
-# PACKSCRIPT = libs under lammps/lib that have an Install.py script
 
 PACKAGE = asphere body class2 colloid compress coreshell dipole gpu \
 	  granular kim kokkos kspace manybody mc meam misc molecule \
 	  mpiio mscg opt peri poems \
 	  python qeq reax replica rigid shock snap srd voronoi
 
-PACKUSER = user-atc user-awpmd user-cg-cmm user-cgdna user-colvars \
+PACKUSER = user-atc user-awpmd user-cgdna user-cgsdk user-colvars \
 	   user-diffraction user-dpd user-drude user-eff user-fep user-h5md \
 	   user-intel user-lb user-manifold user-mgpt user-misc user-molfile \
-	   user-nc-dump user-omp user-phonon user-qmmm user-qtb \
+	   user-netcdf user-omp user-phonon user-qmmm user-qtb \
 	   user-quip user-reaxc user-smd user-smtbq user-sph user-tally \
 	   user-vtk
 
 PACKLIB = compress gpu kim kokkos meam mpiio mscg poems \
 	  python reax voronoi \
-	  user-atc user-awpmd user-colvars user-h5md user-molfile \
-	  user-nc-dump user-qmmm user-quip user-smd user-vtk
+	  user-atc user-awpmd user-colvars user-h5md user-lb user-molfile \
+	  user-netcdf user-qmmm user-quip user-smd user-vtk
 
-PACKSYS = compress mpiio python
+PACKSYS = compress mpiio python user-lb
 
 PACKINT = gpu kokkos meam poems reax user-atc user-awpmd user-colvars
 
 PACKEXT = kim mscg voronoi \
-	  user-h5md user-molfile user-nc-dump user-qmmm user-quip \
+	  user-h5md user-molfile user-netcdf user-qmmm user-quip \
 	  user-smd user-vtk
 
-PACKSCRIPT = voronoi
-
 PACKALL = $(PACKAGE) $(PACKUSER)
 
 PACKAGEUC = $(shell echo $(PACKAGE) | tr a-z A-Z)
@@ -87,6 +85,7 @@ PACKUSERUC = $(shell echo $(PACKUSER) | tr a-z A-Z)
 YESDIR = $(shell echo $(@:yes-%=%) | tr a-z A-Z)
 NODIR  = $(shell echo $(@:no-%=%) | tr a-z A-Z)
 LIBDIR = $(shell echo $(@:lib-%=%))
+LIBUSERDIR = $(shell echo $(@:lib-user-%=%))
 
 # List of all targets
 
@@ -108,7 +107,7 @@ help:
 	@echo 'make no-standard (no-std)    remove all standard pkgs'
 	@echo 'make yes-user                install all user pkgs'
 	@echo 'make no-user                 remove all user pkgs'
-	@echo 'make yes-lib       install all pkgs with libs (incldued or ext)'
+	@echo 'make yes-lib       install all pkgs with libs (included or ext)'
 	@echo 'make no-lib        remove all pkgs with libs (included or ext)'
 	@echo 'make yes-ext                 install all pkgs with external libs'
 	@echo 'make no-ext                  remove all pkgs with external libs'
@@ -274,7 +273,7 @@ package:
 	@echo 'make package-overwrite    replace package files with src files'
 	@echo 'make package-diff (pd)    diff src files against package file'
 	@echo ''
-	@echo 'make lib-package      download/build/install a package library'
+	@echo 'make lib-package      build and/or download a package library'
 
 yes-all:
 	@for p in $(PACKALL); do $(MAKE) yes-$$p; done
@@ -339,11 +338,14 @@ no-%:
 # download/build/install a package library
 
 lib-%:
-	@if [ ! -e ../lib/$(LIBDIR)/Install.py ]; then \
-	  echo "Install script for lib $(@:lib-%=%) does not exist"; \
-	else \
-	  echo "Installing lib for package $(@:lib-%=%)"; \
+	@if [ -e ../lib/$(LIBDIR)/Install.py ]; then \
+	  echo "Installing lib $(@:lib-%=%)"; \
 	  cd ../lib/$(LIBDIR); python Install.py $(args); \
+	elif [ -e ../lib/$(LIBUSERDIR)/Install.py ]; then \
+	  echo "Installing lib $(@:lib-user-%=%)"; \
+	  cd ../lib/$(LIBUSERDIR); python Install.py $(args); \
+	else \
+	  echo "Install script for lib $(@:lib-%=%) does not exist"; \
 	fi;
 
 # status = list src files that differ from package files
diff --git a/src/Purge.list b/src/Purge.list
index 554c5df824666aa4ebe58f14d5ef05ae88cb441b..6326dbadf0b38917c575e9b216b24fe3363e1d76 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -16,6 +16,25 @@ style_region.h
 style_neigh_bin.h
 style_neigh_pair.h
 style_neigh_stencil.h
+# deleted on 4 May 2017
+pair_reax_c.cpp
+pair_reax_c.h
+fix_reax_c_bonds.cpp
+fix_reax_c_bonds.h
+fix_reax_c_species.cpp
+fix_reax_c_species.h
+pair_reax_c_kokkos.cpp
+pair_reax_c_kokkos.h
+fix_reax_c_bonds_kokkos.cpp
+fix_reax_c_bonds_kokkos.h
+fix_reax_c_species_kokkos.cpp
+fix_reax_c_species_kokkos.h
+# deleted on 19 April 2017
+vmdplugin.h
+molfile_plugin.h
+# deleted on 13 April 2017
+dihedral_charmmfsh.cpp
+dihedral_charmmfsh.h
 # deleted on ## XXX 2016
 accelerator_intel.h
 neigh_bond.cpp
diff --git a/src/QEQ/fix_qeq_point.cpp b/src/QEQ/fix_qeq_point.cpp
index 9af70a445abe3431c536993ef7a666c267c89bda..63d20ad911178c7c06d45eb6005996cb5d38cbbd 100644
--- a/src/QEQ/fix_qeq_point.cpp
+++ b/src/QEQ/fix_qeq_point.cpp
@@ -58,7 +58,7 @@ void FixQEqPoint::init()
   neighbor->requests[irequest]->full = 1;
 
   int ntypes = atom->ntypes;
-  memory->create(shld,ntypes+1,ntypes+1,"qeq:shileding");
+  memory->create(shld,ntypes+1,ntypes+1,"qeq:shielding");
 
   if (strstr(update->integrate_style,"respa"))
     nlevels_respa = ((Respa *) update->integrate)->nlevels;
diff --git a/src/RIGID/fix_shake.cpp b/src/RIGID/fix_shake.cpp
index 1fe704efb0537e3f474a497c840de34c3c2520d1..5c993ee85933ff85b47710252be968e6e7905792 100644
--- a/src/RIGID/fix_shake.cpp
+++ b/src/RIGID/fix_shake.cpp
@@ -1419,12 +1419,14 @@ void FixShake::shake(int m)
   domain->minimum_image(r01);
 
   // s01 = distance vec after unconstrained update, with PBC
+  // use Domain::minimum_image_once(), not minimum_image()
+  // b/c xshake values might be huge, due to e.g. fix gcmc
 
   double s01[3];
   s01[0] = xshake[i0][0] - xshake[i1][0];
   s01[1] = xshake[i0][1] - xshake[i1][1];
   s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
+  domain->minimum_image_once(s01);
 
   // scalar distances between atoms
 
@@ -1526,18 +1528,20 @@ void FixShake::shake3(int m)
   domain->minimum_image(r02);
 
   // s01,s02 = distance vec after unconstrained update, with PBC
+  // use Domain::minimum_image_once(), not minimum_image()
+  // b/c xshake values might be huge, due to e.g. fix gcmc
 
   double s01[3];
   s01[0] = xshake[i0][0] - xshake[i1][0];
   s01[1] = xshake[i0][1] - xshake[i1][1];
   s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
+  domain->minimum_image_once(s01);
 
   double s02[3];
   s02[0] = xshake[i0][0] - xshake[i2][0];
   s02[1] = xshake[i0][1] - xshake[i2][1];
   s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
+  domain->minimum_image_once(s02);
 
   // scalar distances between atoms
 
@@ -1699,24 +1703,26 @@ void FixShake::shake4(int m)
   domain->minimum_image(r03);
 
   // s01,s02,s03 = distance vec after unconstrained update, with PBC
+  // use Domain::minimum_image_once(), not minimum_image()
+  // b/c xshake values might be huge, due to e.g. fix gcmc
 
   double s01[3];
   s01[0] = xshake[i0][0] - xshake[i1][0];
   s01[1] = xshake[i0][1] - xshake[i1][1];
   s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
+  domain->minimum_image_once(s01);
 
   double s02[3];
   s02[0] = xshake[i0][0] - xshake[i2][0];
   s02[1] = xshake[i0][1] - xshake[i2][1];
   s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
+  domain->minimum_image_once(s02);
 
   double s03[3];
   s03[0] = xshake[i0][0] - xshake[i3][0];
   s03[1] = xshake[i0][1] - xshake[i3][1];
   s03[2] = xshake[i0][2] - xshake[i3][2];
-  domain->minimum_image(s03);
+  domain->minimum_image_once(s03);
 
   // scalar distances between atoms
 
@@ -1941,24 +1947,26 @@ void FixShake::shake3angle(int m)
   domain->minimum_image(r12);
 
   // s01,s02,s12 = distance vec after unconstrained update, with PBC
+  // use Domain::minimum_image_once(), not minimum_image()
+  // b/c xshake values might be huge, due to e.g. fix gcmc
 
   double s01[3];
   s01[0] = xshake[i0][0] - xshake[i1][0];
   s01[1] = xshake[i0][1] - xshake[i1][1];
   s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
+  domain->minimum_image_once(s01);
 
   double s02[3];
   s02[0] = xshake[i0][0] - xshake[i2][0];
   s02[1] = xshake[i0][1] - xshake[i2][1];
   s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
+  domain->minimum_image_once(s02);
 
   double s12[3];
   s12[0] = xshake[i1][0] - xshake[i2][0];
   s12[1] = xshake[i1][1] - xshake[i2][1];
   s12[2] = xshake[i1][2] - xshake[i2][2];
-  domain->minimum_image(s12);
+  domain->minimum_image_once(s12);
 
   // scalar distances between atoms
 
@@ -2055,6 +2063,7 @@ void FixShake::shake3angle(int m)
   double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new;
 
   while (!done && niter < max_iter) {
+
     quad1 = quad1_0101 * lamda01*lamda01 +
       quad1_0202 * lamda02*lamda02 +
       quad1_1212 * lamda12*lamda12 +
diff --git a/src/SNAP/compute_sna_atom.cpp b/src/SNAP/compute_sna_atom.cpp
index ad934535abd560c0b405109b17d4ce1d53a8c814..cba6fae9b75fcd51c3124ef39ff78d4e4b0849ce 100644
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@@ -48,7 +48,8 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
-  bzeroflag = 0;
+  bzeroflag = 1;
+  quadraticflag = 0;
 
   // offset by 1 to match up with types
 
@@ -106,6 +107,11 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
 	error->all(FLERR,"Illegal compute sna/atom command");
       bzeroflag = atoi(arg[iarg+1]);
       iarg += 2;
+    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
+      if (iarg+2 > narg)
+	error->all(FLERR,"Illegal compute sna/atom command");
+      quadraticflag = atoi(arg[iarg+1]);
+      iarg += 2;
     } else error->all(FLERR,"Illegal compute sna/atom command");
   }
 
@@ -122,8 +128,9 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
   }
 
   ncoeff = snaptr[0]->ncoeff;
-  peratom_flag = 1;
   size_peratom_cols = ncoeff;
+  if (quadraticflag) size_peratom_cols += ncoeff*ncoeff;
+  peratom_flag = 1;
 
   nmax = 0;
   njmax = 0;
@@ -264,8 +271,16 @@ void ComputeSNAAtom::compute_peratom()
       snaptr[tid]->copy_bi2bvec();
       for (int icoeff = 0; icoeff < ncoeff; icoeff++)
 	sna[i][icoeff] = snaptr[tid]->bvec[icoeff];
+      if (quadraticflag) {
+        int ncount = ncoeff;
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+          double bi = snaptr[tid]->bvec[icoeff];
+          for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++)
+            sna[i][ncount++] = bi*snaptr[tid]->bvec[jcoeff];
+        }
+      }
     } else {
-      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+      for (int icoeff = 0; icoeff < size_peratom_cols; icoeff++)
 	sna[i][icoeff] = 0.0;
     }
   }
diff --git a/src/SNAP/compute_sna_atom.h b/src/SNAP/compute_sna_atom.h
index af62d7cf3b5caf9464da0370f4029021e14eefbf..b22eea71b5ae2b103a0fcaee2dadf1b1a1d835bb 100644
--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@@ -44,7 +44,7 @@ class ComputeSNAAtom : public Compute {
   double *wjelem;
   class SNA** snaptr;
   double cutmax;
-
+  int quadraticflag;
 };
 
 }
diff --git a/src/SNAP/compute_snad_atom.cpp b/src/SNAP/compute_snad_atom.cpp
index 73452427bd48e42810f872eb6d4ba1c5aea4a696..39f34dd8cd50e8b15edace89accb10d8026e1178 100644
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@@ -48,9 +48,11 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
-  bzeroflag = 0;
+  bzeroflag = 1;
+  quadraticflag = 0;
   
   // process required arguments
+  
   memory->create(radelem,ntypes+1,"sna/atom:radelem"); // offset by 1 to match up with types
   memory->create(wjelem,ntypes+1,"sna/atom:wjelem");
   rcutfac = atof(arg[3]);
@@ -60,11 +62,15 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
     radelem[i+1] = atof(arg[6+i]);
   for(int i = 0; i < ntypes; i++)
     wjelem[i+1] = atof(arg[6+ntypes+i]);
+
   // construct cutsq
+
   double cut;
+  cutmax = 0.0;
   memory->create(cutsq,ntypes+1,ntypes+1,"sna/atom:cutsq");
   for(int i = 1; i <= ntypes; i++) {
     cut = 2.0*radelem[i]*rcutfac;
+    if (cut > cutmax) cutmax = cut;
     cutsq[i][i] = cut*cut;
     for(int j = i+1; j <= ntypes; j++) {
       cut = (radelem[i]+radelem[j])*rcutfac;
@@ -94,6 +100,11 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
 	error->all(FLERR,"Illegal compute snad/atom command");
       switchflag = atoi(arg[iarg+1]);
       iarg += 2;
+    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
+      if (iarg+2 > narg)
+	error->all(FLERR,"Illegal compute snad/atom command");
+      quadraticflag = atoi(arg[iarg+1]);
+      iarg += 2;
     } else error->all(FLERR,"Illegal compute snad/atom command");
   }
 
@@ -110,9 +121,19 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
   }
 
   ncoeff = snaptr[0]->ncoeff;
-  peratom_flag = 1;
-  size_peratom_cols = 3*ncoeff*atom->ntypes;
+  twoncoeff = 2*ncoeff;
+  threencoeff = 3*ncoeff;
+  size_peratom_cols = threencoeff*atom->ntypes;
+  // strides are set unconditionally because compute_peratom() reads
+  // threencoeffsq (for quadraticoffset) even when quadraticflag is 0
+  ncoeffsq = ncoeff*ncoeff;
+  twoncoeffsq = 2*ncoeffsq;
+  threencoeffsq = 3*ncoeffsq;
+  if (quadraticflag)
+    size_peratom_cols += threencoeffsq*atom->ntypes;
   comm_reverse = size_peratom_cols;
+  peratom_flag = 1;
+
   nmax = 0;
   njmax = 0;
   snad = NULL;
@@ -136,10 +157,9 @@ void ComputeSNADAtom::init()
 {
   if (force->pair == NULL)
     error->all(FLERR,"Compute snad/atom requires a pair style be defined");
-  // TODO: Not sure what to do with this error check since cutoff radius is not
-  // a single number
-  //if (sqrt(cutsq) > force->pair->cutforce)
-    //error->all(FLERR,"Compute snad/atom cutoff is longer than pairwise cutoff");
+
+  if (cutmax > force->pair->cutforce)
+    error->all(FLERR,"Compute snad/atom cutoff is longer than pairwise cutoff");
 
   // need an occasional full neighbor list
 
@@ -228,7 +248,9 @@ void ComputeSNADAtom::compute_peratom()
       const int* const jlist = firstneigh[i];
       const int jnum = numneigh[i];
 
-      const int typeoffset = 3*ncoeff*(atom->type[i]-1);
+      const int typeoffset = threencoeff*(atom->type[i]-1);
+      const int quadraticoffset = threencoeff*atom->ntypes +
+        threencoeffsq*(atom->type[i]-1);
 
       // insure rij, inside, and typej  are of size jnum
 
@@ -262,7 +284,11 @@ void ComputeSNADAtom::compute_peratom()
 
       snaptr[tid]->compute_ui(ninside);
       snaptr[tid]->compute_zi();
-
+      if (quadraticflag) {
+        snaptr[tid]->compute_bi();
+        snaptr[tid]->copy_bi2bvec();
+      }
+      
       for (int jj = 0; jj < ninside; jj++) {
 	const int j = snaptr[tid]->inside[jj];
 	snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
@@ -279,11 +305,38 @@ void ComputeSNADAtom::compute_peratom()
 	for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
 	  snadi[icoeff] += snaptr[tid]->dbvec[icoeff][0];
 	  snadi[icoeff+ncoeff] += snaptr[tid]->dbvec[icoeff][1];
-	  snadi[icoeff+2*ncoeff] += snaptr[tid]->dbvec[icoeff][2];
+	  snadi[icoeff+twoncoeff] += snaptr[tid]->dbvec[icoeff][2];
 	  snadj[icoeff] -= snaptr[tid]->dbvec[icoeff][0];
 	  snadj[icoeff+ncoeff] -= snaptr[tid]->dbvec[icoeff][1];
-	  snadj[icoeff+2*ncoeff] -= snaptr[tid]->dbvec[icoeff][2];
+	  snadj[icoeff+twoncoeff] -= snaptr[tid]->dbvec[icoeff][2];
 	}
+
+        if (quadraticflag) {
+          double *snadi = snad[i]+quadraticoffset;
+          double *snadj = snad[j]+quadraticoffset;
+          int ncount = 0;
+          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+            double bi = snaptr[tid]->bvec[icoeff];
+            double bix = snaptr[tid]->dbvec[icoeff][0];
+            double biy = snaptr[tid]->dbvec[icoeff][1];
+            double biz = snaptr[tid]->dbvec[icoeff][2];
+            for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
+              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
+                + bix*snaptr[tid]->bvec[jcoeff];
+              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
+                + biy*snaptr[tid]->bvec[jcoeff];
+              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
+                + biz*snaptr[tid]->bvec[jcoeff];
+              snadi[ncount] += dbxtmp;
+              snadi[ncount+ncoeffsq] += dbytmp;
+              snadi[ncount+twoncoeffsq] += dbztmp;
+              snadj[ncount] -= dbxtmp;
+              snadj[ncount+ncoeffsq] -= dbytmp;
+              snadj[ncount+twoncoeffsq] -= dbztmp;
+              ncount++;
+            }
+          }
+        }
       }
     }
   }
@@ -331,7 +384,8 @@ double ComputeSNADAtom::memory_usage()
   double bytes = nmax*size_peratom_cols * sizeof(double);
   bytes += 3*njmax*sizeof(double);
   bytes += njmax*sizeof(int);
-  bytes += ncoeff*3;
+  bytes += threencoeff*atom->ntypes;
+  if (quadraticflag) bytes += threencoeffsq*atom->ntypes;
   bytes += snaptr[0]->memory_usage()*comm->nthreads;
   return bytes;
 }
diff --git a/src/SNAP/compute_snad_atom.h b/src/SNAP/compute_snad_atom.h
index 31f5bf252d2ee8fde1739ea3e9b3e54a29e013ee..0d5a369ab6b9050bea1104c2d928bc55b388e886 100644
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@@ -37,7 +37,7 @@ class ComputeSNADAtom : public Compute {
 
  private:
   int nmax, njmax, diagonalstyle;
-  int ncoeff;
+  int ncoeff, twoncoeff, threencoeff, ncoeffsq, twoncoeffsq, threencoeffsq;
   double **cutsq;
   class NeighList *list;
   double **snad;
@@ -45,7 +45,8 @@ class ComputeSNADAtom : public Compute {
   double *radelem;
   double *wjelem;
   class SNA** snaptr;
-
+  double cutmax;
+  int quadraticflag;
 };
 
 }
diff --git a/src/SNAP/compute_snav_atom.cpp b/src/SNAP/compute_snav_atom.cpp
index f75b02fba72ea5f7fef25edc992c60c1cfc3ab4b..0d21d16561956cd1424c5b267067a471f93bd7bb 100644
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@@ -38,8 +38,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   radelem = NULL;
   wjelem = NULL;
 
-  nvirial = 6;
-
   int ntypes = atom->ntypes;
   int nargmin = 6+2*ntypes;
 
@@ -50,9 +48,11 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   diagonalstyle = 0;
   rmin0 = 0.0;
   switchflag = 1;
-  bzeroflag = 0;
+  bzeroflag = 1;
+  quadraticflag = 0;
 
   // process required arguments
+  
   memory->create(radelem,ntypes+1,"sna/atom:radelem"); // offset by 1 to match up with types
   memory->create(wjelem,ntypes+1,"sna/atom:wjelem");
   rcutfac = atof(arg[3]);
@@ -96,6 +96,11 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
 	error->all(FLERR,"Illegal compute snav/atom command");
       switchflag = atoi(arg[iarg+1]);
       iarg += 2;
+    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
+      if (iarg+2 > narg)
+	error->all(FLERR,"Illegal compute snav/atom command");
+      quadraticflag = atoi(arg[iarg+1]);
+      iarg += 2;
     } else error->all(FLERR,"Illegal compute snav/atom command");
   }
 
@@ -112,9 +117,24 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
   }
 
   ncoeff = snaptr[0]->ncoeff;
-  peratom_flag = 1;
-  size_peratom_cols = nvirial*ncoeff*atom->ntypes;
+  twoncoeff = 2*ncoeff;
+  threencoeff = 3*ncoeff;
+  fourncoeff = 4*ncoeff;
+  fivencoeff = 5*ncoeff;
+  sixncoeff = 6*ncoeff;
+  size_peratom_cols = sixncoeff*atom->ntypes;
+  // strides are set unconditionally because compute_peratom() reads
+  // sixncoeffsq (for quadraticoffset) even when quadraticflag is 0
+  ncoeffsq = ncoeff*ncoeff;
+  twoncoeffsq = 2*ncoeffsq;
+  threencoeffsq = 3*ncoeffsq;
+  fourncoeffsq = 4*ncoeffsq;
+  fivencoeffsq = 5*ncoeffsq;
+  sixncoeffsq = 6*ncoeffsq;
+  if (quadraticflag)
+    size_peratom_cols += sixncoeffsq*atom->ntypes;
   comm_reverse = size_peratom_cols;
+  peratom_flag = 1;
 
   nmax = 0;
   njmax = 0;
@@ -231,7 +251,9 @@ void ComputeSNAVAtom::compute_peratom()
       const int* const jlist = firstneigh[i];
       const int jnum = numneigh[i];
 
-      const int typeoffset = nvirial*ncoeff*(atom->type[i]-1);
+      const int typeoffset = sixncoeff*(atom->type[i]-1);
+      const int quadraticoffset = sixncoeff*atom->ntypes +
+        sixncoeffsq*(atom->type[i]-1);
 
       // insure rij, inside, and typej  are of size jnum
 
@@ -265,6 +287,10 @@ void ComputeSNAVAtom::compute_peratom()
 
       snaptr[tid]->compute_ui(ninside);
       snaptr[tid]->compute_zi();
+      if (quadraticflag) {
+        snaptr[tid]->compute_bi();
+        snaptr[tid]->copy_bi2bvec();
+      }
 
       for (int jj = 0; jj < ninside; jj++) {
 	const int j = snaptr[tid]->inside[jj];
@@ -281,19 +307,52 @@ void ComputeSNAVAtom::compute_peratom()
 	double *snavj = snav[j]+typeoffset;
 
 	for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-	  snavi[icoeff]          += snaptr[tid]->dbvec[icoeff][0]*xtmp;
-	  snavi[icoeff+ncoeff]   += snaptr[tid]->dbvec[icoeff][1]*ytmp;
-	  snavi[icoeff+2*ncoeff] += snaptr[tid]->dbvec[icoeff][2]*ztmp;
-	  snavi[icoeff+3*ncoeff] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
-	  snavi[icoeff+4*ncoeff] += snaptr[tid]->dbvec[icoeff][0]*ztmp;
-	  snavi[icoeff+5*ncoeff] += snaptr[tid]->dbvec[icoeff][0]*ytmp;
-	  snavj[icoeff]          -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
-	  snavj[icoeff+ncoeff]   -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
-	  snavj[icoeff+2*ncoeff] -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
-	  snavj[icoeff+3*ncoeff] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
-	  snavj[icoeff+4*ncoeff] -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
-	  snavj[icoeff+5*ncoeff] -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
+	  snavi[icoeff]             += snaptr[tid]->dbvec[icoeff][0]*xtmp;
+	  snavi[icoeff+ncoeff]      += snaptr[tid]->dbvec[icoeff][1]*ytmp;
+	  snavi[icoeff+twoncoeff]   += snaptr[tid]->dbvec[icoeff][2]*ztmp;
+	  snavi[icoeff+threencoeff] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
+	  snavi[icoeff+fourncoeff]  += snaptr[tid]->dbvec[icoeff][0]*ztmp;
+	  snavi[icoeff+fivencoeff]  += snaptr[tid]->dbvec[icoeff][0]*ytmp;
+	  snavj[icoeff]             -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
+	  snavj[icoeff+ncoeff]      -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
+	  snavj[icoeff+twoncoeff]   -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
+	  snavj[icoeff+threencoeff] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
+	  snavj[icoeff+fourncoeff]  -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
+	  snavj[icoeff+fivencoeff]  -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
 	}
+
+        if (quadraticflag) {
+          double *snavi = snav[i]+quadraticoffset;
+          double *snavj = snav[j]+quadraticoffset;
+          int ncount = 0;
+          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+            double bi = snaptr[tid]->bvec[icoeff];
+            double bix = snaptr[tid]->dbvec[icoeff][0];
+            double biy = snaptr[tid]->dbvec[icoeff][1];
+            double biz = snaptr[tid]->dbvec[icoeff][2];
+            for (int jcoeff = 0; jcoeff < ncoeff; jcoeff++) {
+              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
+                + bix*snaptr[tid]->bvec[jcoeff];
+              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
+                + biy*snaptr[tid]->bvec[jcoeff];
+              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
+                + biz*snaptr[tid]->bvec[jcoeff];
+              snavi[ncount] +=               dbxtmp*xtmp;
+              snavi[ncount+ncoeffsq] +=      dbytmp*ytmp;
+              snavi[ncount+twoncoeffsq] +=   dbztmp*ztmp;
+              snavi[ncount+threencoeffsq] += dbytmp*ztmp;
+              snavi[ncount+fourncoeffsq] +=  dbxtmp*ztmp;
+              snavi[ncount+fivencoeffsq] +=  dbxtmp*ytmp;
+              snavj[ncount] -=               dbxtmp*x[j][0];
+              snavj[ncount+ncoeffsq] -=      dbytmp*x[j][1];
+              snavj[ncount+twoncoeffsq] -=   dbztmp*x[j][2];
+              snavj[ncount+threencoeffsq] -= dbytmp*x[j][2];
+              snavj[ncount+fourncoeffsq] -=  dbxtmp*x[j][2];
+              snavj[ncount+fivencoeffsq] -=  dbxtmp*x[j][1];
+              ncount++;
+            }
+          }
+        }
       }
     }
   }
@@ -341,7 +400,8 @@ double ComputeSNAVAtom::memory_usage()
   double bytes = nmax*size_peratom_cols * sizeof(double);
   bytes += 3*njmax*sizeof(double);
   bytes += njmax*sizeof(int);
-  bytes += ncoeff*nvirial;
+  bytes += sixncoeff*atom->ntypes;
+  if (quadraticflag) bytes += sixncoeffsq*atom->ntypes;
   bytes += snaptr[0]->memory_usage()*comm->nthreads;
   return bytes;
 }
diff --git a/src/SNAP/compute_snav_atom.h b/src/SNAP/compute_snav_atom.h
index 0252be705966e200a9e7e71e19d020e575ebfa68..33ae4f92173945c9754b0ef17b5a5b9e730d4a79 100644
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@@ -37,16 +37,17 @@ class ComputeSNAVAtom : public Compute {
 
  private:
   int nmax, njmax, diagonalstyle;
-  int ncoeff,nvirial;
+  int ncoeff, twoncoeff, threencoeff, fourncoeff, fivencoeff, sixncoeff;
+  int ncoeffsq, twoncoeffsq, threencoeffsq, fourncoeffsq, fivencoeffsq, sixncoeffsq;
   double **cutsq;
   class NeighList *list;
   double **snav;
   double rcutfac;
   double *radelem;
   double *wjelem;
-
   class SNA** snaptr;
-
+  double cutmax;
+  int quadraticflag;
 };
 
 }
diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp
index 06c2e48488e15d8ae072e5a7b16faafc0ed8a1e1..e4ed57b933cba2d65e310ff62238b6d9a92b1660 100644
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
@@ -1635,7 +1635,8 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename)
   rmin0 = 0.0;
   diagonalstyle = 3;
   switchflag = 1;
-  bzeroflag = 0;
+  bzeroflag = 1;
+  
   // open SNAP parameter file on proc 0
 
   FILE *fpparam;
diff --git a/src/USER-CG-CMM/Install.sh b/src/USER-CGSDK/Install.sh
similarity index 100%
rename from src/USER-CG-CMM/Install.sh
rename to src/USER-CGSDK/Install.sh
diff --git a/src/USER-CG-CMM/README b/src/USER-CGSDK/README
similarity index 58%
rename from src/USER-CG-CMM/README
rename to src/USER-CGSDK/README
index b37fbd3760df16550ecabca2b82b5d54d9b46c20..535bd43ac192062e37f77d81642f46e7ac6b962a 100644
--- a/src/USER-CG-CMM/README
+++ b/src/USER-CGSDK/README
@@ -13,23 +13,15 @@ lipids and charged amino acids.
 See the doc pages for these commands for details.
 
 There are example scripts for using this package in
-examples/USER/cg-cmm.
+examples/USER/cgsdk.
 
 This is the second generation implementation reducing the the clutter
 of the previous version. For many systems with long range
 electrostatics, it will be faster to use pair_style hybrid/overlay
 with lj/sdk and coul/long instead of the combined lj/sdk/coul/long
-style, since the number of charged atom types is usually small.  To
-exploit this property, the use of the kspace_style pppm/cg is
-recommended over regular pppm. For all new styles, input file backward
-compatibility is provided.  The old implementation is still available
-through appending the /old suffix. These will be discontinued and
-removed after the new implementation has been fully validated.
-
-The current version of this package should be considered beta
-quality. The CG potentials work correctly for "normal" situations, but
-have not been testing with all kinds of potential parameters and
-simuation systems.
+style, since the number of charged atom types is usually small.
+To exploit this property, the use of the kspace_style pppm/cg is
+recommended over regular pppm.
 
 The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.
@@ -38,9 +30,9 @@ The person who created this package is Axel Kohlmeyer at Temple U
 
 Thanks for contributions, support and testing goes to
 
-Wataru Shinoda (AIST, Tsukuba)
+Wataru Shinoda (Nagoya University)
 Russell DeVane (Procter & Gamble)
-Michael L. Klein (CMM / U Penn, Philadelphia)
+Michael L. Klein (Temple University, Philadelphia)
 Balasubramanian Sundaram (JNCASR, Bangalore)
 
-version: 0.99 / 2011-11-29
+version: 1.0 / 2017-04-26
diff --git a/src/USER-CG-CMM/angle_sdk.cpp b/src/USER-CGSDK/angle_sdk.cpp
similarity index 100%
rename from src/USER-CG-CMM/angle_sdk.cpp
rename to src/USER-CGSDK/angle_sdk.cpp
diff --git a/src/USER-CG-CMM/angle_sdk.h b/src/USER-CGSDK/angle_sdk.h
similarity index 98%
rename from src/USER-CG-CMM/angle_sdk.h
rename to src/USER-CGSDK/angle_sdk.h
index fbd54611874417c01bfc28f015dcb60877223108..a5d917e57cfcc0e281d05b86335db9eb017709c8 100644
--- a/src/USER-CG-CMM/angle_sdk.h
+++ b/src/USER-CGSDK/angle_sdk.h
@@ -14,7 +14,6 @@
 #ifdef ANGLE_CLASS
 
 AngleStyle(sdk,AngleSDK)
-AngleStyle(cg/cmm,AngleSDK)
 
 #else
 
diff --git a/src/USER-CG-CMM/lj_sdk_common.h b/src/USER-CGSDK/lj_sdk_common.h
similarity index 100%
rename from src/USER-CG-CMM/lj_sdk_common.h
rename to src/USER-CGSDK/lj_sdk_common.h
diff --git a/src/USER-CG-CMM/pair_lj_sdk.cpp b/src/USER-CGSDK/pair_lj_sdk.cpp
similarity index 100%
rename from src/USER-CG-CMM/pair_lj_sdk.cpp
rename to src/USER-CGSDK/pair_lj_sdk.cpp
diff --git a/src/USER-CG-CMM/pair_lj_sdk.h b/src/USER-CGSDK/pair_lj_sdk.h
similarity index 98%
rename from src/USER-CG-CMM/pair_lj_sdk.h
rename to src/USER-CGSDK/pair_lj_sdk.h
index de27485c14f7f8064efc42403b03cc152dff8789..ef0263c06bed225b0f13b658b6291e3829a83b68 100644
--- a/src/USER-CG-CMM/pair_lj_sdk.h
+++ b/src/USER-CGSDK/pair_lj_sdk.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk,PairLJSDK)
-PairStyle(cg/cmm,PairLJSDK)
 
 #else
 
diff --git a/src/USER-CG-CMM/pair_lj_sdk_coul_long.cpp b/src/USER-CGSDK/pair_lj_sdk_coul_long.cpp
similarity index 100%
rename from src/USER-CG-CMM/pair_lj_sdk_coul_long.cpp
rename to src/USER-CGSDK/pair_lj_sdk_coul_long.cpp
diff --git a/src/USER-CG-CMM/pair_lj_sdk_coul_long.h b/src/USER-CGSDK/pair_lj_sdk_coul_long.h
similarity index 97%
rename from src/USER-CG-CMM/pair_lj_sdk_coul_long.h
rename to src/USER-CGSDK/pair_lj_sdk_coul_long.h
index 508ffe5e6d1b98cb80b933c15d991a88bc89dabe..57779cc0b95b840ad6e5cd94873fa818e586f019 100644
--- a/src/USER-CG-CMM/pair_lj_sdk_coul_long.h
+++ b/src/USER-CGSDK/pair_lj_sdk_coul_long.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/coul/long,PairLJSDKCoulLong)
-PairStyle(cg/cmm/coul/long,PairLJSDKCoulLong)
 
 #else
 
diff --git a/src/USER-CG-CMM/pair_lj_sdk_coul_msm.cpp b/src/USER-CGSDK/pair_lj_sdk_coul_msm.cpp
similarity index 100%
rename from src/USER-CG-CMM/pair_lj_sdk_coul_msm.cpp
rename to src/USER-CGSDK/pair_lj_sdk_coul_msm.cpp
diff --git a/src/USER-CG-CMM/pair_lj_sdk_coul_msm.h b/src/USER-CGSDK/pair_lj_sdk_coul_msm.h
similarity index 97%
rename from src/USER-CG-CMM/pair_lj_sdk_coul_msm.h
rename to src/USER-CGSDK/pair_lj_sdk_coul_msm.h
index be56c0cec347b59b01ffd0143a29c366192ed49a..8438ced66bd607d28908f781bd67e29bbb1dad0a 100644
--- a/src/USER-CG-CMM/pair_lj_sdk_coul_msm.h
+++ b/src/USER-CGSDK/pair_lj_sdk_coul_msm.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/coul/msm,PairLJSDKCoulMSM)
-PairStyle(cg/cmm/coul/msm,PairLJSDKCoulMSM)
 
 #else
 
diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp
index fbd8473cb08a5046efd5f09ac9c7d92d950e38bb..f3dec42a8320de89202129fac1741118530f53dc 100644
--- a/src/USER-MISC/fix_srp.cpp
+++ b/src/USER-MISC/fix_srp.cpp
@@ -101,6 +101,13 @@ void FixSRP::init()
   if (force->pair_match("hybrid",1) == NULL)
     error->all(FLERR,"Cannot use pair srp without pair_style hybrid");
 
+  int has_rigid = 0;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,"rigid",5) == 0) ++has_rigid;
+
+  if (has_rigid > 0)
+    error->all(FLERR,"Pair srp is not compatible with rigid fixes.");
+
   if ((bptype < 1) || (bptype > atom->ntypes))
     error->all(FLERR,"Illegal bond particle type");
 
diff --git a/src/USER-MISC/improper_ring.cpp b/src/USER-MISC/improper_ring.cpp
index 5a7937e4ee974caf6441da75c5e8b8fa92c8e0b4..adf17ed1d566e26dcd759914c9f46c8409c91552 100644
--- a/src/USER-MISC/improper_ring.cpp
+++ b/src/USER-MISC/improper_ring.cpp
@@ -204,7 +204,7 @@ void ImproperRing::compute(int eflag, int vflag)
          cfact2 = ckjji / ckjkj;
          cfact3 = ckjji / cjiji;
 
-         /* Calculate the force acted on the thrid atom of the angle. */
+         /* Calculate the force acted on the third atom of the angle. */
          fkx = cfact2 * bvec2x[icomb] - bvec1x[icomb];
          fky = cfact2 * bvec2y[icomb] - bvec1y[icomb];
          fkz = cfact2 * bvec2z[icomb] - bvec1z[icomb];
diff --git a/src/USER-MOLFILE/README b/src/USER-MOLFILE/README
index f6defed6ae849e87456f4df61b81b163e30cc34a..4437b587e4831409db3b0a0f2fda883e47caf1b7 100644
--- a/src/USER-MOLFILE/README
+++ b/src/USER-MOLFILE/README
@@ -2,8 +2,8 @@ This package provides a C++ interface class to the VMD molfile
 plugins, http://www.ks.uiuc.edu/Research/vmd/plugins/molfile, and a
 set of LAMMPS classes that use this interface.
 
-Molfile plugins provide a consistent programming interface to read and
-write file formats commonly used in molecular simulations.  This
+Molfile plugins provide a consistent programming interface to read
+and write file formats commonly used in molecular simulations.  This
 package only provides the interface code, not the plugins; these can
 be taken as precompiled binaries directly from a VMD installation that
 matches the platform of your LAMMPS executable.  Using the plugin
@@ -18,18 +18,5 @@ LAMMPS, you need to link with an appropriate system library, which
 is done using the settings in lib/molfile/Makefile.lammps.  See
 that file and the lib/molfile/README file for more details.
 
-NOTE: while the programming interface (API) to the molfile plugins is
-backward compatible (i.e. you can expect to be able to compile this
-package for plugins from newer VMD packages), the binary interface
-(ABI) is not.  So it is necessary to compile this package with the
-molfile plugin header files (vmdplugin.h and molfile_plugin.h) taken
-from the _same_ VMD installation that the (binary) plugin files are
-taken from.  These header files can be found inside the VMD
-installation tree under: "plugins/include".
-
-For convenience, this package includes a set of header files that is
-compatible with VMD 1.9 and 1.9.1 (the current version in June 2012)
-and should be compilable with VMD versions back to about version 1.8.4
-
 The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.
diff --git a/src/USER-NC-DUMP/Install.sh b/src/USER-NETCDF/Install.sh
similarity index 100%
rename from src/USER-NC-DUMP/Install.sh
rename to src/USER-NETCDF/Install.sh
diff --git a/src/USER-NC-DUMP/README b/src/USER-NETCDF/README
similarity index 95%
rename from src/USER-NC-DUMP/README
rename to src/USER-NETCDF/README
index c02e879c61f369508bf87f5e5f546a688a10a512..57dec5e4c835fd41b4b5ac7a62ac8e96e75913e0 100644
--- a/src/USER-NC-DUMP/README
+++ b/src/USER-NETCDF/README
@@ -1,7 +1,7 @@
-USER-NC-DUMP
+USER-NETCDF
 ============
 
-This package provides the nc and (optionally) the nc/mpiio dump styles.
+This package provides the netcdf and netcdf/mpiio dump styles.
 See the doc page for dump nc or dump nc/mpiio command for how to use them.
 Compiling these dump styles requires having the netCDF library installed
 on your system. See lib/netcdf/README for additional details.
diff --git a/src/USER-NC-DUMP/dump_nc.cpp b/src/USER-NETCDF/dump_netcdf.cpp
similarity index 97%
rename from src/USER-NC-DUMP/dump_nc.cpp
rename to src/USER-NETCDF/dump_netcdf.cpp
index 7a66eb022429da91a6491051ac79d42973cd181c..bad90bdef3d1affe541a93c27e69856a66c3f700 100644
--- a/src/USER-NC-DUMP/dump_nc.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -32,14 +32,14 @@
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
+
 #if defined(LMP_HAS_NETCDF)
 
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <netcdf.h>
-
+#include "dump_netcdf.h"
 #include "atom.h"
 #include "comm.h"
 #include "compute.h"
@@ -56,8 +56,6 @@
 #include "variable.h"
 #include "force.h"
 
-#include "dump_nc.h"
-
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
@@ -91,7 +89,7 @@ const int THIS_IS_A_BIGINT   = -4;
 
 /* ---------------------------------------------------------------------- */
 
-DumpNC::DumpNC(LAMMPS *lmp, int narg, char **arg) :
+DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
   DumpCustom(lmp, narg, arg)
 {
   // arrays for data rearrangement
@@ -224,7 +222,7 @@ DumpNC::DumpNC(LAMMPS *lmp, int narg, char **arg) :
 
 /* ---------------------------------------------------------------------- */
 
-DumpNC::~DumpNC()
+DumpNetCDF::~DumpNetCDF()
 {
   closefile();
 
@@ -238,7 +236,7 @@ DumpNC::~DumpNC()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::openfile()
+void DumpNetCDF::openfile()
 {
   // now the computes and fixes have been initialized, so we can query
   // for the size of vector quantities
@@ -594,12 +592,12 @@ void DumpNC::openfile()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::closefile()
+void DumpNetCDF::closefile()
 {
   if (filewriter && singlefile_opened) {
     NCERR( nc_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNC::openfile is called
+    // append next time DumpNetCDF::openfile is called
     append_flag = 1;
     // write to next frame upon next open
     framei++;
@@ -608,7 +606,7 @@ void DumpNC::closefile()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::write()
+void DumpNetCDF::write()
 {
   // open file
 
@@ -678,7 +676,7 @@ void DumpNC::write()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::write_header(bigint n)
+void DumpNetCDF::write_header(bigint n)
 {
   size_t start[2];
 
@@ -753,7 +751,7 @@ void DumpNC::write_header(bigint n)
    write head of block (mass & element name) only if has atoms of the type
 ------------------------------------------------------------------------- */
 
-void DumpNC::write_data(int n, double *mybuf)
+void DumpNetCDF::write_data(int n, double *mybuf)
 {
   size_t start[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
   ptrdiff_t stride[NC_MAX_VAR_DIMS];
@@ -761,18 +759,17 @@ void DumpNC::write_data(int n, double *mybuf)
   if (!int_buffer) {
     n_buffer = n;
     int_buffer = (int *)
-      memory->smalloc(n*sizeof(int), "DumpNC::int_buffer");
+      memory->smalloc(n*sizeof(int),"dump::int_buffer");
     double_buffer = (double *)
-      memory->smalloc(n*sizeof(double), "DumpNC::double_buffer");
+      memory->smalloc(n*sizeof(double),"dump::double_buffer");
   }
 
   if (n > n_buffer) {
     n_buffer = n;
     int_buffer = (int *)
-      memory->srealloc(int_buffer, n*sizeof(int), "DumpNC::int_buffer");
+      memory->srealloc(int_buffer, n*sizeof(int),"dump::int_buffer");
     double_buffer = (double *)
-      memory->srealloc(double_buffer, n*sizeof(double),
-                       "DumpNC::double_buffer");
+      memory->srealloc(double_buffer, n*sizeof(double),"dump::double_buffer");
   }
 
   start[0] = framei-1;
@@ -887,7 +884,7 @@ void DumpNC::write_data(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-int DumpNC::modify_param(int narg, char **arg)
+int DumpNetCDF::modify_param(int narg, char **arg)
 {
   int iarg = 0;
   if (strcmp(arg[iarg],"double") == 0) {
@@ -925,17 +922,17 @@ int DumpNC::modify_param(int narg, char **arg)
 
       if (!strcmp(arg[iarg],"step")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNC::compute_step;
+        perframe[i].compute = &DumpNetCDF::compute_step;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else if (!strcmp(arg[iarg],"elapsed")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNC::compute_elapsed;
+        perframe[i].compute = &DumpNetCDF::compute_elapsed;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else if (!strcmp(arg[iarg],"elaplong")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNC::compute_elapsed_long;
+        perframe[i].compute = &DumpNetCDF::compute_elapsed_long;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else {
@@ -1036,7 +1033,7 @@ int DumpNC::modify_param(int narg, char **arg)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::write_prmtop()
+void DumpNetCDF::write_prmtop()
 {
   char fn[1024];
   char tmp[81];
@@ -1098,7 +1095,7 @@ void DumpNC::write_prmtop()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::ncerr(int err, const char *descr, int line)
+void DumpNetCDF::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
     char errstr[1024];
@@ -1122,21 +1119,21 @@ void DumpNC::ncerr(int err, const char *descr, int line)
    customize a new keyword by adding a method
 ------------------------------------------------------------------------- */
 
-void DumpNC::compute_step(void *r)
+void DumpNetCDF::compute_step(void *r)
 {
   *((bigint *) r) = update->ntimestep;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::compute_elapsed(void *r)
+void DumpNetCDF::compute_elapsed(void *r)
 {
   *((bigint *) r) = update->ntimestep - update->firststep;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNC::compute_elapsed_long(void *r)
+void DumpNetCDF::compute_elapsed_long(void *r)
 {
   *((bigint *) r) = update->ntimestep - update->beginstep;
 }
diff --git a/src/USER-NC-DUMP/dump_nc.h b/src/USER-NETCDF/dump_netcdf.h
similarity index 94%
rename from src/USER-NC-DUMP/dump_nc.h
rename to src/USER-NETCDF/dump_netcdf.h
index 788a9368f928d30c19a866b518815dd0ddeefd22..daf4e9d0de2d94151c6f0bad6b9e348171e48b82 100644
--- a/src/USER-NC-DUMP/dump_nc.h
+++ b/src/USER-NETCDF/dump_netcdf.h
@@ -32,16 +32,17 @@
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
+
 #if defined(LMP_HAS_NETCDF)
 
 #ifdef DUMP_CLASS
 
-DumpStyle(nc,DumpNC)
+DumpStyle(netcdf,DumpNetCDF)
 
 #else
 
-#ifndef LMP_DUMP_NC_H
-#define LMP_DUMP_NC_H
+#ifndef LMP_DUMP_NETCDF_H
+#define LMP_DUMP_NETCDF_H
 
 #include "dump_custom.h"
 
@@ -50,10 +51,10 @@ namespace LAMMPS_NS {
 const int NC_FIELD_NAME_MAX = 100;
 const int DUMP_NC_MAX_DIMS  = 100;
 
-class DumpNC : public DumpCustom {
+class DumpNetCDF : public DumpCustom {
  public:
-  DumpNC(class LAMMPS *, int, char **);
-  virtual ~DumpNC();
+  DumpNetCDF(class LAMMPS *, int, char **);
+  virtual ~DumpNetCDF();
   virtual void write();
 
  private:
@@ -68,7 +69,7 @@ class DumpNC : public DumpCustom {
     int ndumped;                  // number of enties written for this prop.
   };
 
-  typedef void (DumpNC::*funcptr_t)(void *);
+  typedef void (DumpNetCDF::*funcptr_t)(void *);
 
   // per-frame quantities (variables, fixes or computes)
   struct nc_perframe_t {
diff --git a/src/USER-NC-DUMP/dump_nc_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
similarity index 96%
rename from src/USER-NC-DUMP/dump_nc_mpiio.cpp
rename to src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 6b26014030b2a509ea28f09af1a21a9af8ba09f0..2e9ec274a5e07cc04bddb7644aa87d54353b09c6 100644
--- a/src/USER-NC-DUMP/dump_nc_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -32,14 +32,14 @@
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
+
 #if defined(LMP_HAS_PNETCDF)
 
 #include <unistd.h>
 #include <stdlib.h>
 #include <string.h>
-
 #include <pnetcdf.h>
-
+#include "dump_netcdf_mpiio.h"
 #include "atom.h"
 #include "comm.h"
 #include "compute.h"
@@ -56,8 +56,6 @@
 #include "variable.h"
 #include "force.h"
 
-#include "dump_nc_mpiio.h"
-
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
@@ -91,7 +89,7 @@ const int THIS_IS_A_BIGINT   = -4;
 
 /* ---------------------------------------------------------------------- */
 
-DumpNCMPIIO::DumpNCMPIIO(LAMMPS *lmp, int narg, char **arg) :
+DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
   DumpCustom(lmp, narg, arg)
 {
   // arrays for data rearrangement
@@ -217,7 +215,7 @@ DumpNCMPIIO::DumpNCMPIIO(LAMMPS *lmp, int narg, char **arg) :
 
 /* ---------------------------------------------------------------------- */
 
-DumpNCMPIIO::~DumpNCMPIIO()
+DumpNetCDFMPIIO::~DumpNetCDFMPIIO()
 {
   closefile();
 
@@ -231,7 +229,7 @@ DumpNCMPIIO::~DumpNCMPIIO()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::openfile()
+void DumpNetCDFMPIIO::openfile()
 {
   // now the computes and fixes have been initialized, so we can query
   // for the size of vector quantities
@@ -570,12 +568,12 @@ void DumpNCMPIIO::openfile()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::closefile()
+void DumpNetCDFMPIIO::closefile()
 {
   if (singlefile_opened) {
     NCERR( ncmpi_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNCMPIIO::openfile is called
+    // append next time DumpNetCDFMPIIO::openfile is called
     append_flag = 1;
     // write to next frame upon next open
     framei++;
@@ -584,7 +582,7 @@ void DumpNCMPIIO::closefile()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::write()
+void DumpNetCDFMPIIO::write()
 {
   // open file
 
@@ -687,7 +685,7 @@ void DumpNCMPIIO::write()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::write_time_and_cell()
+void DumpNetCDFMPIIO::write_time_and_cell()
 {
   MPI_Offset start[2];
 
@@ -759,7 +757,7 @@ void DumpNCMPIIO::write_time_and_cell()
    write head of block (mass & element name) only if has atoms of the type
 ------------------------------------------------------------------------- */
 
-void DumpNCMPIIO::write_data(int n, double *mybuf)
+void DumpNetCDFMPIIO::write_data(int n, double *mybuf)
 {
   MPI_Offset start[NC_MAX_VAR_DIMS], count[NC_MAX_VAR_DIMS];
   MPI_Offset stride[NC_MAX_VAR_DIMS];
@@ -767,19 +765,18 @@ void DumpNCMPIIO::write_data(int n, double *mybuf)
   if (!int_buffer) {
     n_buffer = std::max(1, n);
     int_buffer = (int *)
-      memory->smalloc(n_buffer*sizeof(int), "DumpNCMPIIO::int_buffer");
+      memory->smalloc(n_buffer*sizeof(int),"dump::int_buffer");
     double_buffer = (double *)
-      memory->smalloc(n_buffer*sizeof(double), "DumpNCMPIIO::double_buffer");
+      memory->smalloc(n_buffer*sizeof(double),"dump::double_buffer");
   }
 
   if (n > n_buffer) {
     n_buffer = std::max(1, n);
     int_buffer = (int *)
-      memory->srealloc(int_buffer, n_buffer*sizeof(int),
-                       "DumpNCMPIIO::int_buffer");
+      memory->srealloc(int_buffer, n_buffer*sizeof(int),"dump::int_buffer");
     double_buffer = (double *)
       memory->srealloc(double_buffer, n_buffer*sizeof(double),
-                       "DumpNCMPIIO::double_buffer");
+                       "dump::double_buffer");
   }
 
   start[0] = framei-1;
@@ -882,7 +879,7 @@ void DumpNCMPIIO::write_data(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-int DumpNCMPIIO::modify_param(int narg, char **arg)
+int DumpNetCDFMPIIO::modify_param(int narg, char **arg)
 {
   int iarg = 0;
   if (strcmp(arg[iarg],"double") == 0) {
@@ -920,17 +917,17 @@ int DumpNCMPIIO::modify_param(int narg, char **arg)
 
       if (!strcmp(arg[iarg],"step")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNCMPIIO::compute_step;
+        perframe[i].compute = &DumpNetCDFMPIIO::compute_step;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else if (!strcmp(arg[iarg],"elapsed")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNCMPIIO::compute_elapsed;
+        perframe[i].compute = &DumpNetCDFMPIIO::compute_elapsed;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else if (!strcmp(arg[iarg],"elaplong")) {
         perframe[i].type = THIS_IS_A_BIGINT;
-        perframe[i].compute = &DumpNCMPIIO::compute_elapsed_long;
+        perframe[i].compute = &DumpNetCDFMPIIO::compute_elapsed_long;
         strcpy(perframe[i].name, arg[iarg]);
       }
       else {
@@ -1031,7 +1028,7 @@ int DumpNCMPIIO::modify_param(int narg, char **arg)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::ncerr(int err, const char *descr, int line)
+void DumpNetCDFMPIIO::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
     char errstr[1024];
@@ -1055,21 +1052,21 @@ void DumpNCMPIIO::ncerr(int err, const char *descr, int line)
    customize a new keyword by adding a method
 ------------------------------------------------------------------------- */
 
-void DumpNCMPIIO::compute_step(void *r)
+void DumpNetCDFMPIIO::compute_step(void *r)
 {
   *((bigint *) r) = update->ntimestep;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::compute_elapsed(void *r)
+void DumpNetCDFMPIIO::compute_elapsed(void *r)
 {
   *((bigint *) r) = update->ntimestep - update->firststep;
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNCMPIIO::compute_elapsed_long(void *r)
+void DumpNetCDFMPIIO::compute_elapsed_long(void *r)
 {
   *((bigint *) r) = update->ntimestep - update->beginstep;
 }
diff --git a/src/USER-NC-DUMP/dump_nc_mpiio.h b/src/USER-NETCDF/dump_netcdf_mpiio.h
similarity index 95%
rename from src/USER-NC-DUMP/dump_nc_mpiio.h
rename to src/USER-NETCDF/dump_netcdf_mpiio.h
index 5e36335e648978c2e5e5dc40486487b066c7888c..6f5b00b03350f6a08c9c603a01cbae8c49f90e7b 100644
--- a/src/USER-NC-DUMP/dump_nc_mpiio.h
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.h
@@ -32,16 +32,17 @@
 
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
+
 #if defined(LMP_HAS_PNETCDF)
 
 #ifdef DUMP_CLASS
 
-DumpStyle(nc/mpiio,DumpNCMPIIO)
+DumpStyle(netcdf/mpiio,DumpNetCDFMPIIO)
 
 #else
 
-#ifndef LMP_DUMP_NC_MPIIO_H
-#define LMP_DUMP_NC_MPIIO_H
+#ifndef LMP_DUMP_NETCDF_MPIIO_H
+#define LMP_DUMP_NETCDF_MPIIO_H
 
 #include "dump_custom.h"
 
@@ -50,10 +51,10 @@ namespace LAMMPS_NS {
 const int NC_MPIIO_FIELD_NAME_MAX = 100;
 const int DUMP_NC_MPIIO_MAX_DIMS  = 100;
 
-class DumpNCMPIIO : public DumpCustom {
+class DumpNetCDFMPIIO : public DumpCustom {
  public:
-  DumpNCMPIIO(class LAMMPS *, int, char **);
-  virtual ~DumpNCMPIIO();
+  DumpNetCDFMPIIO(class LAMMPS *, int, char **);
+  virtual ~DumpNetCDFMPIIO();
   virtual void write();
 
  private:
diff --git a/src/USER-OMP/angle_sdk_omp.h b/src/USER-OMP/angle_sdk_omp.h
index 9ab75904ceb54b8ce60997836829cb63868efde1..c041c2ecc26f912342fa7107ff36ed7de8867a10 100644
--- a/src/USER-OMP/angle_sdk_omp.h
+++ b/src/USER-OMP/angle_sdk_omp.h
@@ -18,7 +18,6 @@
 #ifdef ANGLE_CLASS
 
 AngleStyle(sdk/omp,AngleSDKOMP)
-AngleStyle(cg/cmm/omp,AngleSDKOMP)
 
 #else
 
diff --git a/src/USER-OMP/improper_ring_omp.cpp b/src/USER-OMP/improper_ring_omp.cpp
index bd7593c51ab2311e22d29d6114ef67fdbaa44f09..4eadc831833b9f2b3f9e26fd24a545ecafa962c4 100644
--- a/src/USER-OMP/improper_ring_omp.cpp
+++ b/src/USER-OMP/improper_ring_omp.cpp
@@ -206,7 +206,7 @@ void ImproperRingOMP::eval(int nfrom, int nto, ThrData * const thr)
         cfact2 = ckjji / ckjkj;
         cfact3 = ckjji / cjiji;
 
-        /* Calculate the force acted on the thrid atom of the angle. */
+        /* Calculate the force acted on the third atom of the angle. */
         fkx = cfact2 * bvec2x[icomb] - bvec1x[icomb];
         fky = cfact2 * bvec2y[icomb] - bvec1y[icomb];
         fkz = cfact2 * bvec2z[icomb] - bvec1z[icomb];
diff --git a/src/USER-OMP/pair_lj_sdk_coul_long_omp.h b/src/USER-OMP/pair_lj_sdk_coul_long_omp.h
index a615efb5071899529063bbe780899732c3aff2b5..1886d2c7b5959591b9ab54b32b92420842933232 100644
--- a/src/USER-OMP/pair_lj_sdk_coul_long_omp.h
+++ b/src/USER-OMP/pair_lj_sdk_coul_long_omp.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/coul/long/omp,PairLJSDKCoulLongOMP)
-PairStyle(cg/cmm/coul/long/omp,PairLJSDKCoulLongOMP)
 
 #else
 
diff --git a/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h b/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h
index 9e4a922c390f632cb0b1e5db30731f3ba383c680..9841408b8a06a55463e61097bd82cd2488bf2165 100644
--- a/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h
+++ b/src/USER-OMP/pair_lj_sdk_coul_msm_omp.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/coul/msm/omp,PairLJSDKCoulMSMOMP)
-PairStyle(cg/cmm/coul/msm/omp,PairLJSDKCoulMSMOMP)
 
 #else
 
@@ -54,4 +53,4 @@ E: Must use 'kspace_modify pressure/scalar no' with OMP MSM Pair styles
 
 The kspace scalar pressure option is not (yet) compatible with OMP MSM Pair styles.
 
-*/
\ No newline at end of file
+*/
diff --git a/src/USER-OMP/pair_lj_sdk_omp.h b/src/USER-OMP/pair_lj_sdk_omp.h
index c3837fb683c0b71d6378b6234c36074406b73f12..36c913252a9832897749608bd1fc4905308f71d8 100644
--- a/src/USER-OMP/pair_lj_sdk_omp.h
+++ b/src/USER-OMP/pair_lj_sdk_omp.h
@@ -18,7 +18,6 @@
 #ifdef PAIR_CLASS
 
 PairStyle(lj/sdk/omp,PairLJSDKOMP)
-PairStyle(cg/cmm/omp,PairLJSDKOMP)
 
 #else
 
diff --git a/src/USER-REAXC/compute_spec_atom.cpp b/src/USER-REAXC/compute_spec_atom.cpp
index 4af8efcae71b324da7d5b8a872f13e5e491dae59..164ce87205a15854d0a2fe1079d41d3f6af9818e 100644
--- a/src/USER-REAXC/compute_spec_atom.cpp
+++ b/src/USER-REAXC/compute_spec_atom.cpp
@@ -24,7 +24,7 @@
 
 #include "reaxc_defs.h"
 #include "reaxc_types.h"
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 
 using namespace LAMMPS_NS;
 
@@ -71,7 +71,7 @@ ComputeSpecAtom::ComputeSpecAtom(LAMMPS *lmp, int narg, char **arg) :
     } else if (strcmp(arg[iarg],"vz") == 0) {
       pack_choice[i] = &ComputeSpecAtom::pack_vz;
 
-    // from pair_reax_c
+    // from pair_reaxc
     } else if (strcmp(arg[iarg],"abo01") == 0) {
       pack_choice[i] = &ComputeSpecAtom::pack_abo01;
     } else if (strcmp(arg[iarg],"abo02") == 0) {
diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp
index 26cf03f60a838477d6fec05da6b58457c3f63895..01ecd9d3994fce173ac4dbf9ca6b619314c62b3a 100644
--- a/src/USER-REAXC/fix_qeq_reax.cpp
+++ b/src/USER-REAXC/fix_qeq_reax.cpp
@@ -23,7 +23,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "fix_qeq_reax.h"
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
@@ -375,7 +375,7 @@ void FixQEqReax::init_shielding()
 
   ntypes = atom->ntypes;
   if (shld == NULL)
-    memory->create(shld,ntypes+1,ntypes+1,"qeq:shileding");
+    memory->create(shld,ntypes+1,ntypes+1,"qeq:shielding");
 
   for( i = 1; i <= ntypes; ++i )
     for( j = 1; j <= ntypes; ++j )
diff --git a/src/USER-REAXC/fix_reax_c.cpp b/src/USER-REAXC/fix_reaxc.cpp
similarity index 99%
rename from src/USER-REAXC/fix_reax_c.cpp
rename to src/USER-REAXC/fix_reaxc.cpp
index e1cc4e340ef57efd2e68cd28a226d14c79b4bb33..df06217993da53bc31492e1ea85957883a4ef475 100644
--- a/src/USER-REAXC/fix_reax_c.cpp
+++ b/src/USER-REAXC/fix_reaxc.cpp
@@ -21,7 +21,7 @@
    Algorithmic Techniques", Parallel Computing, in press.
 ------------------------------------------------------------------------- */
 
-#include "fix_reax_c.h"
+#include "fix_reaxc.h"
 #include "atom.h"
 #include "pair.h"
 #include "comm.h"
diff --git a/src/USER-REAXC/fix_reax_c.h b/src/USER-REAXC/fix_reaxc.h
similarity index 100%
rename from src/USER-REAXC/fix_reax_c.h
rename to src/USER-REAXC/fix_reaxc.h
diff --git a/src/USER-REAXC/fix_reaxc_bonds.cpp b/src/USER-REAXC/fix_reaxc_bonds.cpp
index 543669de766cce7c3a1466988623af1f564b2dd0..cf9e4789c1342f232ffabc68eb5206a7cc24b149 100644
--- a/src/USER-REAXC/fix_reaxc_bonds.cpp
+++ b/src/USER-REAXC/fix_reaxc_bonds.cpp
@@ -21,7 +21,7 @@
 #include "fix_reaxc_bonds.h"
 #include "atom.h"
 #include "update.h"
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
diff --git a/src/USER-REAXC/fix_reaxc_species.cpp b/src/USER-REAXC/fix_reaxc_species.cpp
index ead73f02a1328cef35a4af076ee8e1f0818aa490..d291903fa8fb9ece2b071cf871f94e80104eb6ee 100644
--- a/src/USER-REAXC/fix_reaxc_species.cpp
+++ b/src/USER-REAXC/fix_reaxc_species.cpp
@@ -24,7 +24,7 @@
 #include "fix_reaxc_species.h"
 #include "domain.h"
 #include "update.h"
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "modify.h"
 #include "neighbor.h"
 #include "neigh_list.h"
diff --git a/src/USER-REAXC/fix_reaxc_species.h b/src/USER-REAXC/fix_reaxc_species.h
index 872ea2528f06df00cd27ee0e76fd19fa66fdad51..563a10f39db3913a49abbf97cb28b63975b64d51 100644
--- a/src/USER-REAXC/fix_reaxc_species.h
+++ b/src/USER-REAXC/fix_reaxc_species.h
@@ -23,7 +23,7 @@ FixStyle(reax/c/species,FixReaxCSpecies)
 #include "fix.h"
 #include "pointers.h"
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_types.h"
 #include "reaxc_defs.h"
 
diff --git a/src/USER-REAXC/pair_reax_c.cpp b/src/USER-REAXC/pair_reaxc.cpp
similarity index 97%
rename from src/USER-REAXC/pair_reax_c.cpp
rename to src/USER-REAXC/pair_reaxc.cpp
index 4933c90f01d8273f38566fa456dc3da2e43aa91b..d51b0fc2f8bb28b9516218daf1bb433948ef7b9b 100644
--- a/src/USER-REAXC/pair_reax_c.cpp
+++ b/src/USER-REAXC/pair_reaxc.cpp
@@ -20,7 +20,7 @@
    Hybrid and hybrid/overlay compatibility added by Ray Shan (Sandia)
 ------------------------------------------------------------------------- */
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "atom.h"
 #include "update.h"
 #include "force.h"
@@ -30,7 +30,7 @@
 #include "neigh_request.h"
 #include "modify.h"
 #include "fix.h"
-#include "fix_reax_c.h"
+#include "fix_reaxc.h"
 #include "citeme.h"
 #include "memory.h"
 #include "error.h"
@@ -223,10 +223,11 @@ void PairReaxC::settings(int narg, char **arg)
 
   qeqflag = 1;
   control->lgflag = 0;
+  control->enobondsflag = 1;
   system->mincap = MIN_CAP;
   system->safezone = SAFE_ZONE;
   system->saferzone = SAFER_ZONE;
-
+
   // process optional keywords
 
   int iarg = 1;
@@ -238,7 +239,13 @@ void PairReaxC::settings(int narg, char **arg)
       else if (strcmp(arg[iarg+1],"no") == 0) qeqflag = 0;
       else error->all(FLERR,"Illegal pair_style reax/c command");
       iarg += 2;
-    } else if (strcmp(arg[iarg],"lgvdw") == 0) {
+    } else if (strcmp(arg[iarg],"enobonds") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal pair_style reax/c command");
+      if (strcmp(arg[iarg+1],"yes") == 0) control->enobondsflag = 1;
+      else if (strcmp(arg[iarg+1],"no") == 0) control->enobondsflag = 0;
+      else error->all(FLERR,"Illegal pair_style reax/c command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"lgvdw") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal pair_style reax/c command");
       if (strcmp(arg[iarg+1],"yes") == 0) control->lgflag = 1;
       else if (strcmp(arg[iarg+1],"no") == 0) control->lgflag = 0;
diff --git a/src/USER-REAXC/pair_reax_c.h b/src/USER-REAXC/pair_reaxc.h
similarity index 100%
rename from src/USER-REAXC/pair_reax_c.h
rename to src/USER-REAXC/pair_reaxc.h
diff --git a/src/USER-REAXC/reaxc_allocate.cpp b/src/USER-REAXC/reaxc_allocate.cpp
index dc8545e0069f0deafb38dac5484379e40a386915..969912e082ed8e7419ded19ee33c0ded09587c80 100644
--- a/src/USER-REAXC/reaxc_allocate.cpp
+++ b/src/USER-REAXC/reaxc_allocate.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_allocate.h"
 #include "reaxc_list.h"
 #include "reaxc_reset_tools.h"
diff --git a/src/USER-REAXC/reaxc_bond_orders.cpp b/src/USER-REAXC/reaxc_bond_orders.cpp
index 0b4ca21adf4e128e21d9701486f8127e3f0d28d2..04cedf18a8e5d7c49699e48a1595fdc220503ed5 100644
--- a/src/USER-REAXC/reaxc_bond_orders.cpp
+++ b/src/USER-REAXC/reaxc_bond_orders.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_types.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_bonds.cpp b/src/USER-REAXC/reaxc_bonds.cpp
index e0ef38ba0f2788fa739b1ed53e701f234721ac2d..a8a1298166e239c173d559616f568847c137fdc2 100644
--- a/src/USER-REAXC/reaxc_bonds.cpp
+++ b/src/USER-REAXC/reaxc_bonds.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_bonds.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_control.cpp b/src/USER-REAXC/reaxc_control.cpp
index 3753360c68593c1f704ad9bf137839e95aa061c0..4def41bc8c7c309f1f3c4c0a47e1d74a54f8a758 100644
--- a/src/USER-REAXC/reaxc_control.cpp
+++ b/src/USER-REAXC/reaxc_control.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_control.h"
 #include "reaxc_tool_box.h"
 
diff --git a/src/USER-REAXC/reaxc_defs.h b/src/USER-REAXC/reaxc_defs.h
index d0a75d431baa506cbe7a2b9d584b68542d8f5010..101b554fb23fe5247c7256dd4622f0974596339f 100644
--- a/src/USER-REAXC/reaxc_defs.h
+++ b/src/USER-REAXC/reaxc_defs.h
@@ -116,8 +116,8 @@
 
 #define MAX_BOND 20
 
-#define MAXREAXBOND 24 /* used in fix_reaxc_bonds.cpp and pair_reax_c.cpp */
-#define MAXSPECBOND 24 /* used in fix_reaxc_species.cpp and pair_reax_c.cpp */
+#define MAXREAXBOND 24 /* used in fix_reaxc_bonds.cpp and pair_reaxc.cpp */
+#define MAXSPECBOND 24 /* used in fix_reaxc_species.cpp and pair_reaxc.cpp */
 
 /******************* ENUMERATIONS *************************/
 enum geo_formats { CUSTOM, PDB, ASCII_RESTART, BINARY_RESTART, GF_N };
diff --git a/src/USER-REAXC/reaxc_ffield.cpp b/src/USER-REAXC/reaxc_ffield.cpp
index fda284140349885e083c50fe17e79f4d86489ec9..58a347ebf781daf0ffe4e3a1596cc28b944c2362 100644
--- a/src/USER-REAXC/reaxc_ffield.cpp
+++ b/src/USER-REAXC/reaxc_ffield.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "error.h"
 #include "reaxc_ffield.h"
 #include "reaxc_tool_box.h"
diff --git a/src/USER-REAXC/reaxc_forces.cpp b/src/USER-REAXC/reaxc_forces.cpp
index 7f11f5565febf8fbeb2f3604c6976a1b259588b5..215ded6e5d0b2bbec7353c0a30b5ed632e32c69c 100644
--- a/src/USER-REAXC/reaxc_forces.cpp
+++ b/src/USER-REAXC/reaxc_forces.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_forces.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_bonds.h"
diff --git a/src/USER-REAXC/reaxc_hydrogen_bonds.cpp b/src/USER-REAXC/reaxc_hydrogen_bonds.cpp
index 8d7b3b381997575806e38f71df9f195c10ff4d33..ff771ad65bc96f6e987a2707863c93fcfa712607 100644
--- a/src/USER-REAXC/reaxc_hydrogen_bonds.cpp
+++ b/src/USER-REAXC/reaxc_hydrogen_bonds.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_hydrogen_bonds.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_init_md.cpp b/src/USER-REAXC/reaxc_init_md.cpp
index f912c95ea5b8ba13a99352dad21e180b1dc31a66..b11cdd2fbcddb8bd235bf410ac349a1e39328b47 100644
--- a/src/USER-REAXC/reaxc_init_md.cpp
+++ b/src/USER-REAXC/reaxc_init_md.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_init_md.h"
 #include "reaxc_allocate.h"
 #include "reaxc_forces.h"
diff --git a/src/USER-REAXC/reaxc_io_tools.cpp b/src/USER-REAXC/reaxc_io_tools.cpp
index 0c14dad5d43a0300b4ce3e3c049960c3acc5c476..4d58f7514de5eefc6910fb9e345e9fb16ad64459 100644
--- a/src/USER-REAXC/reaxc_io_tools.cpp
+++ b/src/USER-REAXC/reaxc_io_tools.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "update.h"
 #include "reaxc_io_tools.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_list.cpp b/src/USER-REAXC/reaxc_list.cpp
index d22ac4ca7f7d5a5b3b1cc628e945638bb702a13f..2755d5506e7b78c456dabe06cb9caab72021594f 100644
--- a/src/USER-REAXC/reaxc_list.cpp
+++ b/src/USER-REAXC/reaxc_list.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_list.h"
 #include "reaxc_tool_box.h"
 
diff --git a/src/USER-REAXC/reaxc_lookup.cpp b/src/USER-REAXC/reaxc_lookup.cpp
index 903e54962dcc9f39a285982ed0999e31597b00cd..9db8b7b9f650105c9c6c9d5bed90634a49b2e29f 100644
--- a/src/USER-REAXC/reaxc_lookup.cpp
+++ b/src/USER-REAXC/reaxc_lookup.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_lookup.h"
 #include "reaxc_nonbonded.h"
 #include "reaxc_tool_box.h"
diff --git a/src/USER-REAXC/reaxc_multi_body.cpp b/src/USER-REAXC/reaxc_multi_body.cpp
index 1923668e89bf08160750eb78c0083e5c18b78b1d..ecfd3ad04d7fc179cf7a1806036ae8c7d58b7c29 100644
--- a/src/USER-REAXC/reaxc_multi_body.cpp
+++ b/src/USER-REAXC/reaxc_multi_body.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_multi_body.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
@@ -79,7 +79,7 @@ void Atom_Energy( reax_system *system, control_params *control,
       numbonds ++;
 
     /* calculate the energy */
-    if (numbonds > 0)
+    if (numbonds > 0 || control->enobondsflag)
       data->my_en.e_lp += e_lp =
         p_lp2 * workspace->Delta_lp[i] * inv_expvd2;
 
@@ -87,7 +87,8 @@ void Atom_Energy( reax_system *system, control_params *control,
       75 * p_lp2 * workspace->Delta_lp[i] * expvd2 * SQR(inv_expvd2);
     CElp = dElp * workspace->dDelta_lp[i];
 
-    if (numbonds > 0) workspace->CdDelta[i] += CElp;  // lp - 1st term
+    if (numbonds > 0 || control->enobondsflag)
+      workspace->CdDelta[i] += CElp;  // lp - 1st term
 
     /* tally into per-atom energy */
     if( system->pair_ptr->evflag)
@@ -187,7 +188,7 @@ void Atom_Energy( reax_system *system, control_params *control,
     for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj )
       numbonds ++;
 
-    if (numbonds > 0)
+    if (numbonds > 0 || control->enobondsflag)
       data->my_en.e_un += e_un =
         -p_ovun5 * (1.0 - exp_ovun6) * inv_exp_ovun2n * inv_exp_ovun8;
 
@@ -202,13 +203,15 @@ void Atom_Energy( reax_system *system, control_params *control,
     /* tally into per-atom energy */
     if( system->pair_ptr->evflag) {
       eng_tmp = e_ov;
-      if (numbonds > 0) eng_tmp += e_un;
+      if (numbonds > 0 || control->enobondsflag)
+        eng_tmp += e_un;
       system->pair_ptr->ev_tally(i,i,system->n,1,eng_tmp,0.0,0.0,0.0,0.0,0.0);
     }
 
     /* forces */
     workspace->CdDelta[i] += CEover3;   // OvCoor - 2nd term
-    if (numbonds > 0) workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
+    if (numbonds > 0 || control->enobondsflag)
+      workspace->CdDelta[i] += CEunder3;  // UnCoor - 1st term
 
     for( pj = Start_Index(i, bonds); pj < End_Index(i, bonds); ++pj ) {
       pbond = &(bonds->select.bond_list[pj]);
diff --git a/src/USER-REAXC/reaxc_nonbonded.cpp b/src/USER-REAXC/reaxc_nonbonded.cpp
index cb24e2dc3704f8fa28008e336224b6352b185ecc..9c223428a68dfe386c52345a3a89193b2d2161f1 100644
--- a/src/USER-REAXC/reaxc_nonbonded.cpp
+++ b/src/USER-REAXC/reaxc_nonbonded.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_types.h"
 #include "reaxc_nonbonded.h"
 #include "reaxc_bond_orders.h"
diff --git a/src/USER-REAXC/reaxc_reset_tools.cpp b/src/USER-REAXC/reaxc_reset_tools.cpp
index 1e6aeab4753148247f5182b545d2f62afe3c10cc..4ec744e7b1c0a9ff1f823951690d6f86de0b4cc4 100644
--- a/src/USER-REAXC/reaxc_reset_tools.cpp
+++ b/src/USER-REAXC/reaxc_reset_tools.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_reset_tools.h"
 #include "reaxc_list.h"
 #include "reaxc_tool_box.h"
diff --git a/src/USER-REAXC/reaxc_system_props.cpp b/src/USER-REAXC/reaxc_system_props.cpp
index 6b4551a03f7c4f810ece3d0366fdc99c3ac4794b..54eeb6da1e863108148a11c169f20311288355c7 100644
--- a/src/USER-REAXC/reaxc_system_props.cpp
+++ b/src/USER-REAXC/reaxc_system_props.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_system_props.h"
 #include "reaxc_tool_box.h"
 #include "reaxc_vector.h"
diff --git a/src/USER-REAXC/reaxc_tool_box.cpp b/src/USER-REAXC/reaxc_tool_box.cpp
index 22576e9f3bba5d7eb4bf04cfcb88a9105b31ca85..4fc6796efe0cc67c2c874b7180a1df58b90b7323 100644
--- a/src/USER-REAXC/reaxc_tool_box.cpp
+++ b/src/USER-REAXC/reaxc_tool_box.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_tool_box.h"
 
 struct timeval tim;
diff --git a/src/USER-REAXC/reaxc_torsion_angles.cpp b/src/USER-REAXC/reaxc_torsion_angles.cpp
index 2cfe3297657a7e48f920fac2c014cafdaaca8cad..74d5b04f20b7ce2ac9a846c4e3716dfd1edd0a5e 100644
--- a/src/USER-REAXC/reaxc_torsion_angles.cpp
+++ b/src/USER-REAXC/reaxc_torsion_angles.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_torsion_angles.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_traj.cpp b/src/USER-REAXC/reaxc_traj.cpp
index 9d4fa73524078599bed83aa80642d0507fbbcfc8..ae2bba2150673f13cf3523fcb4ca1e78ab8d81e4 100644
--- a/src/USER-REAXC/reaxc_traj.cpp
+++ b/src/USER-REAXC/reaxc_traj.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_traj.h"
 #include "reaxc_list.h"
 #include "reaxc_tool_box.h"
diff --git a/src/USER-REAXC/reaxc_types.h b/src/USER-REAXC/reaxc_types.h
index db4cf04178f4c722da766c980555b06b14705444..b3e2f40f02d1dceb06b0434b84c55f8f052a5efd 100644
--- a/src/USER-REAXC/reaxc_types.h
+++ b/src/USER-REAXC/reaxc_types.h
@@ -61,13 +61,12 @@
 #define MAX_BOND                    20  // same as reaxc_defs.h
 
 /********************** TYPE DEFINITIONS ********************/
-typedef int  ivec[3];
+typedef int ivec[3];
 typedef double rvec[3];
 typedef double rtensor[3][3];
 typedef double rvec2[2];
 typedef double rvec4[4];
 
-
 // import LAMMPS' definition of tagint and bigint
 typedef LAMMPS_NS::tagint rc_tagint;
 typedef LAMMPS_NS::bigint rc_bigint;
@@ -79,7 +78,6 @@ typedef struct
   void *out_atoms;
 } mpi_out_data;
 
-
 typedef struct
 {
   MPI_Comm     world;
@@ -107,7 +105,6 @@ typedef struct
   void *in2_buffer;
 } mpi_datatypes;
 
-
 typedef struct
 {
   int n_global;
@@ -115,8 +112,6 @@ typedef struct
   int vdw_type;
 } global_parameters;
 
-
-
 typedef struct
 {
   /* Line one in field file */
@@ -163,8 +158,6 @@ typedef struct
 
 } single_body_parameters;
 
-
-
 /* Two Body Parameters */
 typedef struct {
   /* Bond Order parameters */
@@ -193,8 +186,6 @@ typedef struct {
   double v13cor, ovc;
 } two_body_parameters;
 
-
-
 /* 3-body parameters */
 typedef struct {
   /* valence angle */
@@ -214,15 +205,11 @@ typedef struct{
   three_body_parameters prm[REAX_MAX_3BODY_PARAM];
 } three_body_header;
 
-
-
 /* hydrogen-bond parameters */
 typedef struct{
   double r0_hb, p_hb1, p_hb2, p_hb3;
 } hbond_parameters;
 
-
-
 /* 4-body parameters */
 typedef struct {
   double V1, V2, V3;
@@ -234,14 +221,12 @@ typedef struct {
   double p_cot1;
 } four_body_parameters;
 
-
 typedef struct
 {
   int cnt;
   four_body_parameters prm[REAX_MAX_4BODY_PARAM];
 } four_body_header;
 
-
 typedef struct
 {
   int num_atom_types;
@@ -253,8 +238,6 @@ typedef struct
   four_body_header ****fbp;
 } reax_interaction;
 
-
-
 struct _reax_atom
 {
   rc_tagint  orig_id;
@@ -283,8 +266,6 @@ struct _reax_atom
 };
 typedef _reax_atom reax_atom;
 
-
-
 typedef struct
 {
   double V;
@@ -295,8 +276,6 @@ typedef struct
   rtensor g;
 } simulation_box;
 
-
-
 struct grid_cell
 {
   double cutoff;
@@ -471,7 +450,8 @@ typedef struct
   int  restrict_type;
 
   int lgflag;
-
+  int enobondsflag;
+  
 } control_params;
 
 
diff --git a/src/USER-REAXC/reaxc_valence_angles.cpp b/src/USER-REAXC/reaxc_valence_angles.cpp
index c2b3287be5fe71bb68f8a328ca463e4c678fed7f..c92996e56b9a776735bbe2eab7fa06819ef18eb2 100644
--- a/src/USER-REAXC/reaxc_valence_angles.cpp
+++ b/src/USER-REAXC/reaxc_valence_angles.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_valence_angles.h"
 #include "reaxc_bond_orders.h"
 #include "reaxc_list.h"
diff --git a/src/USER-REAXC/reaxc_vector.cpp b/src/USER-REAXC/reaxc_vector.cpp
index ee63e94280844fc07b050ad131a7681f12a5cb54..977b17a6dc2eb70a9910eb1d12a87f3cbd04b93b 100644
--- a/src/USER-REAXC/reaxc_vector.cpp
+++ b/src/USER-REAXC/reaxc_vector.cpp
@@ -24,7 +24,7 @@
   <http://www.gnu.org/licenses/>.
   ----------------------------------------------------------------------*/
 
-#include "pair_reax_c.h"
+#include "pair_reaxc.h"
 #include "reaxc_vector.h"
 
 
diff --git a/src/USER-TALLY/compute_force_tally.cpp b/src/USER-TALLY/compute_force_tally.cpp
index e9ecedd5abc3287e276cec82e6d0b3bd22924cd7..e97a1c751c323e4482d14477c1b76f75a448c2eb 100644
--- a/src/USER-TALLY/compute_force_tally.cpp
+++ b/src/USER-TALLY/compute_force_tally.cpp
@@ -65,12 +65,12 @@ ComputeForceTally::~ComputeForceTally()
 void ComputeForceTally::init()
 {
   if (force->pair == NULL)
-    error->all(FLERR,"Trying to use compute force/tally with no pair style");
+    error->all(FLERR,"Trying to use compute force/tally without pair style");
   else
     force->pair->add_tally_callback(this);
 
   if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->all(FLERR,"Compute force/tally used with incompatible pair style.");
+    error->warning(FLERR,"Compute force/tally used with incompatible pair style");
 
   if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
                           || force->improper || force->kspace))
diff --git a/src/USER-TALLY/compute_heat_flux_tally.cpp b/src/USER-TALLY/compute_heat_flux_tally.cpp
index 214311cb3df44414967a0045f73dfe1275e3bc6a..48cad538d5e09d5f9e32eff23ce3cbf02fc43afd 100644
--- a/src/USER-TALLY/compute_heat_flux_tally.cpp
+++ b/src/USER-TALLY/compute_heat_flux_tally.cpp
@@ -64,12 +64,12 @@ ComputeHeatFluxTally::~ComputeHeatFluxTally()
 void ComputeHeatFluxTally::init()
 {
   if (force->pair == NULL)
-    error->all(FLERR,"Trying to use compute heat/flux/tally with no pair style");
+    error->all(FLERR,"Trying to use compute heat/flux/tally without pair style");
   else
     force->pair->add_tally_callback(this);
 
   if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->all(FLERR,"Compute heat/flux/tally used with incompatible pair style.");
+    error->warning(FLERR,"Compute heat/flux/tally used with incompatible pair style");
 
   if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
                           || force->improper || force->kspace))
diff --git a/src/USER-TALLY/compute_pe_mol_tally.cpp b/src/USER-TALLY/compute_pe_mol_tally.cpp
index 09ee04d57adc9464658c1858d16a58efce8428cd..a30f2d6b9a9c9bc0d8edf509059a5f054abc0b49 100644
--- a/src/USER-TALLY/compute_pe_mol_tally.cpp
+++ b/src/USER-TALLY/compute_pe_mol_tally.cpp
@@ -59,15 +59,15 @@ ComputePEMolTally::~ComputePEMolTally()
 void ComputePEMolTally::init()
 {
   if (force->pair == NULL)
-    error->all(FLERR,"Trying to use compute pe/mol/tally with no pair style");
+    error->all(FLERR,"Trying to use compute pe/mol/tally without pair style");
   else
     force->pair->add_tally_callback(this);
 
   if (atom->molecule_flag == 0)
-    error->all(FLERR,"Compute pe/mol/tally requires molecule IDs.");
+    error->all(FLERR,"Compute pe/mol/tally requires molecule IDs");
 
   if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->all(FLERR,"Compute pe/mol/tally used with incompatible pair style.");
+    error->warning(FLERR,"Compute pe/mol/tally used with incompatible pair style");
 
   if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
                           || force->improper || force->kspace))
diff --git a/src/USER-TALLY/compute_pe_tally.cpp b/src/USER-TALLY/compute_pe_tally.cpp
index 68c00b6d2e063194cdd9465d888917107961dfa5..2117f2cb15952e09189d0600a4b96ad17ed39f74 100644
--- a/src/USER-TALLY/compute_pe_tally.cpp
+++ b/src/USER-TALLY/compute_pe_tally.cpp
@@ -64,12 +64,12 @@ ComputePETally::~ComputePETally()
 void ComputePETally::init()
 {
   if (force->pair == NULL)
-    error->all(FLERR,"Trying to use compute pe/tally with no pair style");
+    error->all(FLERR,"Trying to use compute pe/tally without a pair style");
   else
     force->pair->add_tally_callback(this);
 
   if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->all(FLERR,"Compute pe/tally used with incompatible pair style.");
+    error->warning(FLERR,"Compute pe/tally used with incompatible pair style");
 
   if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
                           || force->improper || force->kspace))
diff --git a/src/USER-TALLY/compute_stress_tally.cpp b/src/USER-TALLY/compute_stress_tally.cpp
index 2575bd372a1f45d23f890b399f4165d72aa878bc..66df9f6e4ffc28d9dc68587e1393c03a7caf3f34 100644
--- a/src/USER-TALLY/compute_stress_tally.cpp
+++ b/src/USER-TALLY/compute_stress_tally.cpp
@@ -65,12 +65,12 @@ ComputeStressTally::~ComputeStressTally()
 void ComputeStressTally::init()
 {
   if (force->pair == NULL)
-    error->all(FLERR,"Trying to use compute stress/tally with no pair style");
+    error->all(FLERR,"Trying to use compute stress/tally without pair style");
   else
     force->pair->add_tally_callback(this);
 
   if (force->pair->single_enable == 0 || force->pair->manybody_flag)
-    error->all(FLERR,"Compute stress/tally used with incompatible pair style.");
+    error->warning(FLERR,"Compute stress/tally used with incompatible pair style");
 
   if ((comm->me == 0) && (force->bond || force->angle || force->dihedral
                           || force->improper || force->kspace))
diff --git a/src/USER-VTK/README b/src/USER-VTK/README
index 86ef56a7408eb2c1d0d49ae986e45734e7345566..3429c96b7282f2caa886ab707442e7b3fd0013bf 100644
--- a/src/USER-VTK/README
+++ b/src/USER-VTK/README
@@ -1,17 +1,17 @@
-This package implements the "dump custom/vtk" command which can be used in a
+This package implements the "dump vtk" command which can be used in a
 LAMMPS input script.
 
-This dump allows to output atom data similar to dump custom, but directly into
-VTK files.
+This dump allows output of atom data similar to the dump custom
+command, but in VTK format.
 
-This package uses the VTK library (www.vtk.org) which must be installed on your
-system. See the lib/vtk/README file and the LAMMPS manual for information on
-building LAMMPS with external libraries. The settings in the Makefile.lammps
-file in that directory must be correct for LAMMPS to build correctly with this
-package installed.
+This package uses the VTK library (www.vtk.org), which must be
+installed on your system. See the lib/vtk/README file and the LAMMPS
+manual for information on building LAMMPS with external libraries.
+The settings in the Makefile.lammps file in that directory must be
+correct for LAMMPS to build correctly with this package installed.
 
-This code was initially developed for LIGGGHTS by Daniel Queteschiner at DCS
-Computing. This is an effort to integrate it back to LAMMPS.
+This code was initially developed for LIGGGHTS by Daniel Queteschiner
+at DCS Computing. This is an effort to integrate it back into LAMMPS.
 
 The person who created this package is Richard Berger at JKU
 (richard.berger@jku.at). Contact him directly if you have questions.
diff --git a/src/USER-VTK/dump_custom_vtk.cpp b/src/USER-VTK/dump_vtk.cpp
similarity index 91%
rename from src/USER-VTK/dump_custom_vtk.cpp
rename to src/USER-VTK/dump_vtk.cpp
index 0e4bc459766a69b8864724454b88a4f5997547ab..0aa749e73b18237f22b0b2afb8fc7b54f82ee588 100644
--- a/src/USER-VTK/dump_custom_vtk.cpp
+++ b/src/USER-VTK/dump_vtk.cpp
@@ -25,7 +25,7 @@
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
-#include "dump_custom_vtk.h"
+#include "dump_vtk.h"
 #include "atom.h"
 #include "force.h"
 #include "domain.h"
@@ -39,12 +39,15 @@
 #include "fix.h"
 #include "memory.h"
 #include "error.h"
+
 #include <vector>
 #include <sstream>
 #include <vtkVersion.h>
+
 #ifndef VTK_MAJOR_VERSION
 #include <vtkConfigure.h>
 #endif
+
 #include <vtkPointData.h>
 #include <vtkCellData.h>
 #include <vtkDoubleArray.h>
@@ -93,10 +96,10 @@ enum{VTK,VTP,VTU,PVTP,PVTU}; // file formats
 
 /* ---------------------------------------------------------------------- */
 
-DumpCustomVTK::DumpCustomVTK(LAMMPS *lmp, int narg, char **arg) :
+DumpVTK::DumpVTK(LAMMPS *lmp, int narg, char **arg) :
   DumpCustom(lmp, narg, arg)
 {
-  if (narg == 5) error->all(FLERR,"No dump custom/vtk arguments specified");
+  if (narg == 5) error->all(FLERR,"No dump vtk arguments specified");
 
   pack_choice.clear();
   vtype.clear();
@@ -113,7 +116,7 @@ DumpCustomVTK::DumpCustomVTK(LAMMPS *lmp, int narg, char **arg) :
 
   if (ioptional < narg &&
       strcmp(style,"image") != 0 && strcmp(style,"movie") != 0)
-    error->all(FLERR,"Invalid attribute in dump custom command");
+    error->all(FLERR,"Invalid attribute in dump vtk command");
   size_one = pack_choice.size();
   current_pack_choice_key = -1;
 
@@ -162,7 +165,7 @@ DumpCustomVTK::DumpCustomVTK(LAMMPS *lmp, int narg, char **arg) :
 
 /* ---------------------------------------------------------------------- */
 
-DumpCustomVTK::~DumpCustomVTK()
+DumpVTK::~DumpVTK()
 {
   delete [] filecurrent;
   delete [] domainfilecurrent;
@@ -173,7 +176,7 @@ DumpCustomVTK::~DumpCustomVTK()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::init_style()
+void DumpVTK::init_style()
 {
   // default for element names = C
 
@@ -191,14 +194,14 @@ void DumpCustomVTK::init_style()
 
   // setup function ptrs
 
-  header_choice = &DumpCustomVTK::header_vtk;
+  header_choice = &DumpVTK::header_vtk;
 
   if (vtk_file_format == VTP || vtk_file_format == PVTP)
-    write_choice = &DumpCustomVTK::write_vtp;
+    write_choice = &DumpVTK::write_vtp;
   else if (vtk_file_format == VTU || vtk_file_format == PVTU)
-    write_choice = &DumpCustomVTK::write_vtu;
+    write_choice = &DumpVTK::write_vtu;
   else
-    write_choice = &DumpCustomVTK::write_vtk;
+    write_choice = &DumpVTK::write_vtk;
 
   // find current ptr for each compute,fix,variable
   // check that fix frequency is acceptable
@@ -206,24 +209,24 @@ void DumpCustomVTK::init_style()
   int icompute;
   for (int i = 0; i < ncompute; i++) {
     icompute = modify->find_compute(id_compute[i]);
-    if (icompute < 0) error->all(FLERR,"Could not find dump custom/vtk compute ID");
+    if (icompute < 0) error->all(FLERR,"Could not find dump vtk compute ID");
     compute[i] = modify->compute[icompute];
   }
 
   int ifix;
   for (int i = 0; i < nfix; i++) {
     ifix = modify->find_fix(id_fix[i]);
-    if (ifix < 0) error->all(FLERR,"Could not find dump custom/vtk fix ID");
+    if (ifix < 0) error->all(FLERR,"Could not find dump vtk fix ID");
     fix[i] = modify->fix[ifix];
     if (nevery % modify->fix[ifix]->peratom_freq)
-      error->all(FLERR,"Dump custom/vtk and fix not computed at compatible times");
+      error->all(FLERR,"Dump vtk and fix not computed at compatible times");
   }
 
   int ivariable;
   for (int i = 0; i < nvariable; i++) {
     ivariable = input->variable->find(id_variable[i]);
     if (ivariable < 0)
-      error->all(FLERR,"Could not find dump custom/vtk variable name");
+      error->all(FLERR,"Could not find dump vtk variable name");
     variable[i] = ivariable;
   }
 
@@ -239,25 +242,25 @@ void DumpCustomVTK::init_style()
   if (iregion >= 0) {
     iregion = domain->find_region(idregion);
     if (iregion == -1)
-      error->all(FLERR,"Region ID for dump custom/vtk does not exist");
+      error->all(FLERR,"Region ID for dump vtk does not exist");
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_header(bigint)
+void DumpVTK::write_header(bigint)
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::header_vtk(bigint)
+void DumpVTK::header_vtk(bigint)
 {
 }
 
 /* ---------------------------------------------------------------------- */
 
-int DumpCustomVTK::count()
+int DumpVTK::count()
 {
   n_calls_ = 0;
 
@@ -807,7 +810,7 @@ int DumpCustomVTK::count()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write()
+void DumpVTK::write()
 {
   // simulation box bounds
 
@@ -905,7 +908,7 @@ void DumpCustomVTK::write()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::pack(tagint *ids)
+void DumpVTK::pack(tagint *ids)
 {
   int n = 0;
   for (std::map<int,FnPtrPack>::iterator it=pack_choice.begin(); it!=pack_choice.end(); ++it, ++n) {
@@ -922,14 +925,14 @@ void DumpCustomVTK::pack(tagint *ids)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_data(int n, double *mybuf)
+void DumpVTK::write_data(int n, double *mybuf)
 {
   (this->*write_choice)(n,mybuf);
 }
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::setFileCurrent() {
+void DumpVTK::setFileCurrent() {
   delete [] filecurrent;
   filecurrent = NULL;
 
@@ -1064,7 +1067,7 @@ void DumpCustomVTK::setFileCurrent() {
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::buf2arrays(int n, double *mybuf)
+void DumpVTK::buf2arrays(int n, double *mybuf)
 {
   for (int iatom=0; iatom < n; ++iatom) {
     vtkIdType pid[1];
@@ -1123,7 +1126,7 @@ void DumpCustomVTK::buf2arrays(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::prepare_domain_data(vtkRectilinearGrid *rgrid)
+void DumpVTK::prepare_domain_data(vtkRectilinearGrid *rgrid)
 {
   vtkSmartPointer<vtkDoubleArray> xCoords =  vtkSmartPointer<vtkDoubleArray>::New();
   xCoords->InsertNextValue(boxxlo);
@@ -1143,7 +1146,7 @@ void DumpCustomVTK::prepare_domain_data(vtkRectilinearGrid *rgrid)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::prepare_domain_data_triclinic(vtkUnstructuredGrid *hexahedronGrid)
+void DumpVTK::prepare_domain_data_triclinic(vtkUnstructuredGrid *hexahedronGrid)
 {
   vtkSmartPointer<vtkPoints> hexahedronPoints = vtkSmartPointer<vtkPoints>::New();
   hexahedronPoints->SetNumberOfPoints(8);
@@ -1173,7 +1176,7 @@ void DumpCustomVTK::prepare_domain_data_triclinic(vtkUnstructuredGrid *hexahedro
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_domain_vtk()
+void DumpVTK::write_domain_vtk()
 {
   vtkSmartPointer<vtkRectilinearGrid> rgrid = vtkSmartPointer<vtkRectilinearGrid>::New();
   prepare_domain_data(rgrid.GetPointer());
@@ -1197,7 +1200,7 @@ void DumpCustomVTK::write_domain_vtk()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_domain_vtk_triclinic()
+void DumpVTK::write_domain_vtk_triclinic()
 {
   vtkSmartPointer<vtkUnstructuredGrid> hexahedronGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
   prepare_domain_data_triclinic(hexahedronGrid.GetPointer());
@@ -1221,7 +1224,7 @@ void DumpCustomVTK::write_domain_vtk_triclinic()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_domain_vtr()
+void DumpVTK::write_domain_vtr()
 {
   vtkSmartPointer<vtkRectilinearGrid> rgrid = vtkSmartPointer<vtkRectilinearGrid>::New();
   prepare_domain_data(rgrid.GetPointer());
@@ -1242,7 +1245,7 @@ void DumpCustomVTK::write_domain_vtr()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_domain_vtu_triclinic()
+void DumpVTK::write_domain_vtu_triclinic()
 {
   vtkSmartPointer<vtkUnstructuredGrid> hexahedronGrid = vtkSmartPointer<vtkUnstructuredGrid>::New();
   prepare_domain_data_triclinic(hexahedronGrid.GetPointer());
@@ -1263,7 +1266,7 @@ void DumpCustomVTK::write_domain_vtu_triclinic()
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_vtk(int n, double *mybuf)
+void DumpVTK::write_vtk(int n, double *mybuf)
 {
   ++n_calls_;
 
@@ -1330,7 +1333,7 @@ void DumpCustomVTK::write_vtk(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_vtp(int n, double *mybuf)
+void DumpVTK::write_vtp(int n, double *mybuf)
 {
   ++n_calls_;
 
@@ -1394,7 +1397,7 @@ void DumpCustomVTK::write_vtp(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::write_vtu(int n, double *mybuf)
+void DumpVTK::write_vtu(int n, double *mybuf)
 {
   ++n_calls_;
 
@@ -1457,7 +1460,7 @@ void DumpCustomVTK::write_vtu(int n, double *mybuf)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::reset_vtk_data_containers()
+void DumpVTK::reset_vtk_data_containers()
 {
   points = vtkSmartPointer<vtkPoints>::New();
   pointsCells = vtkSmartPointer<vtkCellArray>::New();
@@ -1489,16 +1492,16 @@ void DumpCustomVTK::reset_vtk_data_containers()
 
 /* ---------------------------------------------------------------------- */
 
-int DumpCustomVTK::parse_fields(int narg, char **arg)
+int DumpVTK::parse_fields(int narg, char **arg)
 {
 
-  pack_choice[X] = &DumpCustomVTK::pack_x;
+  pack_choice[X] = &DumpVTK::pack_x;
   vtype[X] = DOUBLE;
   name[X] = "x";
-  pack_choice[Y] = &DumpCustomVTK::pack_y;
+  pack_choice[Y] = &DumpVTK::pack_y;
   vtype[Y] = DOUBLE;
   name[Y] = "y";
-  pack_choice[Z] = &DumpCustomVTK::pack_z;
+  pack_choice[Z] = &DumpVTK::pack_z;
   vtype[Z] = DOUBLE;
   name[Z] = "z";
 
@@ -1508,33 +1511,33 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     i = iarg-5;
 
     if (strcmp(arg[iarg],"id") == 0) {
-      pack_choice[ID] = &DumpCustomVTK::pack_id;
+      pack_choice[ID] = &DumpVTK::pack_id;
       vtype[ID] = INT;
       name[ID] = arg[iarg];
     } else if (strcmp(arg[iarg],"mol") == 0) {
       if (!atom->molecule_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[MOL] = &DumpCustomVTK::pack_molecule;
+      pack_choice[MOL] = &DumpVTK::pack_molecule;
       vtype[MOL] = INT;
       name[MOL] = arg[iarg];
     } else if (strcmp(arg[iarg],"proc") == 0) {
-      pack_choice[PROC] = &DumpCustomVTK::pack_proc;
+      pack_choice[PROC] = &DumpVTK::pack_proc;
       vtype[PROC] = INT;
       name[PROC] = arg[iarg];
     } else if (strcmp(arg[iarg],"procp1") == 0) {
-      pack_choice[PROCP1] = &DumpCustomVTK::pack_procp1;
+      pack_choice[PROCP1] = &DumpVTK::pack_procp1;
       vtype[PROCP1] = INT;
       name[PROCP1] = arg[iarg];
     } else if (strcmp(arg[iarg],"type") == 0) {
-      pack_choice[TYPE] = &DumpCustomVTK::pack_type;
+      pack_choice[TYPE] = &DumpVTK::pack_type;
       vtype[TYPE] = INT;
       name[TYPE] =arg[iarg];
     } else if (strcmp(arg[iarg],"element") == 0) {
-      pack_choice[ELEMENT] = &DumpCustomVTK::pack_type;
+      pack_choice[ELEMENT] = &DumpVTK::pack_type;
       vtype[ELEMENT] = STRING;
       name[ELEMENT] = arg[iarg];
     } else if (strcmp(arg[iarg],"mass") == 0) {
-      pack_choice[MASS] = &DumpCustomVTK::pack_mass;
+      pack_choice[MASS] = &DumpVTK::pack_mass;
       vtype[MASS] = DOUBLE;
       name[MASS] = arg[iarg];
 
@@ -1545,182 +1548,182 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     } else if (strcmp(arg[iarg],"z") == 0) {
       // required property
     } else if (strcmp(arg[iarg],"xs") == 0) {
-      if (domain->triclinic) pack_choice[XS] = &DumpCustomVTK::pack_xs_triclinic;
-      else pack_choice[XS] = &DumpCustomVTK::pack_xs;
+      if (domain->triclinic) pack_choice[XS] = &DumpVTK::pack_xs_triclinic;
+      else pack_choice[XS] = &DumpVTK::pack_xs;
       vtype[XS] = DOUBLE;
       name[XS] = arg[iarg];
     } else if (strcmp(arg[iarg],"ys") == 0) {
-      if (domain->triclinic) pack_choice[YS] = &DumpCustomVTK::pack_ys_triclinic;
-      else pack_choice[YS] = &DumpCustomVTK::pack_ys;
+      if (domain->triclinic) pack_choice[YS] = &DumpVTK::pack_ys_triclinic;
+      else pack_choice[YS] = &DumpVTK::pack_ys;
       vtype[YS] = DOUBLE;
       name[YS] = arg[iarg];
     } else if (strcmp(arg[iarg],"zs") == 0) {
-      if (domain->triclinic) pack_choice[ZS] = &DumpCustomVTK::pack_zs_triclinic;
-      else pack_choice[ZS] = &DumpCustomVTK::pack_zs;
+      if (domain->triclinic) pack_choice[ZS] = &DumpVTK::pack_zs_triclinic;
+      else pack_choice[ZS] = &DumpVTK::pack_zs;
       vtype[ZS] = DOUBLE;
       name[ZS] = arg[iarg];
     } else if (strcmp(arg[iarg],"xu") == 0) {
-      if (domain->triclinic) pack_choice[XU] = &DumpCustomVTK::pack_xu_triclinic;
-      else pack_choice[XU] = &DumpCustomVTK::pack_xu;
+      if (domain->triclinic) pack_choice[XU] = &DumpVTK::pack_xu_triclinic;
+      else pack_choice[XU] = &DumpVTK::pack_xu;
       vtype[XU] = DOUBLE;
       name[XU] = arg[iarg];
     } else if (strcmp(arg[iarg],"yu") == 0) {
-      if (domain->triclinic) pack_choice[YU] = &DumpCustomVTK::pack_yu_triclinic;
-      else pack_choice[YU] = &DumpCustomVTK::pack_yu;
+      if (domain->triclinic) pack_choice[YU] = &DumpVTK::pack_yu_triclinic;
+      else pack_choice[YU] = &DumpVTK::pack_yu;
       vtype[YU] = DOUBLE;
       name[YU] = arg[iarg];
     } else if (strcmp(arg[iarg],"zu") == 0) {
-      if (domain->triclinic) pack_choice[ZU] = &DumpCustomVTK::pack_zu_triclinic;
-      else pack_choice[ZU] = &DumpCustomVTK::pack_zu;
+      if (domain->triclinic) pack_choice[ZU] = &DumpVTK::pack_zu_triclinic;
+      else pack_choice[ZU] = &DumpVTK::pack_zu;
       vtype[ZU] = DOUBLE;
       name[ZU] = arg[iarg];
     } else if (strcmp(arg[iarg],"xsu") == 0) {
-      if (domain->triclinic) pack_choice[XSU] = &DumpCustomVTK::pack_xsu_triclinic;
-      else pack_choice[XSU] = &DumpCustomVTK::pack_xsu;
+      if (domain->triclinic) pack_choice[XSU] = &DumpVTK::pack_xsu_triclinic;
+      else pack_choice[XSU] = &DumpVTK::pack_xsu;
       vtype[XSU] = DOUBLE;
       name[XSU] = arg[iarg];
     } else if (strcmp(arg[iarg],"ysu") == 0) {
-      if (domain->triclinic) pack_choice[YSU] = &DumpCustomVTK::pack_ysu_triclinic;
-      else pack_choice[YSU] = &DumpCustomVTK::pack_ysu;
+      if (domain->triclinic) pack_choice[YSU] = &DumpVTK::pack_ysu_triclinic;
+      else pack_choice[YSU] = &DumpVTK::pack_ysu;
       vtype[YSU] = DOUBLE;
       name[YSU] = arg[iarg];
     } else if (strcmp(arg[iarg],"zsu") == 0) {
-      if (domain->triclinic) pack_choice[ZSU] = &DumpCustomVTK::pack_zsu_triclinic;
-      else pack_choice[ZSU] = &DumpCustomVTK::pack_zsu;
+      if (domain->triclinic) pack_choice[ZSU] = &DumpVTK::pack_zsu_triclinic;
+      else pack_choice[ZSU] = &DumpVTK::pack_zsu;
       vtype[ZSU] = DOUBLE;
       name[ZSU] = arg[iarg];
     } else if (strcmp(arg[iarg],"ix") == 0) {
-      pack_choice[IX] = &DumpCustomVTK::pack_ix;
+      pack_choice[IX] = &DumpVTK::pack_ix;
       vtype[IX] = INT;
       name[IX] = arg[iarg];
     } else if (strcmp(arg[iarg],"iy") == 0) {
-      pack_choice[IY] = &DumpCustomVTK::pack_iy;
+      pack_choice[IY] = &DumpVTK::pack_iy;
       vtype[IY] = INT;
       name[IY] = arg[iarg];
     } else if (strcmp(arg[iarg],"iz") == 0) {
-      pack_choice[IZ] = &DumpCustomVTK::pack_iz;
+      pack_choice[IZ] = &DumpVTK::pack_iz;
       vtype[IZ] = INT;
       name[IZ] = arg[iarg];
 
     } else if (strcmp(arg[iarg],"vx") == 0) {
-      pack_choice[VX] = &DumpCustomVTK::pack_vx;
+      pack_choice[VX] = &DumpVTK::pack_vx;
       vtype[VX] = DOUBLE;
       name[VX] = arg[iarg];
     } else if (strcmp(arg[iarg],"vy") == 0) {
-      pack_choice[VY] = &DumpCustomVTK::pack_vy;
+      pack_choice[VY] = &DumpVTK::pack_vy;
       vtype[VY] = DOUBLE;
       name[VY] = arg[iarg];
     } else if (strcmp(arg[iarg],"vz") == 0) {
-      pack_choice[VZ] = &DumpCustomVTK::pack_vz;
+      pack_choice[VZ] = &DumpVTK::pack_vz;
       vtype[VZ] = DOUBLE;
       name[VZ] = arg[iarg];
     } else if (strcmp(arg[iarg],"fx") == 0) {
-      pack_choice[FX] = &DumpCustomVTK::pack_fx;
+      pack_choice[FX] = &DumpVTK::pack_fx;
       vtype[FX] = DOUBLE;
       name[FX] = arg[iarg];
     } else if (strcmp(arg[iarg],"fy") == 0) {
-      pack_choice[FY] = &DumpCustomVTK::pack_fy;
+      pack_choice[FY] = &DumpVTK::pack_fy;
       vtype[FY] = DOUBLE;
       name[FY] = arg[iarg];
     } else if (strcmp(arg[iarg],"fz") == 0) {
-      pack_choice[FZ] = &DumpCustomVTK::pack_fz;
+      pack_choice[FZ] = &DumpVTK::pack_fz;
       vtype[FZ] = DOUBLE;
       name[FZ] = arg[iarg];
     } else if (strcmp(arg[iarg],"q") == 0) {
       if (!atom->q_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[Q] = &DumpCustomVTK::pack_q;
+      pack_choice[Q] = &DumpVTK::pack_q;
       vtype[Q] = DOUBLE;
       name[Q] = arg[iarg];
     } else if (strcmp(arg[iarg],"mux") == 0) {
       if (!atom->mu_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[MUX] = &DumpCustomVTK::pack_mux;
+      pack_choice[MUX] = &DumpVTK::pack_mux;
       vtype[MUX] = DOUBLE;
       name[MUX] = arg[iarg];
     } else if (strcmp(arg[iarg],"muy") == 0) {
       if (!atom->mu_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[MUY] = &DumpCustomVTK::pack_muy;
+      pack_choice[MUY] = &DumpVTK::pack_muy;
       vtype[MUY] = DOUBLE;
       name[MUY] = arg[iarg];
     } else if (strcmp(arg[iarg],"muz") == 0) {
       if (!atom->mu_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[MUZ] = &DumpCustomVTK::pack_muz;
+      pack_choice[MUZ] = &DumpVTK::pack_muz;
       vtype[MUZ] = DOUBLE;
       name[MUZ] = arg[iarg];
     } else if (strcmp(arg[iarg],"mu") == 0) {
       if (!atom->mu_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[MU] = &DumpCustomVTK::pack_mu;
+      pack_choice[MU] = &DumpVTK::pack_mu;
       vtype[MU] = DOUBLE;
       name[MU] = arg[iarg];
 
     } else if (strcmp(arg[iarg],"radius") == 0) {
       if (!atom->radius_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[RADIUS] = &DumpCustomVTK::pack_radius;
+      pack_choice[RADIUS] = &DumpVTK::pack_radius;
       vtype[RADIUS] = DOUBLE;
       name[RADIUS] = arg[iarg];
     } else if (strcmp(arg[iarg],"diameter") == 0) {
       if (!atom->radius_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[DIAMETER] = &DumpCustomVTK::pack_diameter;
+      pack_choice[DIAMETER] = &DumpVTK::pack_diameter;
       vtype[DIAMETER] = DOUBLE;
       name[DIAMETER] = arg[iarg];
     } else if (strcmp(arg[iarg],"omegax") == 0) {
       if (!atom->omega_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[OMEGAX] = &DumpCustomVTK::pack_omegax;
+      pack_choice[OMEGAX] = &DumpVTK::pack_omegax;
       vtype[OMEGAX] = DOUBLE;
       name[OMEGAX] = arg[iarg];
     } else if (strcmp(arg[iarg],"omegay") == 0) {
       if (!atom->omega_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[OMEGAY] = &DumpCustomVTK::pack_omegay;
+      pack_choice[OMEGAY] = &DumpVTK::pack_omegay;
       vtype[OMEGAY] = DOUBLE;
       name[OMEGAY] = arg[iarg];
     } else if (strcmp(arg[iarg],"omegaz") == 0) {
       if (!atom->omega_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[OMEGAZ] = &DumpCustomVTK::pack_omegaz;
+      pack_choice[OMEGAZ] = &DumpVTK::pack_omegaz;
       vtype[OMEGAZ] = DOUBLE;
       name[OMEGAZ] = arg[iarg];
     } else if (strcmp(arg[iarg],"angmomx") == 0) {
       if (!atom->angmom_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[ANGMOMX] = &DumpCustomVTK::pack_angmomx;
+      pack_choice[ANGMOMX] = &DumpVTK::pack_angmomx;
       vtype[ANGMOMX] = DOUBLE;
       name[ANGMOMX] = arg[iarg];
     } else if (strcmp(arg[iarg],"angmomy") == 0) {
       if (!atom->angmom_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[ANGMOMY] = &DumpCustomVTK::pack_angmomy;
+      pack_choice[ANGMOMY] = &DumpVTK::pack_angmomy;
       vtype[ANGMOMY] = DOUBLE;
       name[ANGMOMY] = arg[iarg];
     } else if (strcmp(arg[iarg],"angmomz") == 0) {
       if (!atom->angmom_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[ANGMOMZ] = &DumpCustomVTK::pack_angmomz;
+      pack_choice[ANGMOMZ] = &DumpVTK::pack_angmomz;
       vtype[ANGMOMZ] = DOUBLE;
       name[ANGMOMZ] = arg[iarg];
     } else if (strcmp(arg[iarg],"tqx") == 0) {
       if (!atom->torque_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[TQX] = &DumpCustomVTK::pack_tqx;
+      pack_choice[TQX] = &DumpVTK::pack_tqx;
       vtype[TQX] = DOUBLE;
       name[TQX] = arg[iarg];
     } else if (strcmp(arg[iarg],"tqy") == 0) {
       if (!atom->torque_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[TQY] = &DumpCustomVTK::pack_tqy;
+      pack_choice[TQY] = &DumpVTK::pack_tqy;
       vtype[TQY] = DOUBLE;
       name[TQY] = arg[iarg];
     } else if (strcmp(arg[iarg],"tqz") == 0) {
       if (!atom->torque_flag)
         error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[TQZ] = &DumpCustomVTK::pack_tqz;
+      pack_choice[TQZ] = &DumpVTK::pack_tqz;
       vtype[TQZ] = DOUBLE;
       name[TQZ] = arg[iarg];
 
@@ -1728,7 +1731,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     // if no trailing [], then arg is set to 0, else arg is int between []
 
     } else if (strncmp(arg[iarg],"c_",2) == 0) {
-      pack_choice[ATTRIBUTES+i] = &DumpCustomVTK::pack_compute;
+      pack_choice[ATTRIBUTES+i] = &DumpVTK::pack_compute;
       vtype[ATTRIBUTES+i] = DOUBLE;
 
       int n = strlen(arg[iarg]);
@@ -1738,24 +1741,24 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
       char *ptr = strchr(suffix,'[');
       if (ptr) {
         if (suffix[strlen(suffix)-1] != ']')
-          error->all(FLERR,"Invalid attribute in dump custom/vtk command");
+          error->all(FLERR,"Invalid attribute in dump vtk command");
         argindex[ATTRIBUTES+i] = atoi(ptr+1);
         *ptr = '\0';
       } else argindex[ATTRIBUTES+i] = 0;
 
       n = modify->find_compute(suffix);
-      if (n < 0) error->all(FLERR,"Could not find dump custom/vtk compute ID");
+      if (n < 0) error->all(FLERR,"Could not find dump vtk compute ID");
       if (modify->compute[n]->peratom_flag == 0)
-        error->all(FLERR,"Dump custom/vtk compute does not compute per-atom info");
+        error->all(FLERR,"Dump vtk compute does not compute per-atom info");
       if (argindex[ATTRIBUTES+i] == 0 && modify->compute[n]->size_peratom_cols > 0)
         error->all(FLERR,
-                   "Dump custom/vtk compute does not calculate per-atom vector");
+                   "Dump vtk compute does not calculate per-atom vector");
       if (argindex[ATTRIBUTES+i] > 0 && modify->compute[n]->size_peratom_cols == 0)
         error->all(FLERR,\
-                   "Dump custom/vtk compute does not calculate per-atom array");
+                   "Dump vtk compute does not calculate per-atom array");
       if (argindex[ATTRIBUTES+i] > 0 &&
           argindex[ATTRIBUTES+i] > modify->compute[n]->size_peratom_cols)
-        error->all(FLERR,"Dump custom/vtk compute vector is accessed out-of-range");
+        error->all(FLERR,"Dump vtk compute vector is accessed out-of-range");
 
       field2index[ATTRIBUTES+i] = add_compute(suffix);
       name[ATTRIBUTES+i] = arg[iarg];
@@ -1765,7 +1768,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     // if no trailing [], then arg is set to 0, else arg is between []
 
     } else if (strncmp(arg[iarg],"f_",2) == 0) {
-      pack_choice[ATTRIBUTES+i] = &DumpCustomVTK::pack_fix;
+      pack_choice[ATTRIBUTES+i] = &DumpVTK::pack_fix;
       vtype[ATTRIBUTES+i] = DOUBLE;
 
       int n = strlen(arg[iarg]);
@@ -1775,22 +1778,22 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
       char *ptr = strchr(suffix,'[');
       if (ptr) {
         if (suffix[strlen(suffix)-1] != ']')
-          error->all(FLERR,"Invalid attribute in dump custom/vtk command");
+          error->all(FLERR,"Invalid attribute in dump vtk command");
         argindex[ATTRIBUTES+i] = atoi(ptr+1);
         *ptr = '\0';
       } else argindex[ATTRIBUTES+i] = 0;
 
       n = modify->find_fix(suffix);
-      if (n < 0) error->all(FLERR,"Could not find dump custom/vtk fix ID");
+      if (n < 0) error->all(FLERR,"Could not find dump vtk fix ID");
       if (modify->fix[n]->peratom_flag == 0)
-        error->all(FLERR,"Dump custom/vtk fix does not compute per-atom info");
+        error->all(FLERR,"Dump vtk fix does not compute per-atom info");
       if (argindex[ATTRIBUTES+i] == 0 && modify->fix[n]->size_peratom_cols > 0)
-        error->all(FLERR,"Dump custom/vtk fix does not compute per-atom vector");
+        error->all(FLERR,"Dump vtk fix does not compute per-atom vector");
       if (argindex[ATTRIBUTES+i] > 0 && modify->fix[n]->size_peratom_cols == 0)
-        error->all(FLERR,"Dump custom/vtk fix does not compute per-atom array");
+        error->all(FLERR,"Dump vtk fix does not compute per-atom array");
       if (argindex[ATTRIBUTES+i] > 0 &&
           argindex[ATTRIBUTES+i] > modify->fix[n]->size_peratom_cols)
-        error->all(FLERR,"Dump custom/vtk fix vector is accessed out-of-range");
+        error->all(FLERR,"Dump vtk fix vector is accessed out-of-range");
 
       field2index[ATTRIBUTES+i] = add_fix(suffix);
       name[ATTRIBUTES+i] = arg[iarg];
@@ -1799,7 +1802,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     // variable value = v_name
 
     } else if (strncmp(arg[iarg],"v_",2) == 0) {
-      pack_choice[ATTRIBUTES+i] = &DumpCustomVTK::pack_variable;
+      pack_choice[ATTRIBUTES+i] = &DumpVTK::pack_variable;
       vtype[ATTRIBUTES+i] = DOUBLE;
 
       int n = strlen(arg[iarg]);
@@ -1809,9 +1812,9 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
       argindex[ATTRIBUTES+i] = 0;
 
       n = input->variable->find(suffix);
-      if (n < 0) error->all(FLERR,"Could not find dump custom/vtk variable name");
+      if (n < 0) error->all(FLERR,"Could not find dump vtk variable name");
       if (input->variable->atomstyle(n) == 0)
-        error->all(FLERR,"Dump custom/vtk variable is not atom-style variable");
+        error->all(FLERR,"Dump vtk variable is not atom-style variable");
 
       field2index[ATTRIBUTES+i] = add_variable(suffix);
       name[ATTRIBUTES+i] = suffix;
@@ -1820,7 +1823,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     // custom per-atom floating point value = d_ID
 
     } else if (strncmp(arg[iarg],"d_",2) == 0) {
-      pack_choice[ATTRIBUTES+i] = &DumpCustomVTK::pack_custom;
+      pack_choice[ATTRIBUTES+i] = &DumpVTK::pack_custom;
       vtype[ATTRIBUTES+i] = DOUBLE;
 
       int n = strlen(arg[iarg]);
@@ -1843,7 +1846,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
     // custom per-atom integer value = i_ID
 
     } else if (strncmp(arg[iarg],"i_",2) == 0) {
-      pack_choice[ATTRIBUTES+i] = &DumpCustomVTK::pack_custom;
+      pack_choice[ATTRIBUTES+i] = &DumpVTK::pack_custom;
       vtype[ATTRIBUTES+i] = INT;
 
       int n = strlen(arg[iarg]);
@@ -1873,7 +1876,7 @@ int DumpCustomVTK::parse_fields(int narg, char **arg)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::identify_vectors()
+void DumpVTK::identify_vectors()
 {
   // detect vectors
   vector_set.insert(X); // required
@@ -1923,7 +1926,7 @@ void DumpCustomVTK::identify_vectors()
    if already in list, do not add, just return index, else add to list
 ------------------------------------------------------------------------- */
 
-int DumpCustomVTK::add_compute(char *id)
+int DumpVTK::add_compute(char *id)
 {
   int icompute;
   for (icompute = 0; icompute < ncompute; icompute++)
@@ -1948,7 +1951,7 @@ int DumpCustomVTK::add_compute(char *id)
    if already in list, do not add, just return index, else add to list
 ------------------------------------------------------------------------- */
 
-int DumpCustomVTK::add_fix(char *id)
+int DumpVTK::add_fix(char *id)
 {
   int ifix;
   for (ifix = 0; ifix < nfix; ifix++)
@@ -1973,7 +1976,7 @@ int DumpCustomVTK::add_fix(char *id)
    if already in list, do not add, just return index, else add to list
 ------------------------------------------------------------------------- */
 
-int DumpCustomVTK::add_variable(char *id)
+int DumpVTK::add_variable(char *id)
 {
   int ivariable;
   for (ivariable = 0; ivariable < nvariable; ivariable++)
@@ -2002,7 +2005,7 @@ int DumpCustomVTK::add_variable(char *id)
    if already in list, do not add, just return index, else add to list
 ------------------------------------------------------------------------- */
 
-int DumpCustomVTK::add_custom(char *id, int flag)
+int DumpVTK::add_custom(char *id, int flag)
 {
   int icustom;
   for (icustom = 0; icustom < ncustom; icustom++)
@@ -2026,7 +2029,7 @@ int DumpCustomVTK::add_custom(char *id, int flag)
 
 /* ---------------------------------------------------------------------- */
 
-int DumpCustomVTK::modify_param(int narg, char **arg)
+int DumpVTK::modify_param(int narg, char **arg)
 {
   if (strcmp(arg[0],"region") == 0) {
     if (narg < 2) error->all(FLERR,"Illegal dump_modify command");
@@ -2301,7 +2304,7 @@ int DumpCustomVTK::modify_param(int narg, char **arg)
    return # of bytes of allocated memory in buf, choose, variable arrays
 ------------------------------------------------------------------------- */
 
-bigint DumpCustomVTK::memory_usage()
+bigint DumpVTK::memory_usage()
 {
   bigint bytes = Dump::memory_usage();
   bytes += memory->usage(choose,maxlocal);
@@ -2315,7 +2318,7 @@ bigint DumpCustomVTK::memory_usage()
    extraction of Compute, Fix, Variable results
 ------------------------------------------------------------------------- */
 
-void DumpCustomVTK::pack_compute(int n)
+void DumpVTK::pack_compute(int n)
 {
   double *vector = compute[field2index[current_pack_choice_key]]->vector_atom;
   double **array = compute[field2index[current_pack_choice_key]]->array_atom;
@@ -2337,7 +2340,7 @@ void DumpCustomVTK::pack_compute(int n)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::pack_fix(int n)
+void DumpVTK::pack_fix(int n)
 {
   double *vector = fix[field2index[current_pack_choice_key]]->vector_atom;
   double **array = fix[field2index[current_pack_choice_key]]->array_atom;
@@ -2359,7 +2362,7 @@ void DumpCustomVTK::pack_fix(int n)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::pack_variable(int n)
+void DumpVTK::pack_variable(int n)
 {
   double *vector = vbuf[field2index[current_pack_choice_key]];
 
@@ -2371,9 +2374,8 @@ void DumpCustomVTK::pack_variable(int n)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpCustomVTK::pack_custom(int n)
+void DumpVTK::pack_custom(int n)
 {
-
   int index = field2index[n];
 
   if (flag_custom[index] == 0) { // integer
diff --git a/src/USER-VTK/dump_custom_vtk.h b/src/USER-VTK/dump_vtk.h
similarity index 95%
rename from src/USER-VTK/dump_custom_vtk.h
rename to src/USER-VTK/dump_vtk.h
index f3b4a8b63e63cad24849ed30ae8cfc7ccb24473e..603ca114ba6ddc33938b76d6c05cfb1aa9556056 100644
--- a/src/USER-VTK/dump_custom_vtk.h
+++ b/src/USER-VTK/dump_vtk.h
@@ -17,12 +17,12 @@
 
 #ifdef DUMP_CLASS
 
-DumpStyle(custom/vtk,DumpCustomVTK)
+DumpStyle(vtk,DumpVTK)
 
 #else
 
-#ifndef LMP_DUMP_CUSTOM_VTK_H
-#define LMP_DUMP_CUSTOM_VTK_H
+#ifndef LMP_DUMP_VTK_H
+#define LMP_DUMP_VTK_H
 
 #include "dump_custom.h"
 #include <map>
@@ -40,7 +40,7 @@ class vtkUnstructuredGrid;
 namespace LAMMPS_NS {
 
 /**
- * @brief DumpCustomVTK class
+ * @brief DumpVTK class
  *        write atom data to vtk files.
  *
  * Similar to the DumpCustom class but uses the vtk library to write data to vtk simple
@@ -54,10 +54,11 @@ namespace LAMMPS_NS {
  * This dump command does not support compressed files, buffering or custom format strings,
  * multiproc is only supported by the xml formats, multifile option has to be used.
  */
-class DumpCustomVTK : public DumpCustom {
+
+class DumpVTK : public DumpCustom {
  public:
-  DumpCustomVTK(class LAMMPS *, int, char **);
-  virtual ~DumpCustomVTK();
+  DumpVTK(class LAMMPS *, int, char **);
+  virtual ~DumpVTK();
 
   virtual void write();
  protected:
@@ -86,11 +87,11 @@ class DumpCustomVTK : public DumpCustom {
   int add_custom(char *, int);
   virtual int modify_param(int, char **);
 
-  typedef void (DumpCustomVTK::*FnPtrHeader)(bigint);
+  typedef void (DumpVTK::*FnPtrHeader)(bigint);
   FnPtrHeader header_choice;           // ptr to write header functions
   void header_vtk(bigint);
 
-  typedef void (DumpCustomVTK::*FnPtrWrite)(int, double *);
+  typedef void (DumpVTK::*FnPtrWrite)(int, double *);
   FnPtrWrite write_choice;             // ptr to write data functions
   void write_vtk(int, double *);
   void write_vtp(int, double *);
@@ -103,7 +104,7 @@ class DumpCustomVTK : public DumpCustom {
   void write_domain_vtr();
   void write_domain_vtu_triclinic();
 
-  typedef void (DumpCustomVTK::*FnPtrPack)(int);
+  typedef void (DumpVTK::*FnPtrPack)(int);
   std::map<int, FnPtrPack> pack_choice;  // ptrs to pack functions
   std::map<int, int> vtype;              // data type
   std::map<int, std::string> name;       // attribute labels
diff --git a/src/compute_dipole_chunk.cpp b/src/compute_dipole_chunk.cpp
index 74d66e7c1b5d03e673234cad9813f5aded6b9397..45389ee61418a425f082e8bb80ec6438a2f8f322 100644
--- a/src/compute_dipole_chunk.cpp
+++ b/src/compute_dipole_chunk.cpp
@@ -31,10 +31,12 @@ enum { MASSCENTER, GEOMCENTER };
 
 ComputeDipoleChunk::ComputeDipoleChunk(LAMMPS *lmp, int narg, char **arg) : 
   Compute(lmp, narg, arg),
-  idchunk(NULL), massproc(NULL), masstotal(NULL), chrgproc(NULL), chrgtotal(NULL), com(NULL),
+  idchunk(NULL), massproc(NULL), masstotal(NULL), chrgproc(NULL), 
+  chrgtotal(NULL), com(NULL),
   comall(NULL), dipole(NULL), dipoleall(NULL)
 {
-  if ((narg != 4) && (narg != 5)) error->all(FLERR,"Illegal compute dipole/chunk command");
+  if ((narg != 4) && (narg != 5)) 
+    error->all(FLERR,"Illegal compute dipole/chunk command");
 
   array_flag = 1;
   size_array_cols = 4;
diff --git a/src/domain.cpp b/src/domain.cpp
index 31fb3b855955e15126bb2e881ac6cfa68872e1ca..8ead12cd4e2b0f6e1289dfaf17c1614e2e334b05 100644
--- a/src/domain.cpp
+++ b/src/domain.cpp
@@ -944,6 +944,10 @@ void Domain::subbox_too_small_check(double thresh)
    changed "if" to "while" to enable distance to
      far-away ghost atom returned by atom->map() to be wrapped back into box
      could be problem for looking up atom IDs when cutoff > boxsize
+   this should not be used if atom has moved infinitely far outside box
+     b/c while could iterate forever
+     e.g. fix shake prediction of new position with highly overlapped atoms
+     use minimum_image_once() instead
 ------------------------------------------------------------------------- */
 
 void Domain::minimum_image(double &dx, double &dy, double &dz)
@@ -1009,6 +1013,10 @@ void Domain::minimum_image(double &dx, double &dy, double &dz)
    changed "if" to "while" to enable distance to
      far-away ghost atom returned by atom->map() to be wrapped back into box
      could be problem for looking up atom IDs when cutoff > boxsize
+   this should not be used if atom has moved infinitely far outside box
+     b/c while could iterate forever
+     e.g. fix shake prediction of new position with highly overlapped atoms
+     use minimum_image_once() instead
 ------------------------------------------------------------------------- */
 
 void Domain::minimum_image(double *delta)
@@ -1067,6 +1075,70 @@ void Domain::minimum_image(double *delta)
   }
 }
 
+/* ----------------------------------------------------------------------
+   minimum image convention in periodic dimensions
+   use 1/2 of box size as test
+   for triclinic, also add/subtract tilt factors in other dims as needed
+   only shift by one box length in each direction
+   this should not be used if multiple box shifts are required
+------------------------------------------------------------------------- */
+
+void Domain::minimum_image_once(double *delta)  // delta = 3-vector, modified in place
+{
+  if (triclinic == 0) {  // non-triclinic box: each periodic dim is adjusted independently
+    if (xperiodic) {
+      if (fabs(delta[0]) > xprd_half) {  // plain "if", not "while": at most one shift per dim
+        if (delta[0] < 0.0) delta[0] += xprd;
+        else delta[0] -= xprd;
+      }
+    }
+    if (yperiodic) {
+      if (fabs(delta[1]) > yprd_half) {
+        if (delta[1] < 0.0) delta[1] += yprd;
+        else delta[1] -= yprd;
+      }
+    }
+    if (zperiodic) {
+      if (fabs(delta[2]) > zprd_half) {
+        if (delta[2] < 0.0) delta[2] += zprd;
+        else delta[2] -= zprd;
+      }
+    }
+
+  } else {  // triclinic box: handle z, then y, then x, since a shift in a later dim alters earlier components
+    if (zperiodic) {
+      if (fabs(delta[2]) > zprd_half) {
+        if (delta[2] < 0.0) {
+          delta[2] += zprd;
+          delta[1] += yz;  // a z shift also moves y by tilt factor yz ...
+          delta[0] += xz;  // ... and x by tilt factor xz
+        } else {
+          delta[2] -= zprd;
+          delta[1] -= yz;
+          delta[0] -= xz;
+        }
+      }
+    }
+    if (yperiodic) {
+      if (fabs(delta[1]) > yprd_half) {  // tested after the yz adjustment made above
+        if (delta[1] < 0.0) {
+          delta[1] += yprd;
+          delta[0] += xy;  // a y shift also moves x by tilt factor xy
+        } else {
+          delta[1] -= yprd;
+          delta[0] -= xy;
+        }
+      }
+    }
+    if (xperiodic) {
+      if (fabs(delta[0]) > xprd_half) {  // tested after the xz and xy adjustments made above
+        if (delta[0] < 0.0) delta[0] += xprd;
+        else delta[0] -= xprd;
+      }
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    return local index of atom J or any of its images that is closest to atom I
    if J is not a valid index like -1, just return it
diff --git a/src/domain.h b/src/domain.h
index 22e3191231de93c6cd09dee8eafb92403716336a..0f47a3c2ca36e708efeea632b08b155d529c2a61 100644
--- a/src/domain.h
+++ b/src/domain.h
@@ -112,6 +112,7 @@ class Domain : protected Pointers {
   void subbox_too_small_check(double);
   void minimum_image(double &, double &, double &);
   void minimum_image(double *);
+  void minimum_image_once(double *);
   int closest_image(int, int);
   int closest_image(double *, int);
   void closest_image(const double * const, const double * const,
diff --git a/src/min.cpp b/src/min.cpp
index 79d7d6a8bdaff9b99d5858115a5f58485563695c..d308efb8486e0ecdbd5280e2bd476c1f21726844 100644
--- a/src/min.cpp
+++ b/src/min.cpp
@@ -187,6 +187,8 @@ void Min::setup(int flag)
             update->minimize_style);
     if (flag) {
       fprintf(screen,"  Unit style    : %s\n", update->unit_style);
+      fprintf(screen,"  Current step  : " BIGINT_FORMAT "\n",
+              update->ntimestep);
       timer->print_timeout(screen);
     }
   }
@@ -196,7 +198,12 @@ void Min::setup(int flag)
   // cannot be done in init() b/c update init() is before modify init()
 
   nextra_global = modify->min_dof();
-  if (nextra_global) fextra = new double[nextra_global];
+  if (nextra_global) {
+    fextra = new double[nextra_global];
+    if (comm->me == 0 && screen)
+      fprintf(screen,"WARNING: Energy due to %d extra global DOFs will"
+              " be included in minimizer energies\n",nextra_global);
+  }
 
   // compute for potential energy
 
diff --git a/src/min.h b/src/min.h
index 464018e825349fe3613007ae0153ed4794b608a8..021198bc09b2aa970b92e137f415f0214e3abb74 100644
--- a/src/min.h
+++ b/src/min.h
@@ -123,6 +123,12 @@ Minimization requires that neigh_modify settings be delay = 0, every =
 changed them and will restore them to their original values after the
 minimization.
 
+W: Energy due to X extra global DOFs will be included in minimizer energies
+
+When using fixes like box/relax, the potential energy used by the minimizer
+is augmented by an additional energy provided by the fix. Thus the printed
+converged energy may be different from the total potential energy.
+
 E: Minimization could not find thermo_pe compute
 
 This compute is created by the thermo command.  It must have been
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 4cd99b41d7a3c6f24a3a39b0bf20edbc211e3cc3..1d12ef578e29c64bcfc8d638027f2ddca2f89df6 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -667,7 +667,7 @@ int Neighbor::init_pair()
     
   // create new lists, one per request including added requests
   // wait to allocate initial pages until copy lists are detected
-  // NOTE: can I allocation now, instead of down below?
+  // NOTE: can I allocate now, instead of down below?
 
   nlist = nrequest;
   
@@ -1216,7 +1216,7 @@ void Neighbor::morph_copy()
     
     // check all other lists
 
-    for (j = 0; j < i; j++) {
+    for (j = 0; j < nrequest; j++) {
       if (i == j) continue;
       jrq = requests[j];
 
@@ -1279,7 +1279,7 @@ void Neighbor::morph_copy()
     // turn list I into a copy of list J
     // do not copy a list from another copy list, but from its parent list
 
-    if (j < i) {
+    if (j < nrequest) {
       irq->copy = 1;
       if (jrq->copy) irq->copylist = jrq->copylist;
       else irq->copylist = j;
diff --git a/src/pair.h b/src/pair.h
index 3f66c6095a8199b8b4530761af014854efce0914..dd859e5f2a07ca2b99ba288302a03b2f17d1cbbe 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -194,8 +194,8 @@ class Pair : protected Pointers {
   int num_tally_compute;
   class Compute **list_tally_compute;
  public:
-  void add_tally_callback(class Compute *);
-  void del_tally_callback(class Compute *);
+  virtual void add_tally_callback(class Compute *);
+  virtual void del_tally_callback(class Compute *);
 
  protected:
   int instance_me;        // which Pair class instantiation I am
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index 03e55006fc44953f068609927aab53982b94edf4..fa79f1cf970d3876b1fb71a7ecf39e8f3ffb316a 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -33,7 +33,7 @@ using namespace LAMMPS_NS;
 
 PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp),
   styles(NULL), keywords(NULL), multiple(NULL), nmap(NULL),
-  map(NULL), special_lj(NULL), special_coul(NULL)
+  map(NULL), special_lj(NULL), special_coul(NULL), compute_tally(NULL)
 {
   nstyles = 0;
   
@@ -62,6 +62,7 @@ PairHybrid::~PairHybrid()
 
   delete [] special_lj;
   delete [] special_coul;
+  delete [] compute_tally;
 
   delete [] svector;
 
@@ -169,6 +170,23 @@ void PairHybrid::compute(int eflag, int vflag)
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
+
+/* ---------------------------------------------------------------------- */
+
+void PairHybrid::add_tally_callback(Compute *ptr)  // overrides the virtual Pair method
+{
+  for (int m = 0; m < nstyles; m++)  // forward the call to the sub-styles
+    if (compute_tally[m]) styles[m]->add_tally_callback(ptr);  // skip sub-styles whose compute_tally flag is 0
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHybrid::del_tally_callback(Compute *ptr)  // overrides the virtual Pair method
+{
+  for (int m = 0; m < nstyles; m++)  // forward the call to the sub-styles
+    if (compute_tally[m]) styles[m]->del_tally_callback(ptr);  // skip sub-styles whose compute_tally flag is 0
+}
+
 /* ---------------------------------------------------------------------- */
 
 void PairHybrid::compute_inner()
@@ -253,6 +271,8 @@ void PairHybrid::settings(int narg, char **arg)
   special_lj = new double*[narg];
   special_coul = new double*[narg];
 
+  compute_tally = new int[narg];
+
   // allocate each sub-style
   // allocate uses suffix, but don't store suffix version in keywords,
   //   else syntax in coeff() will not match
@@ -272,6 +292,7 @@ void PairHybrid::settings(int narg, char **arg)
     styles[nstyles] = force->new_pair(arg[iarg],1,dummy);
     force->store_style(keywords[nstyles],arg[iarg],0);
     special_lj[nstyles] = special_coul[nstyles] = NULL;
+    compute_tally[nstyles] = 1;
 
     jarg = iarg + 1;
     while (jarg < narg && !force->pair_map->count(arg[jarg])) jarg++;
@@ -782,6 +803,20 @@ void PairHybrid::modify_params(int narg, char **arg)
       iarg += 5;
     }
 
+    // if 2nd keyword (after pair) is compute/tally:
+    // set flag to register USER-TALLY computes accordingly
+
+    if (iarg < narg && strcmp(arg[iarg],"compute/tally") == 0) {
+      if (narg < iarg+2)
+        error->all(FLERR,"Illegal pair_modify compute/tally command");
+      if (strcmp(arg[iarg+1],"yes") == 0) {
+        compute_tally[m] = 1;
+      } else if (strcmp(arg[iarg+1],"no") == 0) {
+        compute_tally[m] = 0;
+      } else error->all(FLERR,"Illegal pair_modify compute/tally command");
+      iarg += 2;
+    }
+
     // apply the remaining keywords to the base pair style itself and the
     // sub-style except for "pair" and "special".
     // the former is important for some keywords like "tail" or "compute"
diff --git a/src/pair_hybrid.h b/src/pair_hybrid.h
index e3de3b022a4d1bcf2a300a44c74d65fd7d92e5d1..b8b9af5f405c7819fa72ae0434bfbfa55ddaef48 100644
--- a/src/pair_hybrid.h
+++ b/src/pair_hybrid.h
@@ -55,6 +55,9 @@ class PairHybrid : public Pair {
 
   int check_ijtype(int, int, char *);
 
+  virtual void add_tally_callback(class Compute *);
+  virtual void del_tally_callback(class Compute *);
+
  protected:
   int nstyles;                  // # of sub-styles
   Pair **styles;                // list of Pair style classes
@@ -69,6 +72,7 @@ class PairHybrid : public Pair {
   int ***map;                   // list of sub-styles itype,jtype points to
   double **special_lj;          // list of per style LJ exclusion factors
   double **special_coul;        // list of per style Coulomb exclusion factors
+  int *compute_tally;           // list of on/off flags for tally computes
 
   void allocate();
   void flags();
diff --git a/src/version.h b/src/version.h
index e6ffb22dc0c08bd54aca97a7320d97a920125839..7ae7ec4872517291ac25edcd148ba7b6e858cc3c 100644
--- a/src/version.h
+++ b/src/version.h
@@ -1 +1 @@
-#define LAMMPS_VERSION "11 Apr 2017"
+#define LAMMPS_VERSION "4 May 2017"
diff --git a/tools/msi2lmp/README b/tools/msi2lmp/README
index a20f6e893f60c42064e0d184fbeb41c483a381c1..db9b1aca5efeec6f0d68dcf6539351426046f0d9 100644
--- a/tools/msi2lmp/README
+++ b/tools/msi2lmp/README
@@ -1,98 +1,50 @@
-Axel Kohlmeyer is the current maintainer of the msi2lmp tool.
-Please send any inquiries about msi2lmp to the lammps-users mailing list.
 
-06 Oct 2016 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Improved whitespace handling in parsing topology and force field
-files to avoid bogus warnings about type name truncation.
-
-24 Oct 2015 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Added check to make certain that force field files
-are consistent with the notation of non-bonded parameters
-that the msi2lmp code expects. For Class 1 and OPLS-AA
-the A-B notation with geometric mixing is expected and for
-Class 2 the r-eps notation with sixthpower mixing.
-
-11 Sep 2014 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Refactored ReadMdfFile.c so it more consistently honors
-the MAX_NAME and MAX_STRING string length defines and 
-potentially handles inputs with long names better.
-
-27 May 2014 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Added TopoTools style type hints as comments to all Mass, PairCoeff,
-BondCoeff, AngleCoeff, DihedralCoeff, ImproperCoeff entries.
-This should make it easier to identify force field entries with
-the structure and force field map in the data file later.
-
-06 Mar 2014 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Fixed a bug in handling of triclinic cells, where the matrices to
-convert to and from fractional coordinates were incorrectly built.
-
-26 Oct 2013 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Implemented writing out force field style hints in generated data
-files for improved consistency checking when reading those files.
-Also added writing out CGCMM style comments to identify atom types.
+ msi2lmp.exe
 
-08 Oct 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+This code has several known limitations listed below under "LIMITATIONS"
+(and possibly some unknown ones, too) and is no longer under active
+development. Only the occasional bugfix is applied.
 
-Fixed a memory access violation with Class 2 force fields.
-Free all allocated memory to better detection of memory errors.
-Print out version number and data with all print levels > 0.
-Added valgrind checks to the regression tests
+Please send any inquiries about msi2lmp to the lammps-users
+mailing list and not to individual people.
 
-08 Oct 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+------------------------------------------------------------------------
 
-Fixed a memory access violation with Class 2 force fields.
-Free all allocated memory to better detection of memory errors.
-Print out version number and data with all print levels > 0.
-Added valgrind checks to the regression tests
+OVERVIEW
 
-02 Aug 2013 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Added rudimentary support for OPLS-AA based on
-input provided by jeff greathouse.
+This is the third version of a program that generates a LAMMPS data file
+based on the information in MSI .car (atom coordinates), .mdf (molecular
+topology) and .frc (forcefield) files.  The .car and .mdf files are
+specific to a molecular system while the .frc file is specific to a
+forcefield version.  The only consistency required between the .frc
+and .car/.mdf files is that their atom types match.
 
-18 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Added support for writing out image flags
-Improved accuracy of atom masses
-Added flag for shifting the entire system
-Fixed some minor logic bugs and prepared
-for supporting other force fields and morse style bonds.
-
-12 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
-
-Fixed the bug that caused improper coefficients to be wrong
-Cleaned up the handling of box parameters and center the box
-by default around the system/molecule. Added a flag to make
-this step optional and center the box around the origin instead.
-Added a regression test script with examples.
-
-1 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+The first version was written by Steve Lustig at Dupont, but required
+using Discover to derive internal coordinates and forcefield parameters.
 
-Cleanup and improved port to windows.
-Removed some more static string limits.
-Added print level 3 for additional output.
-Make code stop at missing force field parameters
-and added -i flag to override this.
-Safer argument checking.
-Provide short versions for all flags.
+The second version was written by Michael Peachey while an intern in the
+Cray Chemistry Applications Group managed by John Carpenter. This
+version derived internal coordinates from the mdf file and looked up
+parameters in the frc file thus eliminating the need for Discover.
 
-23 Sep 2011
+The third version was written by John Carpenter to optimize the
+performance of the program for large molecular systems (the original
+code for deriving atom numbers was quadratic in time) and to make the
+program fully dynamic. The second version used fixed dimension arrays
+for the internal coordinates.
 
-added support for triclinic boxes
-see msi2lmp/TriclinicModification.pdf doc for details
+The third version was revised in Fall 2011 by Stephanie Teich-McGoldrick
+to add support for non-orthogonal cells.
 
------------------------------
+The next revision was started in Summer/Fall 2013 by Axel Kohlmeyer to
+improve portability to Windows compilers, clean up command line parsing
+and improve compatibility with the then current LAMMPS versions. This
+revision removes compatibility with the obsolete LAMMPS version written
+in Fortran 90.
 
- msi2lmp V3.6 4/10/2005
+INSTALLATION & USAGE
 
- This program uses the .car and .mdf files from MSI/Biosyms's INSIGHT
+This program uses the .car and .mdf files from MSI/Biosym's INSIGHT
  program to produce a LAMMPS data file.
 
  1. Building msi2lmp
@@ -178,50 +130,111 @@ see msi2lmp/TriclinicModification.pdf doc for details
   -- the LAMMPS data file is written to <ROOTNAME>.data
      protocol and error information is written to the screen.
 
-****************************************************************
-*
-* msi2lmp
-*
-* This is the third version of a program that generates a LAMMPS
-* data file based on the information in MSI .car (atom
-* coordinates), .mdf (molecular topology) and .frc (forcefield) 
-* files. The .car and .mdf files are specific to a molecular
-* system while the .frc file is specific to a forcefield version.
-* The only coherency needed between .frc and .car/.mdf files are
-* the atom types. 
-*
-* The first version was written by Steve Lustig at Dupont, but
-* required using Discover to derive internal coordinates and
-* forcefield parameters
-*
-* The second version was written by Michael Peachey while an
-* intern in the Cray Chemistry Applications Group managed
-* by John Carpenter. This version derived internal coordinates
-* from the mdf file and looked up parameters in the frc file
-* thus eliminating the need for Discover.
-*
-* The third version was written by John Carpenter to optimize
-* the performance of the program for large molecular systems
-* (the original  code for deriving atom numbers was quadratic in time)
-* and to make the program fully dynamic. The second version used
-* fixed dimension arrays for the internal coordinates.
-*
-* The current maintainer is only reluctantly doing so because John Mayo no longer
-* needs this code.
-*
-* V3.2 corresponds to adding code to MakeLists.c to gracefully deal with
-* systems that may only be molecules of 1 to 3 atoms. In V3.1, the values
-* for number_of_dihedrals, etc. could be unpredictable in these systems.
-*
-* V3.3 was generated in response to a strange error reading a MDF file generated by
-* Accelys' Materials Studio GUI. Simply rewriting the input part of ReadMdfFile.c 
-* seems to have fixed the problem.
-*
-* V3.4 and V3.5 are minor upgrades to fix bugs associated mostly with .car and .mdf files
-* written by Accelys' Materials Studio GUI.
-*
-* V3.6 outputs to LAMMPS 2005 (C++ version).
-*
-* Contact: Kelly L. Anderson, kelly.anderson@cantab.net
-* 
-* April 2005
+------------------------------------------------------------------------
+
+LIMITATIONS
+
+msi2lmp has the following known limitations:
+
+- there is no support for selecting Morse bonds over harmonic bonds
+- there is no support for auto-equivalences to supplement fully
+  parameterized interactions with heuristic ones
+- there is no support for bond increments
+
+------------------------------------------------------------------------
+
+CHANGELOG
+
+06 Oct 2016 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Improved whitespace handling in parsing topology and force field
+files to avoid bogus warnings about type name truncation.
+
+24 Oct 2015 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Added check to make certain that force field files are consistent with
+the notation of non-bonded parameters that the msi2lmp code expects.
+For Class 1 and OPLS-AA the A-B notation with geometric mixing is
+expected and for Class 2 the r-eps notation with sixthpower mixing.
+
+11 Sep 2014 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Refactored ReadMdfFile.c so it more consistently honors the MAX_NAME
+and MAX_STRING string length defines and potentially handles inputs
+with long names better.
+
+27 May 2014 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Added TopoTools style type hints as comments to all Mass, PairCoeff,
+BondCoeff, AngleCoeff, DihedralCoeff, ImproperCoeff entries.
+This should make it easier to identify force field entries with
+the structure and force field map in the data file later.
+
+06 Mar 2014 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Fixed a bug in handling of triclinic cells, where the matrices to
+convert to and from fractional coordinates were incorrectly built.
+
+26 Oct 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Implemented writing out force field style hints in generated data
+files for improved consistency checking when reading those files.
+Also added writing out CGCMM style comments to identify atom types.
+
+08 Oct 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Fixed a memory access violation with Class 2 force fields.  Free all
+allocated memory to allow better detection of memory errors.  Print out
+version number and data with all print levels > 0.  Added valgrind
+checks to the regression tests.
+
+02 Aug 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Added rudimentary support for OPLS-AA based on input provided
+by Jeff Greathouse.
+
+18 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Added support for writing out image flags.  Improved accuracy of atom
+masses.  Added flag for shifting the entire system.  Fixed some minor
+logic bugs and prepared for supporting other force fields and morse
+style bonds.
+
+12 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Fixed the bug that caused improper coefficients to be wrong. Cleaned up
+the handling of box parameters and center the box by default around the
+system/molecule. Added a flag to make this step optional and center the
+box around the origin instead.  Added a regression test script with
+examples.
+
+1 Jul 2013 Axel Kohlmeyer <akohlmey@gmail.com>
+
+Cleaned up and improved port to Windows. Removed some more static string
+limits.  Added print level 3 for additional output.  Make code stop at
+missing force field parameters and added -i flag to override this.
+Safer argument checking.  Provide short versions for all flags.
+
+23 Sep 2011
+
+Added support for triclinic boxes.
+
+V3.6 outputs to LAMMPS 2005 (C++ version).
+
+Contact: Kelly L. Anderson, kelly.anderson@cantab.net
+
+V3.4 and V3.5 are minor upgrades to fix bugs associated mostly with .car
+and .mdf files written by Accelrys' Materials Studio GUI.  April 2005
+
+V3.3 was generated in response to a strange error reading a MDF file
+generated by Accelrys' Materials Studio GUI. Simply rewriting the input
+part of ReadMdfFile.c seems to have fixed the problem.
+
+V3.2 corresponds to adding code to MakeLists.c to gracefully deal with
+systems that may only be molecules of 1 to 3 atoms. In V3.1, the values
+for number_of_dihedrals, etc. could be unpredictable in these systems.
+
+-----------------------------
+
+ msi2lmp v3.9.8 6/10/2016
+
diff --git a/tools/msi2lmp/src/GetParameters.c b/tools/msi2lmp/src/GetParameters.c
index e183c529e0127b7324e42a893211526798aefa5f..192b4d296cb8247e8e47d83bed83b2928d3d13c5 100644
--- a/tools/msi2lmp/src/GetParameters.c
+++ b/tools/msi2lmp/src/GetParameters.c
@@ -136,7 +136,7 @@ void GetParameters()
       if (forcefield & (FF_TYPE_CLASS1|FF_TYPE_OPLSAA)) {
         bondtypes[i].params[0] = ff_bond.data[k].ff_param[1];
         bondtypes[i].params[1] = ff_bond.data[k].ff_param[0];
-      } 
+      }
 
       if (forcefield & FF_TYPE_CLASS2) {
         for (j=0; j < 4; j++)
diff --git a/tools/msi2lmp/src/InitializeItems.c b/tools/msi2lmp/src/InitializeItems.c
index 4df9fd0f1075c218182991ad1bac45aed50991ef..1e336369130dec323941265de0aa4f297815cb42 100644
--- a/tools/msi2lmp/src/InitializeItems.c
+++ b/tools/msi2lmp/src/InitializeItems.c
@@ -68,7 +68,7 @@ void InitializeItems(void)
   if (forcefield & (FF_TYPE_CLASS1|FF_TYPE_OPLSAA)) {
     strcpy(ff_tor.keyword,"#torsion_1");
     ff_tor.number_of_parameters = 3;
-  } 
+  }
 
   if (forcefield & FF_TYPE_CLASS2) {
     strcpy(ff_tor.keyword,"#torsion_3");
diff --git a/tools/msi2lmp/src/WriteDataFile.c b/tools/msi2lmp/src/WriteDataFile.c
index 498978406fae84e5afeedaca9d11d1b61aec89b1..c03eba71c5f2b52f7e54b53c45bb8e9b2cdd985a 100644
--- a/tools/msi2lmp/src/WriteDataFile.c
+++ b/tools/msi2lmp/src/WriteDataFile.c
@@ -144,7 +144,7 @@ void WriteDataFile(char *nameroot)
       else if (forcefield & FF_TYPE_CLASS2)
         fputs(" # class2\n\n",DatF);
     } else fputs("\n\n",DatF);
-    
+
     for (i=0; i < no_angle_types; i++) {
       fprintf(DatF, " %3i", i+1);
       for ( j = 0; j < m; j++)
diff --git a/tools/msi2lmp/src/msi2lmp.c b/tools/msi2lmp/src/msi2lmp.c
index c94d4b4d733d54f3f2fce6c8576c2cab8655bdf5..15cfddd258587b67e4ff01093fc50188589ea820 100644
--- a/tools/msi2lmp/src/msi2lmp.c
+++ b/tools/msi2lmp/src/msi2lmp.c
@@ -142,9 +142,6 @@
 * and to make the program fully dynamic. The second version used
 * fixed dimension arrays for the internal coordinates.
 *
-* John Carpenter can be contacted by sending email to
-* jec374@earthlink.net
-*
 * November 2000
 */
 
@@ -356,7 +353,7 @@ int main (int argc, char *argv[])
     if (centerflag) puts(" Output is recentered around geometrical center");
     if (hintflag) puts(" Output contains style flag hints");
     else puts(" Style flag hints disabled");
-    printf(" System translated by: %g %g %g\n",shift[0],shift[1],shift[2]); 
+    printf(" System translated by: %g %g %g\n",shift[0],shift[1],shift[2]);
   }
 
   n = 0;
@@ -374,7 +371,7 @@ int main (int argc, char *argv[])
   if (n == 0) {
     if (iflag > 0) fputs(" WARNING",stderr);
     else           fputs(" Error  ",stderr);
-    
+
     fputs("- forcefield name and class appear to be inconsistent\n\n",stderr);
     if (iflag == 0) return 7;
   }
diff --git a/tools/msi2lmp/src/msi2lmp.h b/tools/msi2lmp/src/msi2lmp.h
index 377ab1a6c32586207895d6ce43852b8055a411d7..4716f719d6ad502875759c0e942da379941a9fd6 100644
--- a/tools/msi2lmp/src/msi2lmp.h
+++ b/tools/msi2lmp/src/msi2lmp.h
@@ -24,13 +24,13 @@
 * and to make the program fully dynamic. The second version used
 * fixed dimension arrays for the internal coordinates.
 *
-* The thrid version was revised in Fall 2011 by 
+* The third version was revised in Fall 2011 by
 * Stephanie Teich-McGoldrick to add support non-orthogonal cells.
 *
 * The next revision was started in Summer/Fall 2013 by
 * Axel Kohlmeyer to improve portability to Windows compilers,
 * clean up command line parsing and improve compatibility with
-* the then current LAMMPS versions. This revision removes 
+* the then current LAMMPS versions. This revision removes
 * compatibility with the obsolete LAMMPS version written in Fortran 90.
 */