diff --git a/src/GRANULAR/pair_gran_hooke_history.cpp b/src/GRANULAR/pair_gran_hooke_history.cpp
index d70c7391e2ba896961640895ce98e07acc6a0cd5..df6db8ddfaef10e944d29e8b9bce3b5313447031 100644
--- a/src/GRANULAR/pair_gran_hooke_history.cpp
+++ b/src/GRANULAR/pair_gran_hooke_history.cpp
@@ -63,6 +63,8 @@ PairGranHookeHistory::PairGranHookeHistory(LAMMPS *lmp) : Pair(lmp)
 
 PairGranHookeHistory::~PairGranHookeHistory()
 {
+  if (copymode) return;
+
   delete [] svector;
   if (fix_history) modify->delete_fix("NEIGH_HISTORY");
 
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index c6fab2a1b10f5d9f45c6bac72de82dd1ab5d8bf1..73fa21cb41f81769cb2b7d07bc8bc5f236d2ed29 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -71,6 +71,8 @@ action atom_vec_kokkos.cpp
 action atom_vec_kokkos.h
 action atom_vec_molecular_kokkos.cpp atom_vec_molecular.cpp
 action atom_vec_molecular_kokkos.h atom_vec_molecular.h
+action atom_vec_sphere_kokkos.cpp atom_vec_sphere.cpp
+action atom_vec_sphere_kokkos.h atom_vec_sphere.h
 action bond_class2_kokkos.cpp bond_class2.cpp 
 action bond_class2_kokkos.h bond_class2.h
 action bond_fene_kokkos.cpp bond_fene.cpp
@@ -97,6 +99,8 @@ action fix_eos_table_rx_kokkos.cpp fix_eos_table_rx.cpp
 action fix_eos_table_rx_kokkos.h fix_eos_table_rx.h  
 action fix_langevin_kokkos.cpp
 action fix_langevin_kokkos.h
+action fix_neigh_history_kokkos.cpp
+action fix_neigh_history_kokkos.h
 action fix_nh_kokkos.cpp
 action fix_nh_kokkos.h
 action fix_nph_kokkos.cpp
@@ -191,6 +195,8 @@ action pair_eam_fs_kokkos.cpp pair_eam_fs.cpp
 action pair_eam_fs_kokkos.h pair_eam_fs.h
 action pair_exp6_rx_kokkos.cpp pair_exp6_rx.cpp
 action pair_exp6_rx_kokkos.h pair_exp6_rx.h
+action pair_gran_hooke_history_kokkos.h
+action pair_gran_hooke_history_kokkos.cpp
 action pair_hybrid_kokkos.cpp
 action pair_hybrid_kokkos.h
 action pair_hybrid_overlay_kokkos.cpp
diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.cpp b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2788526d7c8d71af51fb9df660301823f2bbc60
--- /dev/null
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
@@ -0,0 +1,2155 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "atom_vec_sphere_kokkos.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "force.h"
+#include "fix.h"
+#include "fix_adapt.h"
+#include "math_const.h"
+#include "memory.h"
+#include "error.h"
+#include "memory_kokkos.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+
+static const double MY_PI  = 3.14159265358979323846; // pi
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecSphereKokkos::AtomVecSphereKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{
+  molecular = 0;
+
+  comm_x_only = 1;
+  comm_f_only = 0;
+  size_forward = 3;
+  size_reverse = 6;
+  size_border = 8;
+  size_velocity = 6;
+  size_data_atom = 7;
+  size_data_vel = 7;
+  xcol_data = 5;
+
+  atom->sphere_flag = 1;
+  atom->radius_flag = atom->rmass_flag = atom->omega_flag =
+    atom->torque_flag = 1;
+
+  k_count = DAT::tdual_int_1d("atom::k_count",1);
+  atomKK = (AtomKokkos *) atom;
+  commKK = (CommKokkos *) comm;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::init()
+{
+  AtomVec::init();
+
+  // set radvary if particle diameters are time-varying due to fix adapt
+
+  radvary = 0;
+  comm_x_only = 1;
+  size_forward = 3;
+
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"adapt") == 0) {
+      FixAdapt *fix = (FixAdapt *) modify->fix[i];
+      if (fix->diamflag) {
+        radvary = 1;
+        comm_x_only = 0;
+        size_forward = 5;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by a chunk
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::grow(int n)
+{
+  if (n == 0) nmax += DELTA;
+  else nmax = n;
+  atom->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+
+  sync(Device,ALL_MASK);
+  modified(Device,ALL_MASK);
+
+  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memoryKK->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memoryKK->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+
+  memoryKK->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");
+  memoryKK->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
+  memoryKK->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
+  memoryKK->grow_kokkos(atomKK->k_radius,atomKK->radius,nmax,"atom:radius");
+  memoryKK->grow_kokkos(atomKK->k_rmass,atomKK->rmass,nmax,"atom:rmass");
+  memoryKK->grow_kokkos(atomKK->k_omega,atomKK->omega,nmax,3,"atom:omega");
+  memoryKK->grow_kokkos(atomKK->k_torque,atomKK->torque,nmax,3,"atom:torque");
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+
+  grow_reset();
+  sync(Host,ALL_MASK);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::grow_reset()
+{
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+  radius = atomKK->radius;
+  d_radius = atomKK->k_radius.d_view;
+  h_radius = atomKK->k_radius.h_view;
+  rmass = atomKK->rmass;
+  d_rmass = atomKK->k_rmass.d_view;
+  h_rmass = atomKK->k_rmass.h_view;
+  omega = atomKK->omega;
+  d_omega = atomKK->k_omega.d_view;
+  h_omega = atomKK->k_omega.h_view;
+  torque = atomKK->torque;
+  d_torque = atomKK->k_torque.d_view;
+  h_torque = atomKK->k_torque.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::copy(int i, int j, int delflag)
+{
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | RADIUS_MASK |
+            RMASS_MASK | OMEGA_MASK);
+
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  h_mask[j] = h_mask[i];
+  h_image[j] = h_image[i];
+  h_x(j,0) = h_x(i,0);
+  h_x(j,1) = h_x(i,1);
+  h_x(j,2) = h_x(i,2);
+  h_v(j,0) = h_v(i,0);
+  h_v(j,1) = h_v(i,1);
+  h_v(j,2) = h_v(i,2);
+
+  h_radius[j] = h_radius[i];
+  h_rmass[j] = h_rmass[i];
+  h_omega(j,0) = h_omega(i,0);
+  h_omega(j,1) = h_omega(i,1);
+  h_omega(j,2) = h_omega(i,2);
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK | RADIUS_MASK |
+                RMASS_MASK | OMEGA_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecSphereKokkos_PackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_float_1d _radius,_rmass;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecSphereKokkos_PackComm(
+    const typename DAT::tdual_x_array &x,
+    const typename DAT::tdual_float_1d &radius,
+    const typename DAT::tdual_float_1d &rmass,
+    const typename DAT::tdual_xfloat_2d &buf,
+    const typename DAT::tdual_int_2d &list,
+    const int & iswap,
+    const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+    const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+    _x(x.view<DeviceType>()),
+    _radius(radius.view<DeviceType>()),
+    _rmass(rmass.view<DeviceType>()),
+    _list(list.view<DeviceType>()),_iswap(iswap),
+    _xprd(xprd),_yprd(yprd),_zprd(zprd),
+    _xy(xy),_xz(xz),_yz(yz) {
+    const size_t maxsend = (buf.view<DeviceType>().extent(0)*buf.view<DeviceType>().extent(1))/3;
+    const size_t elements = 5;
+    buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+    _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+    _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+    if (PBC_FLAG == 0) {
+      _buf(i,0) = _x(j,0);
+      _buf(i,1) = _x(j,1);
+      _buf(i,2) = _x(j,2);
+    } else {
+      if (TRICLINIC == 0) {
+	_buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+	_buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+	_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+      } else {
+	_buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+	_buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+	_buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+      }
+    }
+    _buf(i,3) = _radius(j);
+    _buf(i,4) = _rmass(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_comm_kokkos(
+  const int &n,
+  const DAT::tdual_int_2d &list,
+  const int & iswap,
+  const DAT::tdual_xfloat_2d &buf,
+  const int &pbc_flag,
+  const int* const pbc)
+{
+  // Fallback to AtomVecKokkos if radvary == 0
+  if (radvary == 0)
+    return AtomVecKokkos::pack_comm_kokkos(n,list,iswap,buf,pbc_flag,pbc);
+  // Check whether to always run forward communication on the host
+  // Choose correct forward PackComm kernel
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackComm<LMPHostType,1,1> f(
+         atomKK->k_x,
+	 atomKK->k_radius,atomKK->k_rmass,
+	 buf,list,iswap,
+	 domain->xprd,domain->yprd,domain->zprd,
+	 domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackComm<LMPHostType,1,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackComm<LMPHostType,0,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackComm<LMPHostType,0,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackComm<LMPDeviceType,1,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackComm<LMPDeviceType,1,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackComm<LMPDeviceType,0,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackComm<LMPDeviceType,0,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  buf,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+  return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecSphereKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;
+  typename ArrayTypes<DeviceType>::t_float_1d _radius,_rmass;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecSphereKokkos_PackCommSelf(
+    const typename DAT::tdual_x_array &x,
+    const typename DAT::tdual_float_1d &radius,
+    const typename DAT::tdual_float_1d &rmass,
+    const int &nfirst,
+    const typename DAT::tdual_int_2d &list,
+    const int & iswap,
+    const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+    const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+    _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
+    _radius(radius.view<DeviceType>()),
+    _rmass(rmass.view<DeviceType>()),
+    _nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+    _xprd(xprd),_yprd(yprd),_zprd(zprd),
+    _xy(xy),_xz(xz),_yz(yz) {
+    _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+    _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+    if (PBC_FLAG == 0) {
+      _xw(i+_nfirst,0) = _x(j,0);
+      _xw(i+_nfirst,1) = _x(j,1);
+      _xw(i+_nfirst,2) = _x(j,2);
+    } else {
+      if (TRICLINIC == 0) {
+	_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+	_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+	_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+      } else {
+	_xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+	_xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+	_xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+      }
+    }
+    _radius(i+_nfirst) = _radius(j);
+    _rmass(i+_nfirst) = _rmass(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_comm_self(
+  const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+  const int nfirst, const int &pbc_flag, const int* const pbc) {
+  // Fallback to AtomVecKokkos if radvary == 0
+  if (radvary == 0) 
+    return AtomVecKokkos::pack_comm_self(n,list,iswap,nfirst,pbc_flag,pbc);
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,1,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,1,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,0,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,0,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,1,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,1,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,0,1> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      } else {
+	struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,0,0> f(
+          atomKK->k_x,
+	  atomKK->k_radius,atomKK->k_rmass,
+	  nfirst,list,iswap,
+	  domain->xprd,domain->yprd,domain->zprd,
+	  domain->xy,domain->xz,domain->yz,pbc);
+	Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+  return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSphereKokkos_UnpackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_float_1d _radius,_rmass;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  int _first;
+
+  AtomVecSphereKokkos_UnpackComm(
+    const typename DAT::tdual_x_array &x,
+    const typename DAT::tdual_float_1d &radius,
+    const typename DAT::tdual_float_1d &rmass,
+    const typename DAT::tdual_xfloat_2d &buf,
+    const int& first):
+    _x(x.view<DeviceType>()),
+    _radius(radius.view<DeviceType>()),
+    _rmass(rmass.view<DeviceType>()),
+    _buf(buf.view<DeviceType>()),
+    _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    _x(i+_first,0) = _buf(i,0);
+    _x(i+_first,1) = _buf(i,1);
+    _x(i+_first,2) = _buf(i,2);
+    _radius(i+_first) = _buf(i,3);
+    _rmass(i+_first) = _buf(i,4);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_comm_kokkos(
+  const int &n, const int &first,
+  const DAT::tdual_xfloat_2d &buf ) {
+  // Fallback to AtomVecKokkos if radvary == 0
+  if (radvary == 0) {
+    AtomVecKokkos::unpack_comm_kokkos(n,first,buf);
+    return;
+  }
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    struct AtomVecSphereKokkos_UnpackComm<LMPHostType> f(
+      atomKK->k_x,
+      atomKK->k_radius,atomKK->k_rmass,
+      buf,first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
+    struct AtomVecSphereKokkos_UnpackComm<LMPDeviceType> f(
+      atomKK->k_x,
+      atomKK->k_radius,atomKK->k_rmass,
+      buf,first);
+    Kokkos::parallel_for(n,f);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_comm(int n, int *list, double *buf,
+			           int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz;
+
+  if (radvary == 0) {
+    // Not sure if we need to call sync for X here
+    sync(Host,X_MASK);
+    m = 0;
+    if (pbc_flag == 0) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0);
+        buf[m++] = h_x(j,1);
+        buf[m++] = h_x(j,2);
+      }
+    } else {
+      if (domain->triclinic == 0) {
+        dx = pbc[0]*domain->xprd;
+        dy = pbc[1]*domain->yprd;
+        dz = pbc[2]*domain->zprd;
+      } else {
+        dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+        dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+        dz = pbc[2]*domain->zprd;
+      }
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+      }
+    }
+  } else {
+    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+    m = 0;
+    if (pbc_flag == 0) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0);
+        buf[m++] = h_x(j,1);
+        buf[m++] = h_x(j,2);
+        buf[m++] = h_radius[j];
+        buf[m++] = h_rmass[j];
+      }
+    } else {
+      if (domain->triclinic == 0) {
+        dx = pbc[0]*domain->xprd;
+        dy = pbc[1]*domain->yprd;
+        dz = pbc[2]*domain->zprd;
+      } else {
+        dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+        dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+        dz = pbc[2]*domain->zprd;
+      }
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_radius[j];
+        buf[m++] = h_rmass[j];
+      }
+    }
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_comm_vel(int n, int *list, double *buf,
+				       int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  if (radvary == 0) {
+    sync(Host,X_MASK|V_MASK|OMEGA_MASK);
+    m = 0;
+    if (pbc_flag == 0) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0);
+        buf[m++] = h_x(j,1);
+        buf[m++] = h_x(j,2);
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+    } else {
+      if (domain->triclinic == 0) {
+        dx = pbc[0]*domain->xprd;
+        dy = pbc[1]*domain->yprd;
+        dz = pbc[2]*domain->zprd;
+      } else {
+        dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+        dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+        dz = pbc[2]*domain->zprd;
+      }
+      if (!deform_vremap) {
+        for (i = 0; i < n; i++) {
+          j = list[i];
+          buf[m++] = h_x(j,0) + dx;
+          buf[m++] = h_x(j,1) + dy;
+          buf[m++] = h_x(j,2) + dz;
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+      } else {
+        dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+        dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+        dvz = pbc[2]*h_rate[2];
+        for (i = 0; i < n; i++) {
+          j = list[i];
+          buf[m++] = h_x(j,0) + dx;
+          buf[m++] = h_x(j,1) + dy;
+          buf[m++] = h_x(j,2) + dz;
+         if (mask[i] & deform_groupbit) {
+            buf[m++] = h_v(j,0) + dvx;
+            buf[m++] = h_v(j,1) + dvy;
+            buf[m++] = h_v(j,2) + dvz;
+          } else {
+            buf[m++] = h_v(j,0);
+            buf[m++] = h_v(j,1);
+            buf[m++] = h_v(j,2);
+          }
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+      }
+    }
+
+  } else {
+    sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+    m = 0;
+    if (pbc_flag == 0) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0);
+        buf[m++] = h_x(j,1);
+        buf[m++] = h_x(j,2);
+        buf[m++] = h_radius[j];
+        buf[m++] = h_rmass[j];
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+    } else {
+      if (domain->triclinic == 0) {
+        dx = pbc[0]*domain->xprd;
+        dy = pbc[1]*domain->yprd;
+        dz = pbc[2]*domain->zprd;
+      } else {
+        dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+        dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+        dz = pbc[2]*domain->zprd;
+      }
+      if (!deform_vremap) {
+        for (i = 0; i < n; i++) {
+          j = list[i];
+          buf[m++] = h_x(j,0) + dx;
+          buf[m++] = h_x(j,1) + dy;
+          buf[m++] = h_x(j,2) + dz;
+          buf[m++] = h_radius[j];
+          buf[m++] = h_rmass[j];
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+      } else {
+        dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+        dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+        dvz = pbc[2]*h_rate[2];
+        for (i = 0; i < n; i++) {
+          j = list[i];
+          buf[m++] = h_x(j,0) + dx;
+          buf[m++] = h_x(j,1) + dy;
+          buf[m++] = h_x(j,2) + dz;
+          buf[m++] = h_radius[j];
+          buf[m++] = h_rmass[j];
+          if (mask[i] & deform_groupbit) {
+            buf[m++] = h_v(j,0) + dvx;
+            buf[m++] = h_v(j,1) + dvy;
+            buf[m++] = h_v(j,2) + dvz;
+          } else {
+            buf[m++] = h_v(j,0);
+            buf[m++] = h_v(j,1);
+            buf[m++] = h_v(j,2);
+          }
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+      }
+    }
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_comm_hybrid(int n, int *list, double *buf)
+{
+  if (radvary == 0) return 0;
+
+  sync(Host,RADIUS_MASK|RMASS_MASK);
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    buf[m++] = h_radius[j];
+    buf[m++] = h_rmass[j];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_comm(int n, int first, double *buf)
+{
+  if (radvary == 0) {
+    int m = 0;
+    const int last = first + n;
+    for (int i = first; i < last; i++) {
+      h_x(i,0) = buf[m++];
+      h_x(i,1) = buf[m++];
+      h_x(i,2) = buf[m++];
+    }
+    modified(Host,X_MASK);
+  } else {
+    int m = 0;
+    const int last = first + n;
+    for (int i = first; i < last; i++) {
+      h_x(i,0) = buf[m++];
+      h_x(i,1) = buf[m++];
+      h_x(i,2) = buf[m++];
+      h_radius[i] = buf[m++];
+      h_rmass[i] = buf[m++];
+    }
+    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  if (radvary == 0) {
+    int m = 0;
+    const int last = first + n;
+    for (int i = first; i < last; i++) {
+      h_x(i,0) = buf[m++];
+      h_x(i,1) = buf[m++];
+      h_x(i,2) = buf[m++];
+      h_v(i,0) = buf[m++];
+      h_v(i,1) = buf[m++];
+      h_v(i,2) = buf[m++];
+      h_omega(i,0) = buf[m++];
+      h_omega(i,1) = buf[m++];
+      h_omega(i,2) = buf[m++];
+    }
+    modified(Host,X_MASK|V_MASK|OMEGA_MASK);
+  } else {
+    int m = 0;
+    const int last = first + n;
+    for (int i = first; i < last; i++) {
+      h_x(i,0) = buf[m++];
+      h_x(i,1) = buf[m++];
+      h_x(i,2) = buf[m++];
+      h_radius[i] = buf[m++];
+      h_rmass[i] = buf[m++];
+      h_v(i,0) = buf[m++];
+      h_v(i,1) = buf[m++];
+      h_v(i,2) = buf[m++];
+      h_omega(i,0) = buf[m++];
+      h_omega(i,1) = buf[m++];
+      h_omega(i,2) = buf[m++];
+    }
+    modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_comm_hybrid(int n, int first, double *buf)
+{
+  if (radvary == 0) return 0;
+
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    h_radius[i] = buf[m++];
+    h_rmass[i] = buf[m++];
+  }
+  modified(Host,RADIUS_MASK|RMASS_MASK);
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_reverse(int n, int first, double *buf)
+{
+  if(n > 0)
+    sync(Host,F_MASK|TORQUE_MASK);
+
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    buf[m++] = h_f(i,0);
+    buf[m++] = h_f(i,1);
+    buf[m++] = h_f(i,2);
+    buf[m++] = h_torque(i,0);
+    buf[m++] = h_torque(i,1);
+    buf[m++] = h_torque(i,2);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_reverse_hybrid(int n, int first, double *buf)
+{
+  if(n > 0)
+    sync(Host,TORQUE_MASK);
+
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    buf[m++] = h_torque(i,0);
+    buf[m++] = h_torque(i,1);
+    buf[m++] = h_torque(i,2);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  if(n > 0) {
+    sync(Host,F_MASK|TORQUE_MASK);
+    modified(Host,F_MASK|TORQUE_MASK);
+  }
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    h_f(j,0) += buf[m++];
+    h_f(j,1) += buf[m++];
+    h_f(j,2) += buf[m++];
+    h_torque(j,0) += buf[m++];
+    h_torque(j,1) += buf[m++];
+    h_torque(j,2) += buf[m++];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_reverse_hybrid(int n, int *list, double *buf)
+{
+  if(n > 0) {
+    sync(Host,TORQUE_MASK);
+    modified(Host,TORQUE_MASK);
+  }
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    h_torque(j,0) += buf[m++];
+    h_torque(j,1) += buf[m++];
+    h_torque(j,2) += buf[m++];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG>
+struct AtomVecSphereKokkos_PackBorder {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
+  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  const typename ArrayTypes<DeviceType>::t_int_1d _type;
+  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_float_1d _radius,_rmass;
+  X_FLOAT _dx,_dy,_dz;
+
+  AtomVecSphereKokkos_PackBorder(const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
+			      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
+			      const int & iswap,
+			      const typename ArrayTypes<DeviceType>::t_x_array &x,
+			      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+			      const typename ArrayTypes<DeviceType>::t_int_1d &type,
+			      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
+			      const typename ArrayTypes<DeviceType>::t_float_1d &radius,
+			      const typename ArrayTypes<DeviceType>::t_float_1d &rmass,
+			      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
+    _buf(buf),_list(list),_iswap(iswap),
+    _x(x),_tag(tag),_type(type),_mask(mask),
+    _radius(radius),
+    _rmass(rmass),
+    _dx(dx),_dy(dy),_dz(dz) {}
+  
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+    if (PBC_FLAG == 0) {
+      _buf(i,0) = _x(j,0);
+      _buf(i,1) = _x(j,1);
+      _buf(i,2) = _x(j,2);
+    } else {
+      _buf(i,0) = _x(j,0) + _dx;
+      _buf(i,1) = _x(j,1) + _dy;
+      _buf(i,2) = _x(j,2) + _dz;
+    }
+    _buf(i,3) = _tag(j);
+    _buf(i,4) = _type(j);
+    _buf(i,5) = _mask(j);
+    _buf(i,6) = _radius(j);
+    _buf(i,7) = _rmass(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
+					    int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  X_FLOAT dx,dy,dz;
+
+  // This was in atom_vec_dpd_kokkos but doesn't appear in any other atom_vec
+  sync(space,ALL_MASK);
+
+  if (pbc_flag != 0) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if(space==Host) {
+      AtomVecSphereKokkos_PackBorder<LMPHostType,1> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_radius,h_rmass,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecSphereKokkos_PackBorder<LMPDeviceType,1> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_radius,d_rmass,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+  } else {
+    dx = dy = dz = 0;
+    if(space==Host) {
+      AtomVecSphereKokkos_PackBorder<LMPHostType,0> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_radius,h_rmass,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecSphereKokkos_PackBorder<LMPDeviceType,0> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_radius,d_rmass,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+  }
+  return n*6;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz;
+
+  sync(Host,ALL_MASK);
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag[j]).d;
+      buf[m++] = ubuf(h_type[j]).d;
+      buf[m++] = ubuf(h_mask[j]).d;
+      buf[m++] = h_radius[j];
+      buf[m++] = h_rmass[j];
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+      buf[m++] = ubuf(h_tag[j]).d;
+      buf[m++] = ubuf(h_type[j]).d;
+      buf[m++] = ubuf(h_mask[j]).d;
+      buf[m++] = h_radius[j];
+      buf[m++] = h_rmass[j];
+    }
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  sync(Host,ALL_MASK);
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag[j]).d;
+      buf[m++] = ubuf(h_type[j]).d;
+      buf[m++] = ubuf(h_mask[j]).d;
+      buf[m++] = h_radius[j];
+      buf[m++] = h_rmass[j];
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      buf[m++] = h_omega(j,0);
+      buf[m++] = h_omega(j,1);
+      buf[m++] = h_omega(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        buf[m++] = h_radius[j];
+        buf[m++] = h_rmass[j];
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        buf[m++] = h_radius[j];
+        buf[m++] = h_rmass[j];
+        if (mask[i] & deform_groupbit) {
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+    }
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_border_hybrid(int n, int *list, double *buf)
+{
+  sync(Host,RADIUS_MASK|RMASS_MASK);
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    buf[m++] = h_radius[j];
+    buf[m++] = h_rmass[j];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSphereKokkos_UnpackBorder {
+  typedef DeviceType device_type;
+
+  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  typename ArrayTypes<DeviceType>::t_int_1d _type;
+  typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_float_1d _radius,_rmass;
+  int _first;
+
+  AtomVecSphereKokkos_UnpackBorder(
+    const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
+    typename ArrayTypes<DeviceType>::t_x_array &x,
+    typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+    typename ArrayTypes<DeviceType>::t_int_1d &type,
+    typename ArrayTypes<DeviceType>::t_int_1d &mask,
+    const typename ArrayTypes<DeviceType>::t_float_1d &radius,
+    const typename ArrayTypes<DeviceType>::t_float_1d &rmass,
+    const int& first):
+    _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),
+    _radius(radius),
+    _rmass(rmass),
+    _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    _x(i+_first,0) = _buf(i,0);
+    _x(i+_first,1) = _buf(i,1);
+    _x(i+_first,2) = _buf(i,2);
+    _tag(i+_first) = static_cast<int> (_buf(i,3));
+    _type(i+_first) = static_cast<int>  (_buf(i,4));
+    _mask(i+_first) = static_cast<int>  (_buf(i,5));
+    _radius(i+_first) = _buf(i,6);
+    _rmass(i+_first) = _buf(i,7);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_border_kokkos(const int &n, const int &first,
+					       const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 RADIUS_MASK|RMASS_MASK);
+  while (first+n >= nmax) grow(0);
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+	         RADIUS_MASK|RMASS_MASK);
+  if(space==Host) {
+    struct AtomVecSphereKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
+      h_x,h_tag,h_type,h_mask,
+      h_radius,h_rmass,
+      first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    struct AtomVecSphereKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),
+      d_x,d_tag,d_type,d_mask,
+      d_radius,d_rmass,
+      first);
+    Kokkos::parallel_for(n,f);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_border(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    if (i == nmax) grow(0);
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+    h_tag[i] = (tagint) ubuf(buf[m++]).i;
+    h_type[i] = (int) ubuf(buf[m++]).i;
+    h_mask[i] = (int) ubuf(buf[m++]).i;
+    h_radius[i] = buf[m++];
+    h_rmass[i] = buf[m++];
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->
+        unpack_border(n,first,&buf[m]);
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK);
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    if (i == nmax) grow(0);
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+    h_tag[i] = (tagint) ubuf(buf[m++]).i;
+    h_type[i] = (int) ubuf(buf[m++]).i;
+    h_mask[i] = (int) ubuf(buf[m++]).i;
+    h_radius[i] = buf[m++];
+    h_rmass[i] = buf[m++];
+    h_v(i,0) = buf[m++];
+    h_v(i,1) = buf[m++];
+    h_v(i,2) = buf[m++];
+    h_omega(i,0) = buf[m++];
+    h_omega(i,1) = buf[m++];
+    h_omega(i,2) = buf[m++];
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->
+        unpack_border(n,first,&buf[m]);
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_border_hybrid(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    h_radius[i] = buf[m++];
+    h_rmass[i] = buf[m++];
+  }
+  modified(Host,RADIUS_MASK|RMASS_MASK);
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSphereKokkos_PackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array_randomread _x;
+  typename AT::t_v_array_randomread _v;
+  typename AT::t_tagint_1d_randomread _tag;
+  typename AT::t_int_1d_randomread _type;
+  typename AT::t_int_1d_randomread _mask;
+  typename AT::t_imageint_1d_randomread _image;
+  typename AT::t_float_1d_randomread _radius,_rmass;
+  typename AT::t_v_array_randomread _omega;
+  typename AT::t_x_array _xw;
+  typename AT::t_v_array _vw;
+  typename AT::t_tagint_1d _tagw;
+  typename AT::t_int_1d _typew;
+  typename AT::t_int_1d _maskw;
+  typename AT::t_imageint_1d _imagew;
+  typename AT::t_float_1d _radiusw,_rmassw;
+  typename AT::t_v_array _omegaw;
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d_const _sendlist;
+  typename AT::t_int_1d_const _copylist;
+  int _nlocal,_dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecSphereKokkos_PackExchangeFunctor(
+    const AtomKokkos* atom,
+    const typename AT::tdual_xfloat_2d buf,
+    typename AT::tdual_int_1d sendlist,
+    typename AT::tdual_int_1d copylist,int nlocal, int dim,X_FLOAT lo, X_FLOAT hi):
+    _x(atom->k_x.view<DeviceType>()),
+    _v(atom->k_v.view<DeviceType>()),
+    _tag(atom->k_tag.view<DeviceType>()),
+    _type(atom->k_type.view<DeviceType>()),
+    _mask(atom->k_mask.view<DeviceType>()),
+    _image(atom->k_image.view<DeviceType>()),
+    _radius(atom->k_radius.view<DeviceType>()),
+    _rmass(atom->k_rmass.view<DeviceType>()),
+    _omega(atom->k_omega.view<DeviceType>()),
+    _xw(atom->k_x.view<DeviceType>()),
+    _vw(atom->k_v.view<DeviceType>()),
+    _tagw(atom->k_tag.view<DeviceType>()),
+    _typew(atom->k_type.view<DeviceType>()),
+    _maskw(atom->k_mask.view<DeviceType>()),
+    _imagew(atom->k_image.view<DeviceType>()),
+    _radiusw(atom->k_radius.view<DeviceType>()),
+    _rmassw(atom->k_rmass.view<DeviceType>()),
+    _omegaw(atom->k_omega.view<DeviceType>()),
+    _sendlist(sendlist.template view<DeviceType>()),
+    _copylist(copylist.template view<DeviceType>()),
+    _nlocal(nlocal),_dim(dim),
+    _lo(lo),_hi(hi){
+    const size_t elements = 16;
+    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/elements;
+
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &mysend) const {
+    const int i = _sendlist(mysend);
+    _buf(mysend,0) = 16;
+    _buf(mysend,1) = _x(i,0);
+    _buf(mysend,2) = _x(i,1);
+    _buf(mysend,3) = _x(i,2);
+    _buf(mysend,4) = _v(i,0);
+    _buf(mysend,5) = _v(i,1);
+    _buf(mysend,6) = _v(i,2);
+    _buf(mysend,7) = _tag[i];
+    _buf(mysend,8) = _type[i];
+    _buf(mysend,9) = _mask[i];
+    _buf(mysend,10) = _image[i];
+    _buf(mysend,11) = _radius[i];
+    _buf(mysend,12) = _rmass[i];
+    _buf(mysend,13) = _omega(i,0);
+    _buf(mysend,14) = _omega(i,1);
+    _buf(mysend,15) = _omega(i,2);
+    const int j = _copylist(mysend);
+
+    if (j>-1) {
+      _xw(i,0) = _x(j,0);
+      _xw(i,1) = _x(j,1);
+      _xw(i,2) = _x(j,2);
+      _vw(i,0) = _v(j,0);
+      _vw(i,1) = _v(j,1);
+      _vw(i,2) = _v(j,2);
+      _tagw[i] = _tag(j);
+      _typew[i] = _type(j);
+      _maskw[i] = _mask(j);
+      _imagew[i] = _image(j);
+      _radiusw[i] = _radius(j);
+      _rmassw[i] = _rmass(j);
+      _omegaw(i,0) = _omega(j,0);
+      _omegaw(i,1) = _omega(j,1);
+      _omegaw(i,2) = _omega(j,2);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi )
+{
+  if(nsend > (int) (k_buf.view<LMPHostType>().extent(0)*k_buf.view<LMPHostType>().extent(1))/16) {
+    int newsize = nsend*17/k_buf.view<LMPHostType>().extent(1)+1;
+    k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
+  }
+  sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+             MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
+             OMEGA_MASK);
+
+  if(space == Host) {
+    AtomVecSphereKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  } else {
+    AtomVecSphereKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  }
+  return nsend*16;
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_exchange(int i, double *buf)
+{
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
+            OMEGA_MASK);
+
+
+  int m = 1;
+  buf[m++] = h_x(i,0);
+  buf[m++] = h_x(i,1);
+  buf[m++] = h_x(i,2);
+  buf[m++] = h_v(i,0);
+  buf[m++] = h_v(i,1);
+  buf[m++] = h_v(i,2);
+  buf[m++] = ubuf(h_tag[i]).d;
+  buf[m++] = ubuf(h_type[i]).d;
+  buf[m++] = ubuf(h_mask[i]).d;
+  buf[m++] = ubuf(h_image[i]).d;
+
+  buf[m++] = h_radius[i];
+  buf[m++] = h_rmass[i];
+  buf[m++] = h_omega(i,0);
+  buf[m++] = h_omega(i,1);
+  buf[m++] = h_omega(i,2);
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
+
+  buf[0] = m;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSphereKokkos_UnpackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array _x;
+  typename AT::t_v_array _v;
+  typename AT::t_tagint_1d _tag;
+  typename AT::t_int_1d _type;
+  typename AT::t_int_1d _mask;
+  typename AT::t_imageint_1d _image;
+  typename AT::t_float_1d _radius;
+  typename AT::t_float_1d _rmass;
+  typename AT::t_v_array _omega;
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d _nlocal;
+  int _dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecSphereKokkos_UnpackExchangeFunctor(
+    const AtomKokkos* atom,
+    const typename AT::tdual_xfloat_2d buf,
+    typename AT::tdual_int_1d nlocal,
+    int dim, X_FLOAT lo, X_FLOAT hi):
+    _x(atom->k_x.view<DeviceType>()),
+    _v(atom->k_v.view<DeviceType>()),
+    _tag(atom->k_tag.view<DeviceType>()),
+    _type(atom->k_type.view<DeviceType>()),
+    _mask(atom->k_mask.view<DeviceType>()),
+    _image(atom->k_image.view<DeviceType>()),
+    _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
+    _lo(lo),_hi(hi){
+    const size_t elements = 16;
+    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/elements;
+
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &myrecv) const {
+    X_FLOAT x = _buf(myrecv,_dim+1);
+    if (x >= _lo && x < _hi) {
+      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);
+      _x(i,0) = _buf(myrecv,1);
+      _x(i,1) = _buf(myrecv,2);
+      _x(i,2) = _buf(myrecv,3);
+      _v(i,0) = _buf(myrecv,4);
+      _v(i,1) = _buf(myrecv,5);
+      _v(i,2) = _buf(myrecv,6);
+      _tag[i] = _buf(myrecv,7);
+      _type[i] = _buf(myrecv,8);
+      _mask[i] = _buf(myrecv,9);
+      _image[i] = _buf(myrecv,10);
+      _radius[i] = _buf(myrecv,11);
+      _rmass[i] = _buf(myrecv,12);
+      _omega(i,0) = _buf(myrecv,13);
+      _omega(i,1) = _buf(myrecv,14);
+      _omega(i,2) = _buf(myrecv,15);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  if(space == Host) {
+    k_count.h_view(0) = nlocal;
+    AtomVecSphereKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/16,f);
+  } else {
+    k_count.h_view(0) = nlocal;
+    k_count.modify<LMPHostType>();
+    k_count.sync<LMPDeviceType>();
+    AtomVecSphereKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/16,f);
+    k_count.modify<LMPDeviceType>();
+    k_count.sync<LMPHostType>();
+  }
+
+  modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                 MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
+	         OMEGA_MASK);
+
+  return k_count.h_view(0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_exchange(double *buf)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  int m = 1;
+  h_x(nlocal,0) = buf[m++];
+  h_x(nlocal,1) = buf[m++];
+  h_x(nlocal,2) = buf[m++];
+  h_v(nlocal,0) = buf[m++];
+  h_v(nlocal,1) = buf[m++];
+  h_v(nlocal,2) = buf[m++];
+  h_tag[nlocal] = (tagint) ubuf(buf[m++]).i;
+  h_type[nlocal] = (int) ubuf(buf[m++]).i;
+  h_mask[nlocal] = (int) ubuf(buf[m++]).i;
+  h_image[nlocal] = (imageint) ubuf(buf[m++]).i;
+
+  h_radius[nlocal] = buf[m++];
+  h_rmass[nlocal] = buf[m++];
+  h_omega(nlocal,0) = buf[m++];
+  h_omega(nlocal,1) = buf[m++];
+  h_omega(nlocal,2) = buf[m++];
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      m += modify->fix[atom->extra_grow[iextra]]->
+        unpack_exchange(nlocal,&buf[m]);
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK | RADIUS_MASK | RMASS_MASK |
+           OMEGA_MASK);
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   include extra data stored by fixes
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::size_restart()
+{
+  int i;
+
+  int nlocal = atom->nlocal;
+  int n = 16 * nlocal;
+
+  if (atom->nextra_restart)
+    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+      for (i = 0; i < nlocal; i++)
+        n += modify->fix[atom->extra_restart[iextra]]->size_restart(i);
+
+  return n;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   molecular types may be negative, but write as positive
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_restart(int i, double *buf)
+{
+  sync(Host,X_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | V_MASK |
+            RADIUS_MASK | RMASS_MASK | OMEGA_MASK);
+
+  int m = 1;
+  buf[m++] = h_x(i,0);
+  buf[m++] = h_x(i,1);
+  buf[m++] = h_x(i,2);
+  buf[m++] = ubuf(h_tag[i]).d;
+  buf[m++] = ubuf(h_type[i]).d;
+  buf[m++] = ubuf(h_mask[i]).d;
+  buf[m++] = ubuf(h_image[i]).d;
+  buf[m++] = h_v(i,0);
+  buf[m++] = h_v(i,1);
+  buf[m++] = h_v(i,2);
+
+  buf[m++] = h_radius[i];
+  buf[m++] = h_rmass[i];
+  buf[m++] = h_omega(i,0);
+  buf[m++] = h_omega(i,1);
+  buf[m++] = h_omega(i,2);
+
+  if (atom->nextra_restart)
+    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+      m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
+
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::unpack_restart(double *buf)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    grow(0);
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+
+  int m = 1;
+  h_x(nlocal,0) = buf[m++];
+  h_x(nlocal,1) = buf[m++];
+  h_x(nlocal,2) = buf[m++];
+  h_tag[nlocal] = (tagint) ubuf(buf[m++]).i;
+  h_type[nlocal] = (int) ubuf(buf[m++]).i;
+  h_mask[nlocal] = (int) ubuf(buf[m++]).i;
+  h_image[nlocal] = (imageint) ubuf(buf[m++]).i;
+  h_v(nlocal,0) = buf[m++];
+  h_v(nlocal,1) = buf[m++];
+  h_v(nlocal,2) = buf[m++];
+
+  h_radius[nlocal] = buf[m++];
+  h_rmass[nlocal] = buf[m++];
+  h_omega(nlocal,0) = buf[m++];
+  h_omega(nlocal,1) = buf[m++];
+  h_omega(nlocal,2) = buf[m++];
+
+  double **extra = atom->extra;
+  if (atom->nextra_store) {
+    int size = static_cast<int> (buf[0]) - m;
+    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
+  }
+
+  modified(Host,X_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK | V_MASK |
+	        RADIUS_MASK | RMASS_MASK | OMEGA_MASK);
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   set other values to defaults
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::create_atom(int itype, double *coord)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    atomKK->modified(Host,ALL_MASK);
+    grow(0);
+  }
+  atomKK->modified(Host,ALL_MASK);
+
+  h_tag[nlocal] = 0;
+  h_type[nlocal] = itype;
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+  h_mask[nlocal] = 1;
+  h_image[nlocal] = ((imageint) IMGMAX << IMG2BITS) |
+    ((imageint) IMGMAX << IMGBITS) | IMGMAX;
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+
+  h_radius[nlocal] = 0.5;
+  h_rmass[nlocal] = 4.0*MY_PI/3.0 * h_radius[nlocal]*h_radius[nlocal]*h_radius[nlocal];
+  h_omega(nlocal,0) = 0.0;
+  h_omega(nlocal,1) = 0.0;
+  h_omega(nlocal,2) = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   initialize other atom quantities
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::data_atom(double *coord, imageint imagetmp, char **values)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  tag[nlocal] = ATOTAGINT(values[0]);
+  type[nlocal] = atoi(values[1]);
+  if (type[nlocal] <= 0 || type[nlocal] > atom->ntypes)
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  radius[nlocal] = 0.5 * atof(values[2]);
+  if (radius[nlocal] < 0.0)
+    error->one(FLERR,"Invalid radius in Atoms section of data file");
+
+  double density = atof(values[3]);
+  if (density <= 0.0)
+    error->one(FLERR,"Invalid density in Atoms section of data file");
+
+  if (radius[nlocal] == 0.0) rmass[nlocal] = density;
+  else
+    rmass[nlocal] = 4.0*MY_PI/3.0 *
+      radius[nlocal]*radius[nlocal]*radius[nlocal] * density;
+
+  x[nlocal][0] = coord[0];
+  x[nlocal][1] = coord[1];
+  x[nlocal][2] = coord[2];
+
+  image[nlocal] = imagetmp;
+
+  mask[nlocal] = 1;
+  v[nlocal][0] = 0.0;
+  v[nlocal][1] = 0.0;
+  v[nlocal][2] = 0.0;
+  omega[nlocal][0] = 0.0;
+  omega[nlocal][1] = 0.0;
+  omega[nlocal][2] = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack hybrid quantities from one line in Atoms section of data file
+   initialize other atom quantities for this sub-style
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, char **values)
+{
+  radius[nlocal] = 0.5 * atof(values[0]);
+  if (radius[nlocal] < 0.0)
+    error->one(FLERR,"Invalid radius in Atoms section of data file");
+
+  double density = atof(values[1]);
+  if (density <= 0.0)
+    error->one(FLERR,"Invalid density in Atoms section of data file");
+
+  if (radius[nlocal] == 0.0) rmass[nlocal] = density;
+  else
+    rmass[nlocal] = 4.0*MY_PI/3.0 *
+      radius[nlocal]*radius[nlocal]*radius[nlocal] * density;
+
+  return 2;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Velocities section of data file
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::data_vel(int m, char **values)
+{
+  sync(Host,V_MASK|OMEGA_MASK);
+  h_v(m,0) = atof(values[0]);
+  h_v(m,1) = atof(values[1]);
+  h_v(m,2) = atof(values[2]);
+  h_omega(m,0) = atof(values[3]);
+  h_omega(m,1) = atof(values[4]);
+  h_omega(m,2) = atof(values[5]);
+  modified(Host,V_MASK|OMEGA_MASK);
+}
+
+/* ----------------------------------------------------------------------
+   unpack hybrid quantities from one line in Velocities section of data file
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::data_vel_hybrid(int m, char **values)
+{
+  sync(Host,OMEGA_MASK);
+  omega[m][0] = atof(values[0]);
+  omega[m][1] = atof(values[1]);
+  omega[m][2] = atof(values[2]);
+  modified(Host,OMEGA_MASK);
+  return 3;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::pack_data(double **buf)
+{
+  atomKK->sync(Host,TAG_MASK|TYPE_MASK|RADIUS_MASK|RMASS_MASK|X_MASK|IMAGE_MASK);
+
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag[i]).d;
+    buf[i][1] = ubuf(h_type[i]).d;
+    buf[i][2] = 2.0*h_radius[i];
+    if (h_radius[i] == 0.0) buf[i][3] = h_rmass[i];
+    else
+      buf[i][3] = h_rmass[i] / (4.0*MY_PI/3.0 * h_radius[i]*h_radius[i]*h_radius[i]);
+    buf[i][4] = h_x(i,0);
+    buf[i][5] = h_x(i,1);
+    buf[i][6] = h_x(i,2);
+    buf[i][7] = ubuf((h_image[i] & IMGMASK) - IMGMAX).d;
+    buf[i][8] = ubuf((h_image[i] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    buf[i][9] = ubuf((h_image[i] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack hybrid atom info for data file
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_data_hybrid(int i, double *buf)
+{
+  atomKK->sync(Host,RADIUS_MASK|RMASS_MASK);
+
+  buf[0] = 2.0*h_radius[i];
+  if (h_radius[i] == 0.0) buf[1] = h_rmass[i];
+  else buf[1] = h_rmass[i] / (4.0*MY_PI/3.0 * h_radius[i]*h_radius[i]*h_radius[i]);
+  return 2;
+}
+
+/* ----------------------------------------------------------------------
+   write atom info to data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int i = 0; i < n; i++)
+    fprintf(fp,TAGINT_FORMAT
+            " %d %-1.16e %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
+            (tagint) ubuf(buf[i][0]).i,(int) ubuf(buf[i][1]).i,
+            buf[i][2],buf[i][3],
+            buf[i][4],buf[i][5],buf[i][6],
+            (int) ubuf(buf[i][7]).i,(int) ubuf(buf[i][8]).i,
+            (int) ubuf(buf[i][9]).i);
+}
+
+/* ----------------------------------------------------------------------
+   write hybrid atom info to data file
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::write_data_hybrid(FILE *fp, double *buf)
+{
+  fprintf(fp," %-1.16e %-1.16e",buf[0],buf[1]);
+  return 2;
+}
+
+/* ----------------------------------------------------------------------
+   pack velocity info for data file
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::pack_vel(double **buf)
+{
+  sync(Host,TAG_MASK|V_MASK|OMEGA_MASK);
+
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag[i]).d;
+    buf[i][1] = h_v(i,0);
+    buf[i][2] = h_v(i,1);
+    buf[i][3] = h_v(i,2);
+    buf[i][4] = h_omega(i,0);
+    buf[i][5] = h_omega(i,1);
+    buf[i][6] = h_omega(i,2);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack hybrid velocity info for data file
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::pack_vel_hybrid(int i, double *buf)
+{
+  sync(Host,OMEGA_MASK);
+
+  buf[0] = h_omega(i,0);
+  buf[1] = h_omega(i,1);
+  buf[2] = h_omega(i,2);
+  return 3;
+}
+
+/* ----------------------------------------------------------------------
+   write velocity info to data file
+------------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::write_vel(FILE *fp, int n, double **buf)
+{
+  for (int i = 0; i < n; i++)
+    fprintf(fp,TAGINT_FORMAT
+            " %-1.16e %-1.16e %-1.16e %-1.16e %-1.16e %-1.16e\n",
+            (tagint) ubuf(buf[i][0]).i,buf[i][1],buf[i][2],buf[i][3],
+            buf[i][4],buf[i][5],buf[i][6]);
+}
+
+/* ----------------------------------------------------------------------
+   write hybrid velocity info to data file
+------------------------------------------------------------------------- */
+
+int AtomVecSphereKokkos::write_vel_hybrid(FILE *fp, double *buf)
+{
+  fprintf(fp," %-1.16e %-1.16e %-1.16e",buf[0],buf[1],buf[2]);
+  return 3;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+bigint AtomVecSphereKokkos::memory_usage()
+{
+  bigint bytes = 0;
+
+  if (atom->memcheck("tag")) bytes += memory->usage(tag,nmax);
+  if (atom->memcheck("type")) bytes += memory->usage(type,nmax);
+  if (atom->memcheck("mask")) bytes += memory->usage(mask,nmax);
+  if (atom->memcheck("image")) bytes += memory->usage(image,nmax);
+  if (atom->memcheck("x")) bytes += memory->usage(x,nmax,3);
+  if (atom->memcheck("v")) bytes += memory->usage(v,nmax,3);
+  if (atom->memcheck("f")) bytes += memory->usage(f,nmax*comm->nthreads,3);
+
+  if (atom->memcheck("radius")) bytes += memory->usage(radius,nmax);
+  if (atom->memcheck("rmass")) bytes += memory->usage(rmass,nmax);
+  if (atom->memcheck("omega")) bytes += memory->usage(omega,nmax,3);
+  if (atom->memcheck("torque"))
+    bytes += memory->usage(torque,nmax*comm->nthreads,3);
+
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::sync(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {
+    if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+    if (mask & RADIUS_MASK) atomKK->k_radius.sync<LMPDeviceType>();
+    if (mask & RMASS_MASK) atomKK->k_rmass.sync<LMPDeviceType>();
+    if (mask & OMEGA_MASK) atomKK->k_omega.sync<LMPDeviceType>();
+    if (mask & TORQUE_MASK) atomKK->k_torque.sync<LMPDeviceType>();
+  } else {
+    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+    if (mask & RADIUS_MASK) atomKK->k_radius.sync<LMPHostType>();
+    if (mask & RMASS_MASK) atomKK->k_rmass.sync<LMPHostType>();
+    if (mask & OMEGA_MASK) atomKK->k_omega.sync<LMPHostType>();
+    if (mask & TORQUE_MASK) atomKK->k_torque.sync<LMPHostType>();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & RADIUS_MASK) && atomKK->k_radius.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_float_1d>(atomKK->k_radius,space);
+    if ((mask & RMASS_MASK) && atomKK->k_rmass.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_float_1d>(atomKK->k_rmass,space);
+    if ((mask & OMEGA_MASK) && atomKK->k_omega.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_omega,space);
+    if ((mask & TORQUE_MASK) && atomKK->k_torque.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_torque,space);
+  } else {
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & RADIUS_MASK) && atomKK->k_radius.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_float_1d>(atomKK->k_radius,space);
+    if ((mask & RMASS_MASK) && atomKK->k_rmass.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_float_1d>(atomKK->k_rmass,space);
+    if ((mask & OMEGA_MASK) && atomKK->k_omega.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_omega,space);
+    if ((mask & TORQUE_MASK) && atomKK->k_torque.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_torque,space);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSphereKokkos::modified(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {
+    if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+    if (mask & RADIUS_MASK) atomKK->k_radius.modify<LMPDeviceType>();
+    if (mask & RMASS_MASK) atomKK->k_rmass.modify<LMPDeviceType>();
+    if (mask & OMEGA_MASK) atomKK->k_omega.modify<LMPDeviceType>();
+    if (mask & TORQUE_MASK) atomKK->k_torque.modify<LMPDeviceType>();
+  } else {
+    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    if (mask & RADIUS_MASK) atomKK->k_radius.modify<LMPHostType>();
+    if (mask & RMASS_MASK) atomKK->k_rmass.modify<LMPHostType>();
+    if (mask & OMEGA_MASK) atomKK->k_omega.modify<LMPHostType>();
+    if (mask & TORQUE_MASK) atomKK->k_torque.modify<LMPHostType>();
+  }
+}
diff --git a/src/KOKKOS/atom_vec_sphere_kokkos.h b/src/KOKKOS/atom_vec_sphere_kokkos.h
new file mode 100644
index 0000000000000000000000000000000000000000..d6739cb7143f2028276859078d8fd5f1d65d974d
--- /dev/null
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.h
@@ -0,0 +1,155 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(sphere/kk,AtomVecSphereKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_SPHERE_KOKKOS_H
+#define LMP_ATOM_VEC_SPHERE_KOKKOS_H
+
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecSphereKokkos : public AtomVecKokkos {
+ public:
+  AtomVecSphereKokkos(class LAMMPS *);
+  ~AtomVecSphereKokkos() {}
+  void init();
+  void grow(int);
+  void grow_reset();
+  void copy(int, int, int);
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  int pack_comm_hybrid(int, int *, double *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int unpack_comm_hybrid(int, int, double *);
+  int pack_reverse(int, int, double *);
+  int pack_reverse_hybrid(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int unpack_reverse_hybrid(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  int pack_border_hybrid(int, int *, double *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int unpack_border_hybrid(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, imageint, char **);
+  int data_atom_hybrid(int, char **);
+  void data_vel(int, char **);
+  int data_vel_hybrid(int, char **);
+  void pack_data(double **);
+  int pack_data_hybrid(int, double *);
+  void write_data(FILE *, int, double **);
+  int write_data_hybrid(FILE *, double *);
+  void pack_vel(double **);
+  int pack_vel_hybrid(int, double *);
+  void write_vel(FILE *, int, double **);
+  int write_vel_hybrid(FILE *, double *);
+  bigint memory_usage();
+
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+
+ private:
+  tagint *tag;
+  int *type,*mask;
+  imageint *image;
+  double **x,**v,**f;
+  double *radius,*rmass;
+  double **omega,**torque;
+  int radvary;
+
+  DAT::t_tagint_1d d_tag;
+  HAT::t_tagint_1d h_tag;
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  DAT::t_float_1d d_radius;
+  HAT::t_float_1d h_radius;
+  DAT::t_float_1d d_rmass;
+  HAT::t_float_1d h_rmass;
+  DAT::t_v_array d_omega;
+  HAT::t_v_array h_omega;
+  DAT::t_f_array d_torque;
+  HAT::t_f_array h_torque;
+
+  DAT::tdual_int_1d k_count;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+E: Invalid radius in Atoms section of data file
+
+Radius must be >= 0.0.
+
+E: Invalid density in Atoms section of data file
+
+Density value cannot be <= 0.0.
+
+*/
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d5277c0188e58593db7759e84476907b25427c69
--- /dev/null
+++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp
@@ -0,0 +1,346 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "fix_neigh_history_kokkos.h"
+#include "atom_kokkos.h"
+#include "error.h"
+#include "memory_kokkos.h"
+#include "neigh_list_kokkos.h"
+#include "pair_kokkos.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+FixNeighHistoryKokkos<DeviceType>::FixNeighHistoryKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixNeighHistory(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *)atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  memory->destroy(npartner);
+  memory->destroy(partner);
+  memory->destroy(valuepartner);
+
+  maxpartner = 8;
+  grow_arrays(atom->nmax);
+
+  d_resize = typename ArrayTypes<DeviceType>::t_int_scalar("FixNeighHistoryKokkos::resize");
+#ifndef KOKKOS_USE_CUDA_UVM
+  h_resize = Kokkos::create_mirror_view(d_resize);
+#else
+  h_resize = d_resize;
+#endif
+  h_resize() = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+FixNeighHistoryKokkos<DeviceType>::~FixNeighHistoryKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_npartner, npartner);
+  memoryKK->destroy_kokkos(k_partner, partner);
+  memoryKK->destroy_kokkos(k_valuepartner, valuepartner);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::init()
+{
+  if (atomKK->tag_enable == 0)
+    error->all(FLERR,"Neighbor history requires atoms have IDs");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::pre_exchange()
+{
+  copymode = 1;
+  
+  h_resize() = 1;
+  while (h_resize() > 0) {
+    FixNeighHistoryKokkosZeroPartnerCountFunctor<DeviceType> zero(this);
+    Kokkos::parallel_for(nlocal_neigh,zero);
+
+    h_resize() = 0;
+    deep_copy(d_resize, h_resize);
+
+    FixNeighHistoryKokkosPreExchangeFunctor<DeviceType> f(this);
+    Kokkos::parallel_for(nlocal_neigh,f);
+
+    deep_copy(h_resize, d_resize);
+    if (h_resize() > 0) {
+      maxpartner += 8;
+      memoryKK->grow_kokkos(k_partner,partner,atom->nmax,maxpartner,"neighbor_history:partner");
+      memoryKK->grow_kokkos(k_valuepartner,valuepartner,atom->nmax,dnum*maxpartner,"neighbor_history:valuepartner");
+    }
+  }
+
+  copymode = 0;
+
+  comm->maxexchange_fix = MAX(comm->maxexchange_fix,(dnum+1)*maxpartner+1);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::zero_partner_count_item(const int &i) const
+{
+  d_npartner[i] = 0;
+}
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::pre_exchange_item(const int &ii) const
+{
+  const int i = d_ilist[ii];
+  const int jnum = d_numneigh[i];
+
+  for (int jj = 0; jj < jnum; jj++) {
+    if (d_firstflag(i,jj)) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      int m = Kokkos::atomic_fetch_add(&d_npartner[i],1);
+      if (m < maxpartner) {
+	d_partner(i,m) = tag[j];
+	for (int k = 0; k < dnum; k++)
+	  d_valuepartner(i,dnum*m+k) = d_firstvalue(i,dnum*jj+k);
+      } else {
+	d_resize() = 1;
+      }
+      if (j < nlocal_neigh) {
+	m = Kokkos::atomic_fetch_add(&d_npartner[j],1);
+	if (m < maxpartner) {
+	  d_partner(j,m) = tag[i];
+	  for (int k = 0; k < dnum; k++)
+	    d_valuepartner(j,dnum*m+k) = d_firstvalue(i,dnum*jj+k);
+	} else {
+	  d_resize() = 1;
+	}
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::setup_post_neighbor()
+{
+  post_neighbor();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::post_neighbor()
+{
+  tag = atomKK->k_tag.view<DeviceType>();
+  
+  int inum = pair->list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(pair->list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  // store atom counts used for new neighbor list which was just built
+
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+  nlocal_neigh = nlocal;
+  nall_neigh = nall;
+
+  // realloc firstflag and firstvalue if needed
+
+  if (maxatom < nlocal || k_list->maxneighs > d_firstflag.extent(1)) {
+    maxatom = nall;
+    d_firstflag = typename ArrayTypes<DeviceType>::t_int_2d("neighbor_history:firstflag",maxatom,k_list->maxneighs);
+    d_firstvalue = typename ArrayTypes<DeviceType>::t_float_2d("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum);
+  }
+
+  copymode = 1;
+  
+  FixNeighHistoryKokkosPostNeighborFunctor<DeviceType> f(this);
+  Kokkos::parallel_for(inum,f);
+
+  copymode = 0;
+
+#ifdef LMP_KOKKOS_DEBUG
+  typename ArrayTypes<LMPHostType>::t_neighbors_2d h_neighbors("neighbor_history:h_neighbor",k_list->d_neighbors.extent(0), k_list->d_neighbors.extent(1));
+  deep_copy(h_neighbors, d_neighbors);
+
+  typename ArrayTypes<LMPHostType>::t_int_1d h_numneigh("neighbor_history:h_numneigh",k_list->d_numneigh.extent(0));
+  deep_copy(h_numneigh, d_numneigh);
+  
+  typename ArrayTypes<LMPHostType>::t_int_2d h_firstflag("neighbor_history:h_firstflag",maxatom,k_list->maxneighs);
+  deep_copy(h_firstflag, d_firstflag);
+
+  typename ArrayTypes<LMPHostType>::t_float_2d h_firstvalue("neighbor_history:h_firstvalue",maxatom,k_list->maxneighs*dnum);
+  deep_copy(h_firstvalue, d_firstvalue);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixNeighHistoryKokkos<DeviceType>::post_neighbor_item(const int &ii) const
+{
+  const int i = d_ilist[ii];
+  const int jnum = d_numneigh[i];
+  const int np = d_npartner[i];
+
+  for (int jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    const int rflag = j >> SBBITS & 3;
+    j &= NEIGHMASK;
+
+    int m;
+    if (rflag) {
+      int jtag = tag(j);
+      for (m = 0; m < np; m++)
+	if (d_partner(i, m) == jtag) break;
+      if (m < np) {
+	d_firstflag(i,jj) = 1;
+	for (int k = 0; k < dnum; k++) {
+	  d_firstvalue(i, dnum*jj+k) = d_valuepartner(i, dnum*m+k);
+	}
+      } else {
+	d_firstflag(i,jj) = 0;
+	for (int k = 0; k < dnum; k++) {
+	  d_firstvalue(i, dnum*jj+k) = 0;
+	}
+      }
+    } else {
+      d_firstflag(i,jj) = 0;
+      for (int k = 0; k < dnum; k++) {
+	d_firstvalue(i, dnum*jj+k) = 0;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local atom-based arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double FixNeighHistoryKokkos<DeviceType>::memory_usage()
+{
+  double bytes = d_firstflag.extent(0)*d_firstflag.extent(1)*sizeof(int);
+  bytes += d_firstvalue.extent(0)*d_firstvalue.extent(1)*sizeof(double);
+  bytes += 2*k_npartner.extent(0)*sizeof(int);
+  bytes += 2*k_partner.extent(0)*k_partner.extent(1)*sizeof(int);
+  bytes += 2*k_valuepartner.extent(0)*k_valuepartner.extent(1)*sizeof(double);
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+   allocate fictitious charge arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::grow_arrays(int nmax)
+{
+  k_npartner.template sync<LMPHostType>(); // force reallocation on host
+  k_partner.template sync<LMPHostType>();
+  k_valuepartner.template sync<LMPHostType>();
+  
+  memoryKK->grow_kokkos(k_npartner,npartner,nmax,"neighbor_history:npartner");
+  memoryKK->grow_kokkos(k_partner,partner,nmax,maxpartner,"neighbor_history:partner");
+  memoryKK->grow_kokkos(k_valuepartner,valuepartner,nmax,dnum*maxpartner,"neighbor_history:valuepartner");
+
+  d_npartner = k_npartner.template view<DeviceType>();
+  d_partner = k_partner.template view<DeviceType>();
+  d_valuepartner = k_valuepartner.template view<DeviceType>();
+
+  k_npartner.template modify<LMPHostType>();
+  k_partner.template modify<LMPHostType>();
+  k_valuepartner.template modify<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   copy values within fictitious charge arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixNeighHistoryKokkos<DeviceType>::copy_arrays(int i, int j, int delflag)
+{
+  k_npartner.template sync<LMPHostType>();
+  k_partner.template sync<LMPHostType>();
+  k_valuepartner.template sync<LMPHostType>();
+
+  npartner[j] = npartner[i];
+  for (int m = 0; m < npartner[i]; m++) {
+    partner[j][m] = partner[i][m];
+    valuepartner[j][m] = valuepartner[i][m];
+  }
+
+  k_npartner.template modify<LMPHostType>();
+  k_partner.template modify<LMPHostType>();
+  k_valuepartner.template modify<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   pack values in local atom-based array for exchange with another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixNeighHistoryKokkos<DeviceType>::pack_exchange(int i, double *buf)
+{
+  k_npartner.template sync<LMPHostType>();
+  k_partner.template sync<LMPHostType>();
+  k_valuepartner.template sync<LMPHostType>();
+
+  int n = 0;
+  buf[n++] = npartner[i];
+  for (int m = 0; m < npartner[i]; m++) buf[n++] = partner[i][m];
+  for (int m = 0; m < dnum*npartner[i]; m++) buf[n++] = valuepartner[i][m];
+
+  return n;
+}
+
+/* ----------------------------------------------------------------------
+   unpack values in local atom-based array from exchange with another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixNeighHistoryKokkos<DeviceType>::unpack_exchange(int nlocal, double *buf)
+{
+  int n = 0;
+  npartner[nlocal] = static_cast<int>(buf[n++]);
+  for (int m = 0; m < npartner[nlocal]; m++) partner[nlocal][m] = static_cast<int>(buf[n++]);
+  for (int m = 0; m < dnum*npartner[nlocal]; m++) valuepartner[nlocal][m] = buf[n++];
+
+  k_npartner.template modify<LMPHostType>();
+  k_partner.template modify<LMPHostType>();
+  k_valuepartner.template modify<LMPHostType>();
+
+  return n;
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class FixNeighHistoryKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixNeighHistoryKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h
new file mode 100644
index 0000000000000000000000000000000000000000..68bd59722f53d91d3b0125a3b859487f1e5addd6
--- /dev/null
+++ b/src/KOKKOS/fix_neigh_history_kokkos.h
@@ -0,0 +1,107 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(NEIGH_HISTORY/KK,FixNeighHistoryKokkos<LMPDeviceType>)
+FixStyle(NEIGH_HISTORY/KK/DEVICE,FixNeighHistoryKokkos<LMPDeviceType>)
+FixStyle(NEIGH_HISTORY/KK/HOST,FixNeighHistoryKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_NEIGH_HISTORY_KOKKOS_H
+#define LMP_FIX_NEIGH_HISTORY_KOKKOS_H
+
+#include "fix_neigh_history.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+template <class DeviceType>
+class FixNeighHistoryKokkos : public FixNeighHistory {
+ public:
+  FixNeighHistoryKokkos(class LAMMPS *, int, char **);
+  ~FixNeighHistoryKokkos();
+
+  void init();
+  void pre_exchange();
+  void setup_post_neighbor();
+  virtual void post_neighbor();
+  double memory_usage();
+  void grow_arrays(int);
+  void copy_arrays(int, int, int);
+  int pack_exchange(int, double *);
+  int unpack_exchange(int, double *);
+
+  KOKKOS_INLINE_FUNCTION
+  void zero_partner_count_item(const int &i) const;
+  KOKKOS_INLINE_FUNCTION
+  void pre_exchange_item(const int &ii) const;
+  KOKKOS_INLINE_FUNCTION
+  void post_neighbor_item(const int &ii) const;
+
+  typename ArrayTypes<DeviceType>::t_int_2d d_firstflag;
+  typename ArrayTypes<DeviceType>::t_float_2d d_firstvalue;
+
+ private:
+  typename ArrayTypes<DeviceType>::tdual_int_1d k_npartner;
+  typename ArrayTypes<DeviceType>::tdual_int_2d k_partner;
+  typename ArrayTypes<DeviceType>::tdual_float_2d k_valuepartner;
+
+  // for neighbor list lookup
+  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread d_ilist;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread d_numneigh;
+  
+  typename ArrayTypes<DeviceType>::t_tagint_1d tag;
+  typename ArrayTypes<DeviceType>::t_int_1d d_npartner;
+  typename ArrayTypes<DeviceType>::t_int_2d d_partner;
+  typename ArrayTypes<DeviceType>::t_float_2d d_valuepartner; 
+
+  typename ArrayTypes<DeviceType>::t_int_scalar d_resize;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
+};
+
+template <class DeviceType>
+struct FixNeighHistoryKokkosZeroPartnerCountFunctor {
+  FixNeighHistoryKokkos<DeviceType> c;
+  FixNeighHistoryKokkosZeroPartnerCountFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int &i) const {
+    c.zero_partner_count_item(i);
+  }
+};
+
+template <class DeviceType>
+struct FixNeighHistoryKokkosPreExchangeFunctor {
+  FixNeighHistoryKokkos<DeviceType> c;
+  FixNeighHistoryKokkosPreExchangeFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    c.pre_exchange_item(i);
+  }
+};
+
+template <class DeviceType>
+struct FixNeighHistoryKokkosPostNeighborFunctor {
+  FixNeighHistoryKokkos<DeviceType> c;
+  FixNeighHistoryKokkosPostNeighborFunctor(FixNeighHistoryKokkos<DeviceType> *c_ptr): c(*c_ptr) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &i) const {
+    c.post_neighbor_item(i);
+  }
+};
+
+} // namespace LAMMPS_NS
+
+#endif // LMP_FIX_NEIGH_HISTORY_KOKKOS_H
+#endif // FIX_CLASS
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 4411627a784f01329cb35da5afbb9457597d45e1..c50d6305e7353b082ef16aa2ca2774bea6a0a414 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -24,8 +24,8 @@ namespace LAMMPS_NS {
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
-NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::NPairKokkos(LAMMPS *lmp) : NPair(lmp) {
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
+NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::NPairKokkos(LAMMPS *lmp) : NPair(lmp) {
 
 }
 
@@ -33,8 +33,8 @@ NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::NPairKokkos(LAMMPS *lmp) : NPair(l
    copy needed info from Neighbor class to this build class
    ------------------------------------------------------------------------- */
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
-void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_neighbor_info()
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
+void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::copy_neighbor_info()
 {
   NPair::copy_neighbor_info();
 
@@ -63,8 +63,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_neighbor_info()
  copy per-atom and per-bin vectors from NBin class to this build class
  ------------------------------------------------------------------------- */
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
-void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
+void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::copy_bin_info()
 {
   NPair::copy_bin_info();
 
@@ -80,8 +80,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
  copy needed info from NStencil class to this build class
  ------------------------------------------------------------------------- */
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
-void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
+void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::copy_stencil_info()
 {
   NPair::copy_stencil_info();
 
@@ -110,8 +110,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
-void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
+void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
 {
   NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
   const int nlocal = includegroup?atom->nfirst:atom->nlocal;
@@ -131,6 +131,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
          k_stencilxyz.view<DeviceType>(),
          nlocal,
          atomKK->k_x.view<DeviceType>(),
+         atomKK->k_radius.view<DeviceType>(),
          atomKK->k_type.view<DeviceType>(),
          atomKK->k_mask.view<DeviceType>(),
          atomKK->k_molecule.view<DeviceType>(),
@@ -155,7 +156,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
          k_ex_mol_intra.view<DeviceType>(),
          bboxhi,bboxlo,
          domain->xperiodic,domain->yperiodic,domain->zperiodic,
-         domain->xprd_half,domain->yprd_half,domain->zprd_half);
+         domain->xprd_half,domain->yprd_half,domain->zprd_half,
+	 skin);
 
   k_cutneighsq.sync<DeviceType>();
   k_ex1_type.sync<DeviceType>();
@@ -171,7 +173,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   k_bincount.sync<DeviceType>();
   k_bins.sync<DeviceType>();
   k_atom2bin.sync<DeviceType>();
-  atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
+  atomKK->sync(Device,X_MASK|RADIUS_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
 
   data.special_flag[0] = special_flag[0];
   data.special_flag[1] = special_flag[1];
@@ -198,25 +200,49 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
       Kokkos::parallel_for(nall, f);
     } else {
       if (newton_pair) {
-        NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+	if (SIZE) {
+	  NPairKokkosBuildFunctorSize<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
 #ifdef KOKKOS_HAVE_CUDA
-        if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
-          Kokkos::parallel_for(config, f);
-        else
-          Kokkos::parallel_for(nall, f);
+	  if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+	    Kokkos::parallel_for(config, f);
+	  else
+	    Kokkos::parallel_for(nall, f);
 #else
-        Kokkos::parallel_for(nall, f);
+	  Kokkos::parallel_for(nall, f);
 #endif
+	} else {
+	  NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+#ifdef KOKKOS_HAVE_CUDA
+	  if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+	    Kokkos::parallel_for(config, f);
+	  else
+	    Kokkos::parallel_for(nall, f);
+#else
+	  Kokkos::parallel_for(nall, f);
+#endif
+	}
       } else {
-        NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+	if (SIZE) {
+	  NPairKokkosBuildFunctorSize<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
 #ifdef KOKKOS_HAVE_CUDA
-        if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
-          Kokkos::parallel_for(config, f);
-        else
-          Kokkos::parallel_for(nall, f);
+	  if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+	    Kokkos::parallel_for(config, f);
+	  else
+	    Kokkos::parallel_for(nall, f);
 #else
-        Kokkos::parallel_for(nall, f);
+	  Kokkos::parallel_for(nall, f);
 #endif
+	} else {
+	  NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+#ifdef KOKKOS_HAVE_CUDA
+	  if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+	    Kokkos::parallel_for(config, f);
+	  else
+	    Kokkos::parallel_for(nall, f);
+#else
+	  Kokkos::parallel_for(nall, f);
+#endif
+	}
       }
     }
     deep_copy(data.h_resize, data.resize);
@@ -774,19 +800,307 @@ void NeighborKokkosExecute<DeviceType>::
   neigh_list.d_ilist(i) = i;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType> template<int HalfNeigh,int Newton,int Tri>
+void NeighborKokkosExecute<DeviceType>::
+   build_ItemSize(const int &i) const
+{
+  /* if necessary, goto next page and add pages */
+  int n = 0;
+  int which = 0;
+
+  // get subview of neighbors of i
+
+  const AtomNeighbors neighbors_i = neigh_list.get_neighbors(i);
+  const X_FLOAT xtmp = x(i, 0);
+  const X_FLOAT ytmp = x(i, 1);
+  const X_FLOAT ztmp = x(i, 2);
+  const X_FLOAT radi = radius(i);
+  const int itype = type(i);
+
+  const int ibin = c_atom2bin(i);
+
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
+    = d_stencil;
+
+  const int mask_history = 3 << SBBITS;
+
+  // loop over all bins in neighborhood (includes ibin)
+  if(HalfNeigh)
+  for(int m = 0; m < c_bincount(ibin); m++) {
+    const int j = c_bins(ibin,m);
+    const int jtype = type(j);
+
+    //for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using HalfNeighborlists
+    if((j == i) || (HalfNeigh && !Newton && (j < i))  ||
+        (HalfNeigh && Newton && ((j < i) || ((j >= nlocal) &&
+                                       ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) ||
+                                        (x(j, 2) == ztmp && x(j, 1)  == ytmp && x(j, 0) < xtmp)))))
+      ) continue;
+    if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+    const X_FLOAT delx = xtmp - x(j, 0);
+    const X_FLOAT dely = ytmp - x(j, 1);
+    const X_FLOAT delz = ztmp - x(j, 2);
+    const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+    const X_FLOAT radsum = radi + radius(j);
+    const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
+
+    if(rsq <= cutsq) {
+      if(n<neigh_list.maxneighs) {
+	if(neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
+	else neighbors_i(n++) = j;
+      }
+      else n++;
+    }
+  }
+
+  for(int k = 0; k < nstencil; k++) {
+    const int jbin = ibin + stencil[k];
+
+    // get subview of jbin
+    if(HalfNeigh && (ibin==jbin)) continue;
+    //const ArrayTypes<DeviceType>::t_int_1d_const_um =Kokkos::subview<t_int_1d_const_um>(bins,jbin,ALL);
+    for(int m = 0; m < c_bincount(jbin); m++) {
+
+      const int j = c_bins(jbin,m);
+      const int jtype = type(j);
+
+      if(HalfNeigh && !Newton && (j < i)) continue;
+      if(!HalfNeigh && j==i) continue;
+      if(Tri) {
+	if (x(j,2) < ztmp) continue;
+	if (x(j,2) == ztmp) {
+	  if (x(j,1) < ytmp) continue;
+	  if (x(j,1) == ytmp) {
+	    if (x(j,0) < xtmp) continue;
+	    if (x(j,0) == xtmp && j <= i) continue;
+	  }
+	}
+      }
+      if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+      const X_FLOAT radsum = radi + radius(j);
+      const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
+
+      if(rsq <= cutsq) {
+	if(n<neigh_list.maxneighs) {
+	  if(neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
+	  else neighbors_i(n++) = j;
+	}
+	else n++;
+      }
+    }
+  }
+
+  neigh_list.d_numneigh(i) = n;
+
+  if(n > neigh_list.maxneighs) {
+    resize() = 1;
+
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
+  }
+
+  neigh_list.d_ilist(i) = i;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef KOKKOS_HAVE_CUDA
+template<class DeviceType> template<int HalfNeigh,int Newton,int Tri>
+__device__ inline
+void NeighborKokkosExecute<DeviceType>::build_ItemSizeCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const
+{
+  /* loop over atoms in i's bin,
+   */
+  const int atoms_per_bin = c_bins.extent(1);
+  const int BINS_PER_TEAM = dev.team_size()/atoms_per_bin<1?1:dev.team_size()/atoms_per_bin;
+  const int TEAMS_PER_BIN = atoms_per_bin/dev.team_size()<1?1:atoms_per_bin/dev.team_size();
+  const int MY_BIN = dev.team_rank()/atoms_per_bin;
+
+  const int ibin = dev.league_rank()*BINS_PER_TEAM+MY_BIN;
+
+  if(ibin >=c_bincount.extent(0)) return;
+  X_FLOAT* other_x = sharedmem;
+  other_x = other_x + 6*atoms_per_bin*MY_BIN;
+
+  int* other_id = (int*) &other_x[5 * atoms_per_bin];
+
+  int bincount_current = c_bincount[ibin];
+
+  for(int kk = 0; kk < TEAMS_PER_BIN; kk++) {
+    const int MY_II = dev.team_rank()%atoms_per_bin+kk*dev.team_size();
+    const int i = MY_II < bincount_current ? c_bins(ibin, MY_II) : -1;
+    /* if necessary, goto next page and add pages */
+
+    int n = 0;
+
+    X_FLOAT xtmp;
+    X_FLOAT ytmp;
+    X_FLOAT ztmp;
+    X_FLOAT radi;
+    int itype;
+    const AtomNeighbors neighbors_i = neigh_list.get_neighbors((i>=0&&i<nlocal)?i:0);
+    const int mask_history = 3 << SBBITS;
+
+    if(i >= 0) {
+      xtmp = x(i, 0);
+      ytmp = x(i, 1);
+      ztmp = x(i, 2);
+      radi = radius(i);
+      itype = type(i);
+      other_x[MY_II] = xtmp;
+      other_x[MY_II + atoms_per_bin] = ytmp;
+      other_x[MY_II + 2 * atoms_per_bin] = ztmp;
+      other_x[MY_II + 3 * atoms_per_bin] = itype;
+      other_x[MY_II + 4 * atoms_per_bin] = radi;
+    }
+    other_id[MY_II] = i;
+    int test = (__syncthreads_count(i >= 0 && i <= nlocal) == 0);
+
+    if(test) return;
+
+    if(i >= 0 && i < nlocal) {
+      #pragma unroll 4
+      for(int m = 0; m < bincount_current; m++) {
+	int j = other_id[m];
+	const int jtype = other_x[m + 3 * atoms_per_bin];
+
+	//for same bin as atom i skip j if i==j and skip atoms "below and to the left" if using halfneighborlists
+	if((j == i) ||
+	   (HalfNeigh && !Newton && (j < i))  ||
+	   (HalfNeigh && Newton &&
+            ((j < i) ||
+	     ((j >= nlocal) && ((x(j, 2) < ztmp) || (x(j, 2) == ztmp && x(j, 1) < ytmp) ||
+				(x(j, 2) == ztmp && x(j, 1)  == ytmp && x(j, 0) < xtmp)))))
+	   ) continue;
+        if(Tri) {
+          if (x(j,2) < ztmp) continue;
+          if (x(j,2) == ztmp) {
+            if (x(j,1) < ytmp) continue;
+            if (x(j,1) == ytmp) {
+              if (x(j,0) < xtmp) continue;
+              if (x(j,0) == xtmp && j <= i) continue;
+            }
+          }
+        }
+	if(exclude && exclusion(i,j,itype,jtype)) continue;
+	const X_FLOAT delx = xtmp - other_x[m];
+	const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
+	const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
+	const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+	const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin];
+	const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
+
+	if(rsq <= cutsq) {
+	  if(n<neigh_list.maxneighs) {
+	    if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
+	    else neighbors_i(n++) = j;
+	  }
+	  else n++;
+	}
+      }
+    }
+    __syncthreads();
+
+    const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
+      = d_stencil;
+    for(int k = 0; k < nstencil; k++) {
+      const int jbin = ibin + stencil[k];
+
+      if(ibin == jbin) continue;
+
+      bincount_current = c_bincount[jbin];
+      int j = MY_II < bincount_current ? c_bins(jbin, MY_II) : -1;
+
+      if(j >= 0) {
+	other_x[MY_II] = x(j, 0);
+	other_x[MY_II + atoms_per_bin] = x(j, 1);
+	other_x[MY_II + 2 * atoms_per_bin] = x(j, 2);
+	other_x[MY_II + 3 * atoms_per_bin] = type(j);
+	other_x[MY_II + 4 * atoms_per_bin] = radius(j);
+      }
+
+      other_id[MY_II] = j;
+
+      __syncthreads();
+
+      if(i >= 0 && i < nlocal) {
+        #pragma unroll 8
+	for(int m = 0; m < bincount_current; m++) {
+	  const int j = other_id[m];
+	  const int jtype = other_x[m + 3 * atoms_per_bin];
+
+	  if(HalfNeigh && (j < i))  continue;
+	  if(HalfNeigh && !Newton && (j < i)) continue;
+	  if(!HalfNeigh && j==i) continue;
+	  if(Tri) {
+	    if (x(j,2) < ztmp) continue;
+	    if (x(j,2) == ztmp) {
+	      if (x(j,1) < ytmp) continue;
+	      if (x(j,1) == ytmp) {
+		if (x(j,0) < xtmp) continue;
+		if (x(j,0) == xtmp && j <= i) continue;
+	      }
+	    }
+	  }
+	  if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+	  const X_FLOAT delx = xtmp - other_x[m];
+	  const X_FLOAT dely = ytmp - other_x[m + atoms_per_bin];
+	  const X_FLOAT delz = ztmp - other_x[m + 2 * atoms_per_bin];
+	  const X_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+	  const X_FLOAT radsum = radi + other_x[m + 4 * atoms_per_bin];
+	  const X_FLOAT cutsq = (radsum + skin) * (radsum + skin);
+
+	  if(rsq <= cutsq) {
+	    if(n<neigh_list.maxneighs) {
+	      if (neigh_list.history && rsq < radsum*radsum) neighbors_i(n++) = j ^ mask_history;
+	      else neighbors_i(n++) = j;
+	    }
+	    else n++;
+	  }
+	}
+      }
+      __syncthreads();
+    }
+
+    if(i >= 0 && i < nlocal) {
+      neigh_list.d_numneigh(i) = n;
+      neigh_list.d_ilist(i) = i;
+    }
+
+    if(n > neigh_list.maxneighs) {
+      resize() = 1;
+
+      if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
+    }
+  }
+}
+#endif
+
 }
 
 namespace LAMMPS_NS {
-template class NPairKokkos<LMPDeviceType,0,0,0>;
-template class NPairKokkos<LMPDeviceType,0,1,0>;
-template class NPairKokkos<LMPDeviceType,1,0,0>;
-template class NPairKokkos<LMPDeviceType,1,1,0>;
-template class NPairKokkos<LMPDeviceType,1,0,1>;
+template class NPairKokkos<LMPDeviceType,0,0,0,0>;
+template class NPairKokkos<LMPDeviceType,0,1,0,0>;
+template class NPairKokkos<LMPDeviceType,1,0,0,0>;
+template class NPairKokkos<LMPDeviceType,1,1,0,0>;
+template class NPairKokkos<LMPDeviceType,1,0,1,0>;
+template class NPairKokkos<LMPDeviceType,1,0,0,1>;
+template class NPairKokkos<LMPDeviceType,1,0,1,1>;
 #ifdef KOKKOS_HAVE_CUDA
-template class NPairKokkos<LMPHostType,0,0,0>;
-template class NPairKokkos<LMPHostType,0,1,0>;
-template class NPairKokkos<LMPHostType,1,0,0>;
-template class NPairKokkos<LMPHostType,1,1,0>;
-template class NPairKokkos<LMPHostType,1,0,1>;
+template class NPairKokkos<LMPHostType,0,0,0,0>;
+template class NPairKokkos<LMPHostType,0,1,0,0>;
+template class NPairKokkos<LMPHostType,1,0,0,0>;
+template class NPairKokkos<LMPHostType,1,1,0,0>;
+template class NPairKokkos<LMPHostType,1,0,1,0>;
+template class NPairKokkos<LMPHostType,1,0,0,1>;
+template class NPairKokkos<LMPHostType,1,0,1,1>;
 #endif
 }
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index 6c1c0e958b49dcd09ff7bb6b32efca67b2d3f6a6..a8837fa9f35d7e4480fa3a42ebd1558e7a3d1a57 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -13,56 +13,76 @@
 
 #ifdef NPAIR_CLASS
 
-typedef NPairKokkos<LMPHostType,0,0,0> NPairKokkosFullBinHost;
+typedef NPairKokkos<LMPHostType,0,0,0,0> NPairKokkosFullBinHost;
 NPairStyle(full/bin/kk/host,
            NPairKokkosFullBinHost,
            NP_FULL | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI)
 
-typedef NPairKokkos<LMPDeviceType,0,0,0> NPairKokkosFullBinDevice;
+typedef NPairKokkos<LMPDeviceType,0,0,0,0> NPairKokkosFullBinDevice;
 NPairStyle(full/bin/kk/device,
            NPairKokkosFullBinDevice,
            NP_FULL | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI)
 
-typedef NPairKokkos<LMPHostType,0,1,0> NPairKokkosFullBinGhostHost;
+typedef NPairKokkos<LMPHostType,0,1,0,0> NPairKokkosFullBinGhostHost;
 NPairStyle(full/bin/ghost/kk/host,
            NPairKokkosFullBinGhostHost,
            NP_FULL | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_GHOST | NP_ORTHO | NP_TRI)
 
-typedef NPairKokkos<LMPDeviceType,0,1,0> NPairKokkosFullBinGhostDevice;
+typedef NPairKokkos<LMPDeviceType,0,1,0,0> NPairKokkosFullBinGhostDevice;
 NPairStyle(full/bin/ghost/kk/device,
            NPairKokkosFullBinGhostDevice,
            NP_FULL | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_GHOST | NP_ORTHO | NP_TRI)
 
-typedef NPairKokkos<LMPHostType,1,0,0> NPairKokkosHalfBinHost;
+typedef NPairKokkos<LMPHostType,1,0,0,0> NPairKokkosHalfBinHost;
 NPairStyle(half/bin/kk/host,
            NPairKokkosHalfBinHost,
            NP_HALF | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_ORTHO)
 
-typedef NPairKokkos<LMPDeviceType,1,0,0> NPairKokkosHalfBinDevice;
+typedef NPairKokkos<LMPDeviceType,1,0,0,0> NPairKokkosHalfBinDevice;
 NPairStyle(half/bin/kk/device,
            NPairKokkosHalfBinDevice,
            NP_HALF | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_ORTHO)
 
-typedef NPairKokkos<LMPHostType,1,0,1> NPairKokkosHalfBinHostTri;
+typedef NPairKokkos<LMPHostType,1,0,1,0> NPairKokkosHalfBinHostTri;
 NPairStyle(half/bin/kk/host,
            NPairKokkosHalfBinHostTri,
            NP_HALF | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_TRI)
 
-typedef NPairKokkos<LMPDeviceType,1,0,1> NPairKokkosHalfBinDeviceTri;
+typedef NPairKokkos<LMPDeviceType,1,0,1,0> NPairKokkosHalfBinDeviceTri;
 NPairStyle(half/bin/kk/device,
            NPairKokkosHalfBinDeviceTri,
            NP_HALF | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_TRI)
 
-typedef NPairKokkos<LMPHostType,1,1,0> NPairKokkosHalfBinGhostHost;
+typedef NPairKokkos<LMPHostType,1,1,0,0> NPairKokkosHalfBinGhostHost;
 NPairStyle(half/bin/ghost/kk/host,
            NPairKokkosHalfBinGhostHost,
            NP_HALF | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_GHOST | NP_ORTHO | NP_TRI)
 
-typedef NPairKokkos<LMPDeviceType,1,1,0> NPairKokkosHalfBinGhostDevice;
+typedef NPairKokkos<LMPDeviceType,1,1,0,0> NPairKokkosHalfBinGhostDevice;
 NPairStyle(half/bin/ghost/kk/device,
            NPairKokkosHalfBinGhostDevice,
            NP_HALF | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_GHOST | NP_ORTHO | NP_TRI)
 
+typedef NPairKokkos<LMPHostType,1,0,0,1> NPairKokkosHalfSizeBinHost;
+NPairStyle(half/size/bin/kk/host,
+           NPairKokkosHalfSizeBinHost,
+           NP_HALF | NP_SIZE | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_ORTHO)
+
+typedef NPairKokkos<LMPDeviceType,1,0,0,1> NPairKokkosHalfSizeBinDevice;
+NPairStyle(half/size/bin/kk/device,
+           NPairKokkosHalfSizeBinDevice,
+           NP_HALF | NP_SIZE | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_ORTHO)
+
+typedef NPairKokkos<LMPHostType,1,0,1,1> NPairKokkosHalfSizeBinHostTri;
+NPairStyle(half/size/bin/kk/host,
+           NPairKokkosHalfSizeBinHostTri,
+           NP_HALF | NP_SIZE | NP_BIN | NP_KOKKOS_HOST | NP_NEWTON | NP_NEWTOFF | NP_TRI)
+
+typedef NPairKokkos<LMPDeviceType,1,0,1,1> NPairKokkosHalfSizeBinDeviceTri;
+NPairStyle(half/size/bin/kk/device,
+           NPairKokkosHalfSizeBinDeviceTri,
+           NP_HALF | NP_SIZE | NP_BIN | NP_KOKKOS_DEVICE | NP_NEWTON | NP_NEWTOFF | NP_TRI)
+
 #else
 
 #ifndef LMP_NPAIR_KOKKOS_H
@@ -73,7 +93,7 @@ NPairStyle(half/bin/ghost/kk/device,
 
 namespace LAMMPS_NS {
 
-template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI>
+template<class DeviceType, int HALF_NEIGH, int GHOST, int TRI, int SIZE>
 class NPairKokkos : public NPair {
  public:
   NPairKokkos(class LAMMPS *);
@@ -162,6 +182,7 @@ class NeighborKokkosExecute
   // data from Atom class
 
   const typename AT::t_x_array_randomread x;
+  const typename AT::t_float_1d_randomread radius;
   const typename AT::t_int_1d_const type,mask;
   const typename AT::t_tagint_1d_const molecule;
   const typename AT::t_tagint_1d_const tag;
@@ -188,6 +209,9 @@ class NeighborKokkosExecute
   const int xperiodic, yperiodic, zperiodic;
   const int xprd_half, yprd_half, zprd_half;
 
+  // GRANULAR required member variables
+  const X_FLOAT skin;
+
   NeighborKokkosExecute(
                         const NeighListKokkos<DeviceType> &_neigh_list,
                         const typename AT::t_xfloat_2d_randomread &_cutneighsq,
@@ -199,6 +223,7 @@ class NeighborKokkosExecute
                         const typename AT::t_int_1d_3 &_d_stencilxyz,
                         const int _nlocal,
                         const typename AT::t_x_array_randomread &_x,
+			const typename AT::t_float_1d_randomread &_radius,
                         const typename AT::t_int_1d_const &_type,
                         const typename AT::t_int_1d_const &_mask,
                         const typename AT::t_tagint_1d_const &_molecule,
@@ -225,13 +250,14 @@ class NeighborKokkosExecute
                         const typename AT::t_int_1d_const & _ex_mol_intra,
                         const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo,
                         const int & _xperiodic, const int & _yperiodic, const int & _zperiodic,
-                        const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
+                        const int & _xprd_half, const int & _yprd_half, const int & _zprd_half,
+			const X_FLOAT _skin):
     neigh_list(_neigh_list), cutneighsq(_cutneighsq),
     bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
     atom2bin(_atom2bin),c_atom2bin(_atom2bin),
     nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
     nlocal(_nlocal),
-    x(_x),type(_type),mask(_mask),molecule(_molecule),
+    x(_x),radius(_radius),type(_type),mask(_mask),molecule(_molecule),
     tag(_tag),special(_special),nspecial(_nspecial),molecular(_molecular),
     nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz),
     mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz),
@@ -245,7 +271,8 @@ class NeighborKokkosExecute
     ex_mol_group(_ex_mol_group),ex_mol_bit(_ex_mol_bit),
     ex_mol_intra(_ex_mol_intra),
     xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
-    xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half) {
+    xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half),
+    skin(_skin) {
 
     if (molecular == 2) moltemplate = 1;
     else moltemplate = 0;
@@ -280,10 +307,18 @@ class NeighborKokkosExecute
   KOKKOS_FUNCTION
   void build_Item_Ghost(const int &i) const;
 
+  template<int HalfNeigh, int Newton, int Tri>
+  KOKKOS_FUNCTION
+  void build_ItemSize(const int &i) const;
+
 #ifdef KOKKOS_HAVE_CUDA
   template<int HalfNeigh, int Newton, int Tri>
   __device__ inline
   void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
+
+  template<int HalfNeigh, int Newton, int Tri>
+  __device__ inline
+  void build_ItemSizeCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif
 
   KOKKOS_INLINE_FUNCTION
@@ -399,6 +434,48 @@ struct NPairKokkosBuildFunctorGhost {
   }
 };
 
+template <class DeviceType, int HALF_NEIGH, int GHOST_NEWTON, int TRI>
+struct NPairKokkosBuildFunctorSize {
+  typedef DeviceType device_type;
+
+  const NeighborKokkosExecute<DeviceType> c;
+  const size_t sharedsize;
+
+  NPairKokkosBuildFunctorSize(const NeighborKokkosExecute<DeviceType> &_c,
+			      const size_t _sharedsize): c(_c), sharedsize(_sharedsize) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i) const {
+    c.template build_ItemSize<HALF_NEIGH,GHOST_NEWTON,TRI>(i);
+  }
+
+#ifdef KOKKOS_HAVE_CUDA
+  __device__ inline
+  void operator() (typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const {
+    c.template build_ItemSizeCuda<HALF_NEIGH,GHOST_NEWTON,TRI>(dev);
+  }
+  size_t shmem_size(const int team_size) const { (void) team_size; return sharedsize; }
+#endif
+};
+
+template <int HALF_NEIGH, int GHOST_NEWTON, int TRI>
+struct NPairKokkosBuildFunctorSize<LMPHostType,HALF_NEIGH,GHOST_NEWTON,TRI> {
+  typedef LMPHostType device_type;
+
+  const NeighborKokkosExecute<LMPHostType> c;
+  const size_t sharedsize;
+
+  NPairKokkosBuildFunctorSize(const NeighborKokkosExecute<LMPHostType> &_c,
+			      const size_t _sharedsize): c(_c), sharedsize(_sharedsize) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int & i) const {
+    c.template build_ItemSize<HALF_NEIGH,GHOST_NEWTON,TRI>(i);
+  }
+
+  void operator() (typename Kokkos::TeamPolicy<LMPHostType>::member_type dev) const {} // Should error out
+};
+
 }
 
 #endif
diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7c8f8a99e6a2faacb896ad248a61dc3b148e14a
--- /dev/null
+++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp
@@ -0,0 +1,440 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "pair_gran_hooke_history_kokkos.h"
+#include "kokkos.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "memory_kokkos.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "error.h"
+#include "modify.h"
+#include "fix_neigh_history_kokkos.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairGranHookeHistoryKokkos<DeviceType>::PairGranHookeHistoryKokkos(LAMMPS *lmp) : PairGranHookeHistory(lmp)
+{
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | V_MASK | OMEGA_MASK | F_MASK | TORQUE_MASK | TYPE_MASK | MASK_MASK | ENERGY_MASK | VIRIAL_MASK | RMASS_MASK | RADIUS_MASK;
+  datamask_modify = F_MASK | TORQUE_MASK | ENERGY_MASK | VIRIAL_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairGranHookeHistoryKokkos<DeviceType>::~PairGranHookeHistoryKokkos()
+{
+  if (copymode) return;
+
+  if (allocated) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    eatom = NULL;
+    vatom = NULL;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairGranHookeHistoryKokkos<DeviceType>::init_style()
+{
+  if (history && fix_history == NULL) {
+    char dnumstr[16];
+    sprintf(dnumstr,"%d",3);
+    char **fixarg = new char*[4];
+    fixarg[0] = (char *) "NEIGH_HISTORY";
+    fixarg[1] = (char *) "all";
+    if (execution_space == Device)
+      fixarg[2] = (char *) "NEIGH_HISTORY/KK/DEVICE";
+    else
+      fixarg[2] = (char *) "NEIGH_HISTORY/KK/HOST";
+    fixarg[3] = dnumstr;
+    modify->add_fix(4,fixarg,1);
+    delete [] fixarg;
+    fix_history = (FixNeighHistory *) modify->fix[modify->nfix-1];
+    fix_history->pair = this;
+    fix_historyKK = (FixNeighHistoryKokkos<DeviceType> *)fix_history;
+  }
+
+  PairGranHookeHistory::init_style();
+
+  // irequest = neigh request made by parent class
+
+  neighflag = lmp->kokkos->neighflag;
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+  if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with gran/hooke/history/kk");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairGranHookeHistoryKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  copymode = 1;
+
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  else evflag = vflag_fdotr = 0;
+
+  int shearupdate = 1;
+  if (update->setupflag) shearupdate = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
+  atomKK->sync(execution_space,datamask_read);
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK | TORQUE_MASK);
+
+  x = atomKK->k_x.view<DeviceType>();
+  v = atomKK->k_v.view<DeviceType>();
+  omega = atomKK->k_omega.view<DeviceType>();
+  c_x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  torque = atomKK->k_torque.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  tag = atomKK->k_tag.view<DeviceType>();
+  rmass = atomKK->k_rmass.view<DeviceType>();
+  radius = atomKK->k_radius.view<DeviceType>();
+  nlocal = atom->nlocal;
+  nall = atom->nlocal + atom->nghost;
+  newton_pair = force->newton_pair;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+
+  int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  d_firsttouch = fix_historyKK->d_firstflag;
+  d_firstshear = fix_historyKK->d_firstvalue;
+  
+  EV_FLOAT ev;
+
+  if (lmp->kokkos->neighflag == HALF) {
+    if (force->newton_pair) {
+      if (evflag) {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,1,1,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,1,1,0>>(0,inum),*this);
+	}
+      } else {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,1,0,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,1,0,0>>(0,inum),*this);
+	}
+      }
+    } else {
+      if (evflag) {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,0,1,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,0,1,0>>(0,inum),*this);
+	}
+      } else {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,0,0,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALF,0,0,0>>(0,inum),*this);
+	}
+      }
+    }
+  } else { // HALFTHREAD
+    if (force->newton_pair) {
+      if (evflag) {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,1,1,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,1,1,0>>(0,inum),*this);
+	}
+      } else {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,1,0,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,1,0,0>>(0,inum),*this);
+	}
+      }
+    } else {
+      if (evflag) {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,0,1,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,0,1,0>>(0,inum),*this);
+	}
+      } else {
+	if (shearupdate) {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,0,0,1>>(0,inum),*this);
+	} else {
+	  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairGranHookeHistoryCompute<HALFTHREAD,0,0,0>>(0,inum),*this);
+	}
+      }
+    }
+  }
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int SHEARUPDATE>
+KOKKOS_INLINE_FUNCTION
+void PairGranHookeHistoryKokkos<DeviceType>::operator()(TagPairGranHookeHistoryCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,SHEARUPDATE>, const int &ii) const {
+
+  // The f and torque arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_torque = torque;
+
+  int i = d_ilist[ii];
+  double xtmp = x(i,0);
+  double ytmp = x(i,1);
+  double ztmp = x(i,2);
+  int itype = type[i];
+  double imass = rmass[i];
+  double irad = radius[i];
+  int jnum = d_numneigh[i];
+  
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  double torquex_i = 0.0;
+  double torquey_i = 0.0;
+  double torquez_i = 0.0;
+  
+  for (int jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    j &= NEIGHMASK;
+
+    double delx = xtmp - x(j,0);
+    double dely = ytmp - x(j,1);
+    double delz = ztmp - x(j,2);
+    double rsq = delx*delx + dely*dely + delz*delz;
+    int jtype = type[j];
+    double jmass = rmass[j];
+    double jrad = radius[j];
+    double radsum = irad + jrad;
+
+    // check for touching neighbors
+
+    if (rsq >= radsum * radsum) {
+      d_firsttouch(i,jj) = 0;
+      d_firstshear(i,3*jj) = 0;
+      d_firstshear(i,3*jj+1) = 0;
+      d_firstshear(i,3*jj+2) = 0;
+    } else {
+      double r = sqrt(rsq);
+      double rinv = 1.0/r;
+      double rsqinv = 1/rsq;
+
+      // relative translational velocity
+
+      double vr1 = v(i,0) - v(j,0);
+      double vr2 = v(i,1) - v(j,1);
+      double vr3 = v(i,2) - v(j,2);
+      
+      // normal component
+
+      double vnnr = vr1*delx + vr2*dely + vr3*delz;
+      double vn1 = delx*vnnr * rsqinv;
+      double vn2 = dely*vnnr * rsqinv;
+      double vn3 = delz*vnnr * rsqinv;
+      
+      // tangential component
+      
+      double vt1 = vr1 - vn1;
+      double vt2 = vr2 - vn2;
+      double vt3 = vr3 - vn3;
+
+      // relative rotational velocity
+
+      double wr1 = (irad*omega(i,0) + jrad*omega(j,0)) * rinv;
+      double wr2 = (irad*omega(i,1) + jrad*omega(j,1)) * rinv;
+      double wr3 = (irad*omega(i,2) + jrad*omega(j,2)) * rinv;
+
+      double meff = imass*jmass / (imass+jmass);
+      if (mask[i] & freeze_group_bit) meff = jmass;
+      if (mask[j] & freeze_group_bit) meff = imass;
+
+      double damp = meff*gamman*vnnr*rsqinv;
+      double ccel = kn*(radsum-r)*rinv - damp;
+
+      // relative velocities
+
+      double vtr1 = vt1 - (delz*wr2-dely*wr3);
+      double vtr2 = vt2 - (delx*wr3-delz*wr1);
+      double vtr3 = vt3 - (dely*wr1-delx*wr2);
+      double vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
+      vrel = sqrt(vrel);
+
+      // shear history effects
+
+      d_firsttouch(i,jj) = 1;
+      double shear1 = d_firstshear(i,3*jj);
+      double shear2 = d_firstshear(i,3*jj+1);
+      double shear3 = d_firstshear(i,3*jj+2);
+      if (SHEARUPDATE) {
+  	shear1 += vtr1*dt;
+  	shear2 += vtr2*dt;
+  	shear3 += vtr3*dt;
+      }
+      double shrmag = sqrt(shear1*shear1 + shear2*shear2 +
+  			   shear3*shear3);
+
+      // rotate shear displacements
+
+      double rsht = shear1*delx + shear2*dely + shear3*delz;
+      rsht *= rsqinv;
+      if (SHEARUPDATE) {
+  	shear1 -= rsht*delx;
+  	shear2 -= rsht*dely;
+  	shear3 -= rsht*delz;
+      }
+
+      // tangential forces = shear + tangential velocity damping
+
+      double fs1 = - (kt*shear1 + meff*gammat*vtr1);
+      double fs2 = - (kt*shear2 + meff*gammat*vtr2);
+      double fs3 = - (kt*shear3 + meff*gammat*vtr3);
+
+      // rescale frictional displacements and forces if needed
+
+      double fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
+      double fn = xmu * fabs(ccel*r);
+
+      if (fs > fn) {
+  	if (shrmag != 0.0) {
+  	  shear1 = (fn/fs) * (shear1 + meff*gammat*vtr1/kt) -
+  	    meff*gammat*vtr1/kt;
+  	  shear2 = (fn/fs) * (shear2 + meff*gammat*vtr2/kt) -
+  	    meff*gammat*vtr2/kt;
+  	  shear3 = (fn/fs) * (shear3 + meff*gammat*vtr3/kt) -
+  	    meff*gammat*vtr3/kt;
+  	  fs1 *= fn/fs;
+  	  fs2 *= fn/fs;
+  	  fs3 *= fn/fs;
+  	} else fs1 = fs2 = fs3 = 0.0;
+      }
+
+      if (SHEARUPDATE) {
+  	d_firstshear(i,3*jj) = shear1;
+  	d_firstshear(i,3*jj+1) = shear2;
+  	d_firstshear(i,3*jj+2) = shear3;
+      }
+
+      // forces & torques
+
+      double fx = delx*ccel + fs1;
+      double fy = dely*ccel + fs2;
+      double fz = delz*ccel + fs3;
+      fx_i += fx;
+      fy_i += fy;
+      fz_i += fz;
+
+      double tor1 = rinv * (dely*fs3 - delz*fs2);
+      double tor2 = rinv * (delz*fs1 - delx*fs3);
+      double tor3 = rinv * (delx*fs2 - dely*fs1);
+      torquex_i -= irad*tor1;
+      torquey_i -= irad*tor2;
+      torquez_i -= irad*tor3;
+      
+      if (NEWTON_PAIR || j < nlocal) {
+        a_f(j,0) -= fx;
+        a_f(j,1) -= fy;
+        a_f(j,2) -= fz;
+  	a_torque(j,0) -= jrad*tor1;
+  	a_torque(j,1) -= jrad*tor2;
+  	a_torque(j,2) -= jrad*tor3;
+      }
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+  a_torque(i,0) += torquex_i;
+  a_torque(i,1) += torquey_i;
+  a_torque(i,2) += torquez_i;
+}
+
+namespace LAMMPS_NS {
+template class PairGranHookeHistoryKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class PairGranHookeHistoryKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.h b/src/KOKKOS/pair_gran_hooke_history_kokkos.h
new file mode 100644
index 0000000000000000000000000000000000000000..84dc4120781681f33db4c465cdab4200677967e1
--- /dev/null
+++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.h
@@ -0,0 +1,127 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/history/kk,PairGranHookeHistoryKokkos<LMPDeviceType>)
+PairStyle(gran/hooke/history/kk/device,PairGranHookeHistoryKokkos<LMPDeviceType>)
+PairStyle(gran/hooke/history/kk/host,PairGranHookeHistoryKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_GRAN_HOOKE_HISTORY_KOKKOS_H
+#define LMP_PAIR_GRAN_HOOKE_HISTORY_KOKKOS_H
+
+#include "pair_gran_hooke_history.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template <class DeviceType>
+class FixNeighHistoryKokkos;
+  
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int SHEARUPDATE>
+struct TagPairGranHookeHistoryCompute{};
+
+template <class DeviceType>
+class PairGranHookeHistoryKokkos : public PairGranHookeHistory {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  PairGranHookeHistoryKokkos(class LAMMPS *);
+  virtual ~PairGranHookeHistoryKokkos();
+  virtual void compute(int, int);
+  void init_style();
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int SHEARUPDATE>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairGranHookeHistoryCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,SHEARUPDATE>, const int&) const;
+
+ protected:
+  typename AT::t_x_array_randomread x;
+  typename AT::t_x_array c_x;
+  typename AT::t_v_array_randomread v;
+  typename AT::t_v_array_randomread omega;
+  typename AT::t_f_array f;
+  typename AT::t_f_array torque;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_int_1d_randomread mask;
+  typename AT::t_float_1d_randomread rmass;
+  typename AT::t_float_1d_randomread radius;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+  typename AT::t_tagint_1d tag;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  typename AT::t_int_2d d_firsttouch;
+  typename AT::t_float_2d d_firstshear;
+  
+  int newton_pair;
+  double special_lj[4];
+
+  int neighflag;
+  int nlocal,nall,eflag,vflag;
+
+  FixNeighHistoryKokkos<DeviceType> *fix_historyKK;
+
+  friend void pair_virial_fdotr_compute<PairGranHookeHistoryKokkos>(PairGranHookeHistoryKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair granular requires atom attributes radius, rmass
+
+The atom style defined does not have these attributes.
+
+E: Pair granular requires ghost atoms store velocity
+
+Use the comm_modify vel yes command to enable this.
+
+E: Could not find pair fix neigh history ID
+
+UNDOCUMENTED
+
+U: Pair granular with shear history requires newton pair off
+
+This is a current restriction of the implementation of pair
+granular styles with history.
+
+U: Could not find pair fix ID
+
+A fix is created internally by the pair style to store shear
+history information.  You cannot delete it.
+
+*/
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index 136e543a2cd708501004d2f94290b21454dabe0a..417de2d94aec73baaa56f14dc7a100d9bed69fa7 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -55,6 +55,20 @@ struct ForceAdder {
 
 /* ---------------------------------------------------------------------- */
 
+template<class View>
+struct Zero {
+  View v;
+  Zero(const View &v_):v(v_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int &i) const {
+    v(i,0) = 0;
+    v(i,1) = 0;
+    v(i,2) = 0;
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
 VerletKokkos::VerletKokkos(LAMMPS *lmp, int narg, char **arg) :
   Verlet(lmp, narg, arg)
 {
@@ -120,6 +134,7 @@ void VerletKokkos::setup(int flag)
   atomKK->modified(Host,ALL_MASK);
 
   neighbor->build(1);
+  modify->setup_post_neighbor();
   neighbor->ncalls = 0;
 
   // compute all forces
@@ -175,7 +190,7 @@ void VerletKokkos::setup(int flag)
   modify->setup(vflag);
   output->setup(flag);
   lmp->kokkos->auto_sync = 1;
-  update->setupflag = 1;
+  update->setupflag = 0;
 }
 
 /* ----------------------------------------------------------------------
@@ -223,6 +238,7 @@ void VerletKokkos::setup_minimal(int flag)
     atomKK->modified(Host,ALL_MASK);
 
     neighbor->build(1);
+    modify->setup_post_neighbor();
     neighbor->ncalls = 0;
   }
 
@@ -567,8 +583,6 @@ void VerletKokkos::run(int n)
 
 void VerletKokkos::force_clear()
 {
-  int i;
-
   if (external_force_clear) return;
 
   // clear force on all particles
@@ -576,23 +590,15 @@ void VerletKokkos::force_clear()
   // when using threads always clear all forces.
 
   if (neighbor->includegroup == 0) {
-    int nall;
-    if (force->newton) nall = atomKK->nlocal + atomKK->nghost;
-    else nall = atomKK->nlocal;
+    int nall = atomKK->nlocal;
+    if (force->newton) nall += atomKK->nghost;
 
-    size_t nbytes = sizeof(double) * nall;
-
-    if (nbytes) {
-      if (atomKK->k_f.modified_host() > atomKK->k_f.modified_device()) {
-        memset_kokkos(atomKK->k_f.view<LMPHostType>());
-        atomKK->modified(Host,F_MASK);
-        atomKK->sync(Device,F_MASK);
-      } else {
-        memset_kokkos(atomKK->k_f.view<LMPDeviceType>());
-        atomKK->modified(Device,F_MASK);
-      }
-      if (torqueflag)  memset(&(atomKK->torque[0][0]),0,3*nbytes);
+    Kokkos::parallel_for(nall, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_f.view<LMPDeviceType>()));
+    atomKK->modified(Device,F_MASK);
 
+    if (torqueflag) {
+      Kokkos::parallel_for(nall, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_torque.view<LMPDeviceType>()));
+      atomKK->modified(Device,TORQUE_MASK);
     }
 
   // neighbor includegroup flag is set
@@ -600,35 +606,23 @@ void VerletKokkos::force_clear()
   // if either newton flag is set, also include ghosts
 
   } else {
-    int nall = atomKK->nfirst;
-    if (atomKK->k_f.modified_host() > atomKK->k_f.modified_device()) {
-      memset_kokkos(atomKK->k_f.view<LMPHostType>());
-      atomKK->modified(Host,F_MASK);
-    } else {
-      memset_kokkos(atomKK->k_f.view<LMPDeviceType>());
-      atomKK->modified(Device,F_MASK);
-    }
+    Kokkos::parallel_for(atomKK->nfirst, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_f.view<LMPDeviceType>()));
+    atomKK->modified(Device,F_MASK);
+
     if (torqueflag) {
-      double **torque = atomKK->torque;
-      for (i = 0; i < nall; i++) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-        torque[i][2] = 0.0;
-      }
+      Kokkos::parallel_for(atomKK->nfirst, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_torque.view<LMPDeviceType>()));
+      atomKK->modified(Device,TORQUE_MASK);
     }
 
     if (force->newton) {
-      nall = atomKK->nlocal + atomKK->nghost;
+      auto range = Kokkos::RangePolicy<LMPDeviceType>(atomKK->nlocal, atomKK->nlocal + atomKK->nghost);
+      Kokkos::parallel_for(range, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_f.view<LMPDeviceType>()));
+      atomKK->modified(Device,F_MASK);
 
       if (torqueflag) {
-        double **torque = atomKK->torque;
-        for (i = atomKK->nlocal; i < nall; i++) {
-          torque[i][0] = 0.0;
-          torque[i][1] = 0.0;
-          torque[i][2] = 0.0;
-        }
+	Kokkos::parallel_for(range, Zero<typename ArrayTypes<LMPDeviceType>::t_f_array>(atomKK->k_torque.view<LMPDeviceType>()));
+	atomKK->modified(Device,TORQUE_MASK);
       }
-
     }
   }
 }
diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp
index 3aa6d7748ce96b2c095857ed5e9f6c2acbc9314d..f4e733a5acdc58958f21e904c831367c264536d6 100644
--- a/src/comm_brick.cpp
+++ b/src/comm_brick.cpp
@@ -609,7 +609,7 @@ void CommBrick::exchange()
   maxexchange = maxexchange_atom + maxexchange_fix;
   bufextra = maxexchange + BUFEXTRA;
   if (bufextra > bufextra_old)
-    memory->grow(buf_send,maxsend+bufextra,"comm:buf_send");
+    grow_send(maxsend+bufextra,1);
 
   // subbox bounds for orthogonal or triclinic
 
diff --git a/src/fix_neigh_history.cpp b/src/fix_neigh_history.cpp
index 3329b604efa4a0e2f6310e34cce95d7ca6902a14..90b77821e68cb47e06c9cafb8fc4eeed97f6d56e 100644
--- a/src/fix_neigh_history.cpp
+++ b/src/fix_neigh_history.cpp
@@ -95,6 +95,8 @@ FixNeighHistory::FixNeighHistory(LAMMPS *lmp, int narg, char **arg) :
 
 FixNeighHistory::~FixNeighHistory()
 {
+  if (copymode) return;
+
   // unregister this fix so atom class doesn't invoke it any more
 
   atom->delete_callback(id,0);