diff --git a/doc/src/Speed_intel.txt b/doc/src/Speed_intel.txt
index bf078fb57652b747c99b17d12d4f441618e58d24..d35a311f460f91b3a9732fa19c6ad1f3f3df3d4a 100644
--- a/doc/src/Speed_intel.txt
+++ b/doc/src/Speed_intel.txt
@@ -499,7 +499,7 @@ MPI task.
 When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
 that require skip lists for neighbor builds cannot be offloaded.
 Using "hybrid/overlay"_pair_hybrid.html is allowed.  Only one intel
-accelerated style may be used with hybrid styles.
+accelerated style may be used with hybrid styles when offloading.
 "Special_bonds"_special_bonds.html exclusion lists are not currently
 supported with offload, however, the same effect can often be
 accomplished by setting cutoffs for excluded atom types to 0.  None of
diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp
index 5777dad9479957be52f9496435a6fdf845d828ff..6d8901a5ee1141f563e954c8781cccbf96d0948c 100644
--- a/src/USER-INTEL/angle_harmonic_intel.cpp
+++ b/src/USER-INTEL/angle_harmonic_intel.cpp
@@ -314,7 +314,7 @@ void AngleHarmonicIntel::init_style()
 
 template <class flt_t, class acc_t>
 void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
-                                        IntelBuffers<flt_t,acc_t> * /*buffers*/)
+                                          IntelBuffers<flt_t,acc_t> * /*buffers*/)
 {
   const int bp1 = atom->nangletypes + 1;
   fc.set_ntypes(bp1,memory);
diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp
index 9d54269c0563cb4134fdd278eb123ebc66ace8b4..bff3722a44a06ee657f19479ad7b28452567bb97 100644
--- a/src/USER-INTEL/bond_fene_intel.cpp
+++ b/src/USER-INTEL/bond_fene_intel.cpp
@@ -291,7 +291,7 @@ void BondFENEIntel::init_style()
 
 template <class flt_t, class acc_t>
 void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
-                                         IntelBuffers<flt_t,acc_t> * /*buffers*/)
+                                     IntelBuffers<flt_t,acc_t> * /*buffers*/)
 {
   const int bp1 = atom->nbondtypes + 1;
   fc.set_ntypes(bp1,memory);
diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp
index ef95cc75bb1bf998418152d47e994edbeb35f976..7a60b62cae62fc64c5f9d4f1432bfe1cbbe90dd9 100644
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -416,7 +416,7 @@ void DihedralOPLSIntel::init_style()
 
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
-                                             IntelBuffers<flt_t,acc_t> * /*buffers*/)
+                                         IntelBuffers<flt_t,acc_t> * /*buffers*/)
 {
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(bp1,memory);
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index 37212928dfdc5333a50c7fd3daa3b373c9bc70b6..e0865f043164a21a88e1aaee5115b72a6080c9ec 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -65,6 +65,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
 
   _nbor_pack_width = 1;
   _three_body_neighbor = 0;
+  _hybrid_nonpair = 0;
 
   _precision_mode = PREC_MODE_MIXED;
   _offload_balance = -1.0;
@@ -266,8 +267,7 @@ FixIntel::~FixIntel()
     double *time1 = off_watch_pair();
     double *time2 = off_watch_neighbor();
     int *overflow = get_off_overflow_flag();
-    if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
-        overflow != NULL) {
+    if (_offload_balance != 0.0) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
     }
@@ -314,34 +314,63 @@ void FixIntel::init()
 
   int nstyles = 0;
   if (force->pair_match("hybrid", 1) != NULL) {
+    _pair_hybrid_flag = 1;
     PairHybrid *hybrid = (PairHybrid *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
         nstyles++;
+    if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
+      error->all(FLERR,
+                 "Intel package requires fdotr virial with newton on.");
   } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    _pair_hybrid_flag = 1;
     PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
         nstyles++;
-      else
-        force->pair->no_virial_fdotr_compute = 1;
+    if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
+      error->all(FLERR,
+                 "Intel package requires fdotr virial with newton on.");
+  } else
+    _pair_hybrid_flag = 0;
+
+  if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
+  else if (force->newton_pair == 0) _pair_hybrid_flag = 0;
+
+  _pair_hybrid_zero = 0;
+  _zero_master = 0;
+
+  if (_pair_hybrid_flag && _hybrid_nonpair)
+    if (_pair_hybrid_flag > 1 || force->newton_pair == 0)
+      _pair_hybrid_zero = 1;
+  _hybrid_nonpair = 0;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_balance() != 0.0) {
+    _pair_hybrid_zero = 0;
+    if (force->newton_pair == 0) _pair_hybrid_flag = 0;
+    if (nstyles > 1)
+      error->all(FLERR,
+        "Currently, cannot offload more than one intel style with hybrid.");
   }
-  if (nstyles > 1)
-    error->all(FLERR,
-               "Currently, cannot use more than one intel style with hybrid.");
+  #endif
 
   check_neighbor_intel();
+
   int off_mode = 0;
   if (_offload_balance != 0.0) off_mode = 1;
   if (_precision_mode == PREC_MODE_SINGLE) {
     _single_buffers->zero_ev();
     _single_buffers->grow_ncache(off_mode,_nthreads);
+    _single_buffers->free_list_ptrs();
   } else if (_precision_mode == PREC_MODE_MIXED) {
     _mixed_buffers->zero_ev();
     _mixed_buffers->grow_ncache(off_mode,_nthreads);
+    _mixed_buffers->free_list_ptrs();
   } else {
     _double_buffers->zero_ev();
     _double_buffers->grow_ncache(off_mode,_nthreads);
+    _double_buffers->free_list_ptrs();
   }
 
   _need_reduce = 0;
@@ -349,7 +378,7 @@ void FixIntel::init()
 
 /* ---------------------------------------------------------------------- */
 
-void FixIntel::setup(int /*vflag*/)
+void FixIntel::setup(int vflag)
 {
   if (neighbor->style != Neighbor::BIN)
     error->all(FLERR,
@@ -395,8 +424,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
     double *time1 = off_watch_pair();
     double *time2 = off_watch_neighbor();
     int *overflow = get_off_overflow_flag();
-    if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
-        overflow != NULL) {
+    if (_offload_balance !=0.0) {
       #pragma offload_transfer target(mic:_cop)  \
         nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
         in(overflow:length(5) alloc_if(1) free_if(0))
@@ -419,6 +447,21 @@ void FixIntel::pair_init_check(const bool cdmessage)
     #endif
   }
 
+  #ifndef LMP_INTEL_NBOR_COMPAT
+  if (force->pair->manybody_flag && atom->molecular) {
+    int flag = 0;
+    if (atom->nbonds > 0 && force->special_lj[1] == 0.0 &&
+        force->special_coul[1] == 0.0) flag = 1;
+    if (atom->nangles > 0 && force->special_lj[2] == 0.0 &&
+        force->special_coul[2] == 0.0) flag = 1;
+    if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 &&
+        force->special_coul[3] == 0.0) flag = 1;
+    if (flag)
+      error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond"
+                 "exclusions with Intel");
+  }
+  #endif
+  
   int need_tag = 0;
   if (atom->molecular) need_tag = 1;
 
@@ -477,11 +520,13 @@ void FixIntel::bond_init_check()
   if (force->pair_match("/intel", 0) != NULL)
     intel_pair = 1;
   else if (force->pair_match("hybrid", 1) != NULL) {
+    _hybrid_nonpair = 1;
     PairHybrid *hybrid = (PairHybrid *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
         intel_pair = 1;
   } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    _hybrid_nonpair = 1;
     PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
@@ -501,11 +546,13 @@ void FixIntel::kspace_init_check()
   if (force->pair_match("/intel", 0) != NULL)
     intel_pair = 1;
   else if (force->pair_match("hybrid", 1) != NULL) {
+    _hybrid_nonpair = 1;
     PairHybrid *hybrid = (PairHybrid *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
         intel_pair = 1;
   } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    _hybrid_nonpair = 1;
     PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
     for (int i = 0; i < hybrid->nstyles; i++)
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
@@ -522,51 +569,60 @@ void FixIntel::check_neighbor_intel()
 {
   #ifdef _LMP_INTEL_OFFLOAD
   _full_host_list = 0;
-  #endif
-  const int nrequest = neighbor->nrequest;
 
+  const int nrequest = neighbor->nrequest;
   for (int i = 0; i < nrequest; ++i) {
-    #ifdef _LMP_INTEL_OFFLOAD
     if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) {
       _full_host_list = 1;
       _offload_noghost = 0;
     }
-    #endif
+    if (neighbor->requests[i]->skip && _offload_balance != 0.0)
+      error->all(FLERR, "Cannot yet use hybrid styles with Intel offload.");
 
     // avoid flagging a neighbor list as both USER-INTEL and USER-OMP
     if (neighbor->requests[i]->intel)
       neighbor->requests[i]->omp = 0;
-
-    if (neighbor->requests[i]->skip)
-      error->all(FLERR, "Hybrid styles with Intel package are unsupported.");
   }
+  #else
+  // avoid flagging a neighbor list as both USER-INTEL and USER-OMP
+  const int nrequest = neighbor->nrequest;
+  for (int i = 0; i < nrequest; ++i)
+    if (neighbor->requests[i]->intel)
+      neighbor->requests[i]->omp = 0;
+  #endif
 }
 
 /* ---------------------------------------------------------------------- */
 
-void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
+void FixIntel::_sync_main_arrays(const int prereverse)
 {
+  if (!prereverse) _zero_master = 1;
+  int done_this_step = prereverse;
+  if (_pair_hybrid_zero == 0) done_this_step = 1;
   if (_force_array_m != 0) {
     if (_need_reduce) {
       reduce_results(&_force_array_m[0].x);
       _need_reduce = 0;
     }
     add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
-    _force_array_m = 0;
+    if (done_this_step) _force_array_m = 0;
+    else _ev_array_d = 0;
   } else if (_force_array_d != 0) {
     if (_need_reduce) {
       reduce_results(&_force_array_d[0].x);
       _need_reduce = 0;
     }
     add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
-    _force_array_d = 0;
+    if (done_this_step) _force_array_d = 0;
+    else _ev_array_d = 0;
   } else if (_force_array_s != 0) {
     if (_need_reduce) {
       reduce_results(&_force_array_s[0].x);
       _need_reduce = 0;
     }
     add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
-    _force_array_s = 0;
+    if (done_this_step) _force_array_s = 0;
+    else _ev_array_s = 0;
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
@@ -576,6 +632,13 @@ void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
 
 /* ---------------------------------------------------------------------- */
 
+void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
+{
+  _sync_main_arrays(1);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <class acc_t>
 void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
 {
@@ -657,7 +720,7 @@ template <class ft, class acc_t>
 void FixIntel::add_results(const ft * _noalias const f_in,
                            const acc_t * _noalias const ev_global,
                            const int eatom, const int vatom,
-                           const int /*offload*/) {
+                           const int offload) {
   start_watch(TIME_PACK);
   int f_length;
   #ifdef _LMP_INTEL_OFFLOAD
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index e6e09be503dc0d69840e251c0abc20a103890a04..81dd0b5d975886d6c6a5444b6bf666543ffc9a56 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -74,11 +74,12 @@ class FixIntel : public Fix {
   inline int nbor_pack_width() const { return _nbor_pack_width; }
   inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
   inline int three_body_neighbor() { return _three_body_neighbor; }
-  inline void three_body_neighbor(const int /*i*/) { _three_body_neighbor = 1; }
+  inline void three_body_neighbor(const int i) { _three_body_neighbor = i; }
 
   inline int need_zero(const int tid) {
     if (_need_reduce == 0 && tid > 0) return 1;
-    return 0;
+    else if (_zero_master && tid == 0) { _zero_master = 0; return 1; }
+    else return 0;
   }
   inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
   inline int lrt() {
@@ -100,7 +101,10 @@ class FixIntel : public Fix {
   IntelBuffers<double,double> *_double_buffers;
 
   int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
-
+  int _pair_hybrid_flag;
+  // These should be removed in subsequent update w/ simpler hybrid arch
+  int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;
+  
  public:
   inline int* get_overflow_flag() { return _overflow_flag; }
   inline int* get_off_overflow_flag() { return _off_overflow_flag; }
@@ -210,6 +214,8 @@ class FixIntel : public Fix {
   _alignvar(double _stopwatch_offload_neighbor[1],64);
   _alignvar(double _stopwatch_offload_pair[1],64);
 
+  void _sync_main_arrays(const int prereverse);
+  
   template <class ft>
   void reduce_results(ft * _noalias const f_in);
 
@@ -238,7 +244,7 @@ class FixIntel : public Fix {
 
 /* ---------------------------------------------------------------------- */
 
-void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
+void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
                            int &minlocal) {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
@@ -273,7 +279,7 @@ void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
 /* ---------------------------------------------------------------------- */
 
 void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
-                                double *ev_in, const int /*offload*/,
+                                double *ev_in, const int offload,
                                 const int eatom, const int vatom,
                                 const int rflag) {
   #ifdef _LMP_INTEL_OFFLOAD
@@ -282,6 +288,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
     _off_results_vatom = vatom;
     _off_force_array_d = f_in;
     _off_ev_array_d = ev_in;
+    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
+       _sync_main_arrays(1);
     return;
   }
   #endif
@@ -296,12 +304,15 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
 
   if (_overflow_flag[LMP_OVERFLOW])
     error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
+  if (_pair_hybrid_flag > 1 ||
+      (_pair_hybrid_flag && force->pair->fdotr_is_set())) _sync_main_arrays(0);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
-                                double *ev_in, const int /*offload*/,
+                                double *ev_in, const int offload,
                                 const int eatom, const int vatom,
                                 const int rflag) {
   #ifdef _LMP_INTEL_OFFLOAD
@@ -310,6 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
     _off_results_vatom = vatom;
     _off_force_array_m = f_in;
     _off_ev_array_d = ev_in;
+    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
+       _sync_main_arrays(1);
     return;
   }
   #endif
@@ -324,12 +337,16 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
 
   if (_overflow_flag[LMP_OVERFLOW])
     error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
+  if (_pair_hybrid_flag > 1 ||
+      (_pair_hybrid_flag && force->pair->fdotr_is_set()))
+    _sync_main_arrays(0);
 }
 
 /* ---------------------------------------------------------------------- */
 
 void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
-                                float *ev_in, const int /*offload*/,
+                                float *ev_in, const int offload,
                                 const int eatom, const int vatom,
                                 const int rflag) {
   #ifdef _LMP_INTEL_OFFLOAD
@@ -338,6 +355,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
     _off_results_vatom = vatom;
     _off_force_array_s = f_in;
     _off_ev_array_s = ev_in;
+    if (_pair_hybrid_flag && force->pair->fdotr_is_set())
+       _sync_main_arrays(1);
     return;
   }
   #endif
@@ -352,6 +371,10 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
 
   if (_overflow_flag[LMP_OVERFLOW])
     error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
+  if (_pair_hybrid_flag > 1 ||
+      (_pair_hybrid_flag && force->pair->fdotr_is_set()))
+    _sync_main_arrays(0);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -487,16 +510,16 @@ The compiler version used to build LAMMPS is not supported when using
 offload to a coprocessor. There could be performance or correctness
 issues. Please use 14.0.1.106 or 15.1.133 or later.
 
-E: Currently, cannot use more than one intel style with hybrid.
+E: Currently, cannot offload more than one intel style with hybrid.
 
-Currently, hybrid pair styles can only use the intel suffix for one of the
-pair styles.
+Currently, when using offload, hybrid pair styles can only use the intel 
+suffix for one of the pair styles.
 
-E: Cannot yet use hybrid styles with Intel package.
+E: Cannot yet use hybrid styles with Intel offload.
 
-The hybrid pair style configuration is not yet supported by the Intel
-package. Support is limited to hybrid/overlay or a hybrid style that does
-not require a skip list.
+The hybrid pair style configuration is not yet supported when using offload
+within the Intel package. Support is limited to hybrid/overlay or a hybrid 
+style that does not require a skip list.
 
 W: Leaving a core/node free can improve performance for offload
 
@@ -538,4 +561,16 @@ E: Too few atoms for load balancing offload.
 When using offload to a coprocessor, each MPI task must have at least 2
 atoms throughout the simulation.
 
+E: Intel package requires fdotr virial with newton on.
+
+This error can occur with a hybrid pair style that mixes styles that are
+incompatible with the newton pair setting turned on. Try turning the 
+newton pair setting off.
+
+E: Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond exclusions with Intel
+
+When using a manybody pair style, bonds/angles/dihedrals, and special_bond
+exclusions, LAMMPS should be built with the above compile flag for compatible
+results.
+
 */
diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp
index c20c004657b1b53ce16c55dd6a41a87f37d708f4..03bd134b401f67e4aec8603bd858e7dc9716a597 100644
--- a/src/USER-INTEL/improper_cvff_intel.cpp
+++ b/src/USER-INTEL/improper_cvff_intel.cpp
@@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
 
 template <class flt_t, class acc_t>
 void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
-                                             IntelBuffers<flt_t,acc_t> * /*buffers*/)
+                                         IntelBuffers<flt_t,acc_t> * /*buffers*/)
 {
   const int bp1 = atom->nimpropertypes + 1;
   fc.set_ntypes(bp1,memory);
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index 05176e1fd1e1bf4bc629dc6d8b40275b4bed81bb..b7026f90b762793124c42a4c474db8e09b3756a9 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -24,7 +24,9 @@ using namespace LAMMPS_NS;
 template <class flt_t, class acc_t>
 IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
     lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
-    _buf_size(0), _buf_local_size(0) {
+    _buf_size(0), _buf_local_size(0), _n_list_ptrs(1), _max_list_ptrs(4) {
+  _neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs];
+  _neigh_list_ptrs[0].cnumneigh = 0;
   _list_alloc_atoms = 0;
   _ntypes = 0;
   _off_map_listlocal = 0;
@@ -55,6 +57,7 @@ IntelBuffers<flt_t, acc_t>::~IntelBuffers()
   free_all_nbor_buffers();
   free_ccache();
   set_ntypes(0);
+  delete []_neigh_list_ptrs;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -109,7 +112,7 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
                                        const int nthreads,
-                                       const int /*offload_end*/)
+                                       const int offload_end)
 {
   free_buffers();
   _buf_size = static_cast<double>(nall) * 1.1 + 1;
@@ -186,11 +189,9 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
     const int * tag = _off_map_tag;
     const int * special = _off_map_special;
     const int * nspecial = _off_map_nspecial;
-    if (tag != 0 && special != 0 && nspecial !=0) {
-      #pragma offload_transfer target(mic:_cop) \
-        nocopy(tag:alloc_if(0) free_if(1)) \
-        nocopy(special,nspecial:alloc_if(0) free_if(1))
-    }
+    #pragma offload_transfer target(mic:_cop) \
+      nocopy(tag:alloc_if(0) free_if(1)) \
+      nocopy(special,nspecial:alloc_if(0) free_if(1))
     _off_map_nmax = 0;
     _host_nmax = 0;
   }
@@ -200,7 +201,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int /*offload_end*/)
+void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int offload_end)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   free_nmax();
@@ -243,46 +244,117 @@ template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_list_local()
 {
   if (_off_map_listlocal > 0) {
-    int * cnumneigh = _cnumneigh;
+    if (_neigh_list_ptrs[0].cnumneigh) {
+      int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
+      _neigh_list_ptrs[0].cnumneigh = 0;
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (_off_map_ilist != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cnumneigh:alloc_if(0) free_if(1))
+      }
+      #endif
+      lmp->memory->destroy(cnumneigh);
+    }
+      
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_map_ilist != NULL) {
       const int * ilist = _off_map_ilist;
       const int * numneigh = _off_map_numneigh;
+      const int ** firstneigh = (const int **)_off_map_firstneigh;
       _off_map_ilist = NULL;
-      if (numneigh != 0 && ilist != 0) {
-        #pragma offload_transfer target(mic:_cop) \
-          nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
-      }
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ilist,firstneigh,numneigh:alloc_if(0) free_if(1))
     }
     #endif
-    lmp->memory->destroy(cnumneigh);
     _off_map_listlocal = 0;
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_list_ptrs()
+{
+  for (int list_num = 1; list_num < _n_list_ptrs; list_num++) {
+    if (_neigh_list_ptrs[list_num].size) {
+      lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
+      lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
+    }
+    _neigh_list_ptrs[list_num].size = 0;
+    _neigh_list_ptrs[list_num].list_ptr = 0;
+  }
+  _n_list_ptrs = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::grow_data3(NeighList *list,
+                                            int *&numneighhalf,
+                                            int *&cnumneigh)
+{
+  const int size = list->get_maxlocal();
+  int list_num;
+  for (list_num = 0; list_num < _n_list_ptrs; list_num++) 
+    if (_neigh_list_ptrs[list_num].list_ptr == (void*)list) break;
+  if (list_num == _n_list_ptrs) {
+    if (_n_list_ptrs == _max_list_ptrs) {
+      _max_list_ptrs *= 2;
+      IntelNeighListPtrs *new_list = new IntelNeighListPtrs[_max_list_ptrs];
+      for (int i = 0; i < _n_list_ptrs; i++) new_list[i] = _neigh_list_ptrs[i];
+      delete []_neigh_list_ptrs;
+      _neigh_list_ptrs = new_list;
+    }
+    _neigh_list_ptrs[list_num].list_ptr = (void *)list;
+    _neigh_list_ptrs[list_num].size = 0;
+    _n_list_ptrs++;
+  }
+  if (size > _neigh_list_ptrs[list_num].size) {
+    if (_neigh_list_ptrs[list_num].size) {
+      lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
+      lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
+    }
+    lmp->memory->create(_neigh_list_ptrs[list_num].cnumneigh, size,
+                        "_cnumneigh");
+    lmp->memory->create(_neigh_list_ptrs[list_num].numneighhalf, size,
+                        "_cnumneigh");
+    _neigh_list_ptrs[list_num].size = size;
+  }
+  numneighhalf = _neigh_list_ptrs[list_num].numneighhalf;
+  cnumneigh = _neigh_list_ptrs[list_num].cnumneigh;
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
-                                                  const int /*offload_end*/)
+                                                  const int three_body,
+                                                  const int offload_end)
 {
   free_list_local();
   int size = list->get_maxlocal();
-  lmp->memory->create(_cnumneigh, size, "_cnumneigh");
   _off_map_listlocal = size;
+  if (three_body)
+    lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, size, "_cnumneigh");
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_end > 0) {
+    int tb_size = size;
+    if (three_body == 0) {
+      lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, 16, "_cnumneigh");
+      tb_size = 16;
+    }
+    int ** firstneigh = list->firstneigh;
     int * numneigh = list->numneigh;
     int * ilist = list->ilist;
-    int * cnumneigh = _cnumneigh;
-    if (cnumneigh != 0) {
-      #pragma offload_transfer target(mic:_cop) \
-        nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
-        nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
-        nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
-    }
+    int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
+    #pragma offload_transfer target(mic:_cop) \
+      nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
+      nocopy(firstneigh:length(size) alloc_if(1) free_if(0)) \
+      nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
+      nocopy(cnumneigh:length(tb_size) alloc_if(1) free_if(0))
     _off_map_ilist = ilist;
+    _off_map_firstneigh = firstneigh;
     _off_map_numneigh = numneigh;
   }
   #endif
@@ -313,7 +385,7 @@ template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList * /*list*/,
                                                  const int nlocal,
                                                  const int nthreads,
-                                                 const int /*offload_end*/,
+                                                 const int offload_end,
                                                  const int pack_width)
 {
   free_nbor_list();
@@ -382,7 +454,7 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::grow_ccache(const int /*off_flag*/,
+void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
         const int nthreads,
         const int width)
 {
@@ -481,7 +553,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::grow_ncache(const int /*off_flag*/,
+void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
                                              const int nthreads)
 {
   const int nsize = get_max_nbors() * 3;
@@ -576,12 +648,12 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes,
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * cutneighsqo = _cutneighsq[0];
-      if (_off_threads > 0 && cutneighsqo != 0) {
+      if (_off_threads > 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(cutneighsqo:alloc_if(0) free_if(1))
       }
       flt_t * cutneighghostsqo;
-      if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) {
+      if (_cutneighghostsq && _off_threads > 0) {
         cutneighghostsqo = _cutneighghostsq[0];
         #pragma offload_transfer target(mic:_cop) \
           nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
@@ -637,6 +709,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
   tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
   tmem += _ntypes * _ntypes * sizeof(int);
 
+  tmem += _buf_local_size + (_n_list_ptrs - 1) * _buf_local_size * 2;
+
   return tmem;
 }
 
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index e8225a4a30f42f9d9012452fbf9bdfbebdce5195..276b87ec5f2a2d1ff86302a9d257bd1aec6ec7a9 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -33,12 +33,20 @@ namespace LAMMPS_NS {
 #define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
 #define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
 
+struct IntelNeighListPtrs {
+  void * list_ptr;
+  int *cnumneigh;
+  int *numneighhalf;
+  int size;
+};
+
 // May not need a separate force array for mixed/double
 template <class flt_t, class acc_t>
 class IntelBuffers {
  public:
   typedef struct { flt_t x,y,z; int w; } atom_t;
   typedef struct { flt_t w,i,j,k; } quat_t;
+  typedef struct { flt_t x,y; } vec2_t;
   typedef struct { flt_t x,y,z,w; } vec3_t;
   typedef struct { flt_t x,y,z,w; } vec4_t;
   typedef struct { acc_t x,y,z,w; } vec3_acc_t;
@@ -62,8 +70,12 @@ class IntelBuffers {
 
   void free_buffers();
   void free_nmax();
-  inline void set_bininfo(int *atombin, int *binpacked)
-    { _atombin = atombin; _binpacked = binpacked; }
+  inline void set_bininfo(int *atombin, int *binpacked) {
+    _atombin = atombin;
+    _binpacked = binpacked;
+    _neigh_list_ptrs[0].numneighhalf = atombin;
+  }
+
   inline void grow(const int nall, const int nlocal, const int nthreads,
                    const int offload_end) {
     if (nall >= _buf_size || nlocal >= _buf_local_size)
@@ -79,18 +91,22 @@ class IntelBuffers {
     free_nmax();
     free_list_local();
     free_ncache();
+    free_list_ptrs();
   }
 
   inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
-                        const int offload_end, const int pack_width=1) {
-    grow_list_local(list, offload_end);
+                        const int three_body, const int offload_end,
+                        const int pack_width=1) {
+    grow_list_local(list, three_body, offload_end);
     grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
   void free_list_local();
-  inline void grow_list_local(NeighList *list, const int offload_end) {
+  inline void grow_list_local(NeighList *list, const int three_body,
+                              const int offload_end) {
+    _neigh_list_ptrs[0].list_ptr = (void *)list;
     if (list->get_maxlocal() > _off_map_listlocal)
-      _grow_list_local(list, offload_end);
+      _grow_list_local(list, three_body, offload_end);
   }
 
   void free_ccache();
@@ -133,26 +149,36 @@ class IntelBuffers {
       _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
-  void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
+  void set_ntypes(const int ntypes, const int use_ghost_cut = 1);
 
-  inline int * firstneigh(const NeighList * /*list*/) { return _list_alloc; }
-  inline int * cnumneigh(const NeighList * /*list*/) { return _cnumneigh; }
+  inline int * intel_list(const NeighList * /*list*/) { return _list_alloc; }
   inline int * get_atombin() { return _atombin; }
   inline int * get_binpacked() { return _binpacked; }
+  inline int * cnumneigh() { return _neigh_list_ptrs[0].cnumneigh; }
+  inline void get_list_data3(const NeighList *list, int *&numneighhalf,
+                             int *&cnumneigh) {
+    for (int i = 0; i < _n_list_ptrs; i++)
+      if ((void *)list == _neigh_list_ptrs[i].list_ptr) {
+        numneighhalf = _neigh_list_ptrs[i].numneighhalf;
+        cnumneigh = _neigh_list_ptrs[i].cnumneigh;
+      }
+  }
+  void grow_data3(NeighList *list, int *&numneighhalf, int *&cnumneigh);
+  void free_list_ptrs();
 
-  inline atom_t * get_x(const int /*offload*/ = 1) {
+  inline atom_t * get_x(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_x;
     #endif
     return _x;
   }
-  inline flt_t * get_q(const int /*offload*/ = 1) {
+  inline flt_t * get_q(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_q;
     #endif
     return _q;
   }
-  inline quat_t * get_quat(const int /*offload*/ = 1) {
+  inline quat_t * get_quat(const int offload = 1) {
     #ifdef _LMP_INTEL_OFFLOAD
     if (_separate_buffers && offload == 0) return _host_quat;
     #endif
@@ -298,6 +324,9 @@ class IntelBuffers {
   int _list_alloc_atoms;
   int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
 
+  IntelNeighListPtrs *_neigh_list_ptrs;
+  int _n_list_ptrs, _max_list_ptrs;
+
   flt_t **_cutneighsq, **_cutneighghostsq;
   int _ntypes;
 
@@ -325,7 +354,7 @@ class IntelBuffers {
   int _off_map_nmax, _cop, _off_ccache, _off_ncache;
   int *_off_map_ilist;
   int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
-  int *_off_map_numneigh;
+  int **_off_map_firstneigh, *_off_map_numneigh;
   bool _off_list_alloc;
   #endif
 
@@ -336,7 +365,8 @@ class IntelBuffers {
   void _grow(const int nall, const int nlocal, const int nthreads,
              const int offload_end);
   void _grow_nmax(const int offload_end);
-  void _grow_list_local(NeighList *list, const int offload_end);
+  void _grow_list_local(NeighList *list, const int three_body,
+                        const int offload_end);
   void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
                        const int offload_end, const int pack_width);
 };
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 2f1b02a49f48826c3cf8841b7643972ec9914354..178a20c6e13396c0e2b9c00ab3814f7aa6a6e8d5 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -20,6 +20,9 @@
 #if (__INTEL_COMPILER_BUILD_DATE > 20160720)
 #define LMP_INTEL_USE_SIMDOFF
 #endif
+#pragma warning (disable:3948)
+#pragma warning (disable:3949)
+#pragma warning (disable:13200)
 #endif
 
 #ifdef __INTEL_OFFLOAD
@@ -606,7 +609,7 @@ inline double MIC_Wtime() {
   if (newton)                                                           \
     f_stride = buffers->get_stride(nall);                               \
   else                                                                  \
-    f_stride = buffers->get_stride(inum);                               \
+    f_stride = buffers->get_stride(nlocal);                             \
 }
 
 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h
index e9d5d929953682276884274fe6d90285a88bf5eb..75fc9828b96f0f99cfcfac86ce384c9f535efe73 100644
--- a/src/USER-INTEL/intel_simd.h
+++ b/src/USER-INTEL/intel_simd.h
@@ -213,6 +213,12 @@ namespace ip_simd {
                                        _MM_SCALE_8);
   }
 
+  inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p,
+                               const SIMD_int &i) {
+    return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
+                                    _MM_SCALE_4);
+  }
+
   inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
                                  const SIMD_int &i) {
     return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p,
diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp
index 789fa35b429448a5e50651b206d5baea7364dd63..8359587eb28c4ea46592db8022abb8b460209749 100644
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@@ -192,7 +192,6 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
 
   // ---------- Bin Atoms -------------
   _fix->start_watch(TIME_HOST_NEIGHBOR);
-  //const ATOM_T * _noalias const x = buffers->get_x();
   int * _noalias const atombin = this->_atombin;
   int * _noalias const binpacked = this->_binpacked;
 
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
index 74a04f0e7de9dd97197049c7ac9227d2b8ee2516..5149b26f2f82019149589a2c307f58fbcdbc82cd 100644
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -86,7 +86,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
 
   // only uses offload_end_neighbor to check whether we are doing offloading
   // at all, no need to correct this later
-  buffers->grow_list(list, nall, comm->nthreads, off_end,
+  buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
                      _fix->nbor_pack_width());
 
   int need_ic = 0;
@@ -106,7 +106,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
 /* ---------------------------------------------------------------------- */
 
 template<class flt_t, class acc_t, int need_ic>
-void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
+void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
                                  IntelBuffers<flt_t,acc_t> * buffers,
                                  const int pstart, const int pend) {
   if (pend-pstart == 0) return;
@@ -116,7 +116,8 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
   const int aend = nall;
 
   const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
+  int * _noalias const intel_list = buffers->intel_list(list);
+  int ** _noalias const firstneigh = list->firstneigh;
   const int e_nall = nall_t;
 
   const int molecular = atom->molecular;
@@ -140,7 +141,6 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
 
   int * _noalias const ilist = list->ilist;
   int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int nstencil = this->nstencil;
   const int * _noalias const stencil = this->stencil;
   const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@@ -228,7 +228,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
     in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
     in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(intel_list:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(atombin:length(aend) alloc_if(0) free_if(0)) \
@@ -295,7 +295,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
 
       int pack_offset = maxnbors;
       int ct = (ifrom + tid * 2) * maxnbors;
-      int *neighptr = firstneigh + ct;
+      int *neighptr = intel_list + ct;
       const int obound = pack_offset + maxnbors * 2;
 
       const int toffs = tid * ncache_stride;
@@ -528,12 +528,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
         if (ns > maxnbors) *overflow = 1;
 
         ilist[i] = i;
-        cnumneigh[i] = ct;
+        firstneigh[i] = intel_list + ct;
         numneigh[i] = ns;
 
         ct += ns;
         IP_PRE_edge_align(ct, sizeof(int));
-        neighptr = firstneigh + ct;
+        neighptr = intel_list + ct;
         if (ct + obound > list_size) {
           if (i < ito - 1) {
             *overflow = 1;
@@ -564,13 +564,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
   if (offload) {
     _fix->stop_watch(TIME_OFFLOAD_LATENCY);
     _fix->start_watch(TIME_HOST_NEIGHBOR);
+    firstneigh[0] = intel_list;
     for (int n = 0; n < aend; n++) {
       ilist[n] = n;
       numneigh[n] = 0;
     }
   } else {
-    for (int i = 0; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
     if (separate_buffers) {
       _fix->start_watch(TIME_PACK);
       _fix->set_neighbor_host_sizes();
@@ -581,10 +580,5 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
       _fix->stop_watch(TIME_PACK);
     }
   }
-  #else
-  #pragma vector aligned
-  #pragma simd
-  for (int i = 0; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
   #endif
 }
diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp
index 60b912d796146d2c9a7f7a15d116afdc430c20ff..4ef84be75ff0cffadce6e5afb54408e58c65dd98 100644
--- a/src/USER-INTEL/npair_full_bin_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_intel.cpp
@@ -70,7 +70,8 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int offload_noghost = _fix->offload_noghost();
   #endif
 
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
+  buffers->grow_list(list, atom->nlocal, comm->nthreads,
+                     _fix->three_body_neighbor(), off_end,
                      _fix->nbor_pack_width());
 
   int need_ic = 0;
diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
index 8c024a46046827985db3f8047446ad06df61eba1..799ba2b57b67e26e88b8a9a959a23d5fbd4a8167 100644
--- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
@@ -71,7 +71,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int offload_noghost = _fix->offload_noghost();
   #endif
 
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
+  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
 
   int need_ic = 0;
   if (atom->molecular)
diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
index 653a95139498dbe1d34b5c8d77ca706664e02c56..1f859cb5f1bc821202ceb80271d4e336c3772fd0 100644
--- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
@@ -71,7 +71,7 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int offload_noghost = _fix->offload_noghost();
   #endif
 
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
+  buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
 
   int need_ic = 0;
   if (atom->molecular)
diff --git a/src/USER-INTEL/npair_halffull_newtoff_intel.h b/src/USER-INTEL/npair_halffull_newtoff_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..c4c41a85a105fd453db73473dfbf69ccd80e99d8
--- /dev/null
+++ b/src/USER-INTEL/npair_halffull_newtoff_intel.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+// Only used for hybrid to generate list for non-intel style. Use
+// standard routines.
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(halffull/newtoff/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_INTEL)
+
+NPairStyle(halffull/newtoff/skip/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
+
+NPairStyle(halffull/newtoff/ghost/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
+
+NPairStyle(halffull/newtoff/skip/ghost/intel,
+           NPairHalffullNewtoff,
+           NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL)
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/USER-INTEL/npair_halffull_newton_intel.cpp b/src/USER-INTEL/npair_halffull_newton_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..93a1a9792392a6ffd89f6fe8e9e41a0d65f836f7
--- /dev/null
+++ b/src/USER-INTEL/npair_halffull_newton_intel.cpp
@@ -0,0 +1,233 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_halffull_newton_intel.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "modify.h"
+#include "molecule.h"
+#include "domain.h"
+#include "my_page.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  _fix = static_cast<FixIntel *>(modify->fix[ifix]);
+}
+
+/* ----------------------------------------------------------------------
+   build half list from full list
+   pair stored once if i,j are both owned and i < j
+   if j is ghost, only store if j coords are "above and to the right" of i
+   works if full list is a skip list
+------------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void NPairHalffullNewtonIntel::build_t(NeighList *list,
+                                       IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int inum_full = list->listfull->inum;
+  const int nlocal = atom->nlocal;
+  const int e_nall = nlocal + atom->nghost;
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = list->firstneigh;
+  const int * _noalias const ilist_full = list->listfull->ilist;
+  const int * _noalias const numneigh_full = list->listfull->numneigh;
+  const int ** _noalias const firstneigh_full =
+    (const int ** const)list->listfull->firstneigh;
+  
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      const int i = ilist_full[ii];
+      const flt_t xtmp = x[i].x;
+      const flt_t ytmp = x[i].y;
+      const flt_t ztmp = x[i].z;
+
+      // loop over full neighbor list
+
+      const int * _noalias const jlist = firstneigh_full[i];
+      const int jnum = numneigh_full[i];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
+      for (int jj = 0; jj < jnum; jj++) {
+        const int joriginal = jlist[jj];
+        const int j = joriginal & NEIGHMASK;
+        int addme = 1;
+        if (j < nlocal) {
+          if (i > j) addme = 0;
+        } else {
+          if (x[j].z < ztmp) addme = 0;
+          if (x[j].z == ztmp) {
+            if (x[j].y < ytmp) addme = 0;
+            if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
+          }
+        }
+        if (addme)
+          neighptr[n++] = joriginal;
+      }
+
+      ilist[ii] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  list->inum = inum_full;
+}
+
+/* ----------------------------------------------------------------------
+   build half list from full 3-body list
+   half list is already stored as first part of 3-body list
+------------------------------------------------------------------------- */
+
+template <class flt_t>
+void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
+{
+  const int inum_full = list->listfull->inum;
+  const int e_nall = atom->nlocal + atom->nghost;
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = list->firstneigh;
+  const int * _noalias const ilist_full = list->listfull->ilist;
+  const int * _noalias const numneigh_full = numhalf;
+  const int ** _noalias const firstneigh_full =
+    (const int ** const)list->listfull->firstneigh;
+  
+  int packthreads = 1;
+  if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel if(packthreads > 1)
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      const int i = ilist_full[ii];
+
+      // loop over full neighbor list
+
+      const int * _noalias const jlist = firstneigh_full[i];
+      const int jnum = numneigh_full[ii];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
+      for (int jj = 0; jj < jnum; jj++) {
+        const int joriginal = jlist[jj];
+        neighptr[n++] = joriginal;
+      }
+
+      ilist[ii] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  list->inum = inum_full;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairHalffullNewtonIntel::build(NeighList *list)
+{
+  if (_fix->three_body_neighbor() == 0) {
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+      build_t(list, _fix->get_mixed_buffers());
+    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+      build_t(list, _fix->get_double_buffers());
+    else
+      build_t(list, _fix->get_single_buffers());
+  } else {
+    int *nhalf, *cnum;
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      _fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<float>(list, nhalf);
+    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      _fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<double>(list, nhalf);
+    } else {
+      _fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum);
+      build_t3<float>(list, nhalf);
+    }
+  }
+}
diff --git a/src/USER-INTEL/npair_halffull_newton_intel.h b/src/USER-INTEL/npair_halffull_newton_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3069d80747845f3a8157d0bce4c0ac2e5cd7eab
--- /dev/null
+++ b/src/USER-INTEL/npair_halffull_newton_intel.h
@@ -0,0 +1,72 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(halffull/newton/intel,
+           NPairHalffullNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI| NP_INTEL)
+
+NPairStyle(halffull/newton/skip/intel,
+           NPairHalffullNewtonIntel,
+           NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
+           NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
+
+#else
+
+#ifndef LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
+#define LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
+
+#include "npair.h"
+#include "fix_intel.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+
+namespace LAMMPS_NS {
+
+class NPairHalffullNewtonIntel : public NPair {
+ public:
+  NPairHalffullNewtonIntel(class LAMMPS *);
+  ~NPairHalffullNewtonIntel() {}
+  void build(class NeighList *);
+
+ protected:
+  FixIntel *_fix;
+
+  template<class flt_t, class acc_t>
+  void build_t(NeighList *, IntelBuffers<flt_t,acc_t> *);
+
+  template<class flt_t>
+  void build_t3(NeighList *, int *);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self explanatory.
+
+*/
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index 6d4529752ab38dd490680069d627c97a0de22446..a1c0785d4c4191312f769a577c2104e5060114ec 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -52,12 +52,61 @@ NPairIntel::~NPairIntel() {
 
 /* ---------------------------------------------------------------------- */
 
+void NPairIntel::copy_neighbor_info()
+{
+  NPair::copy_neighbor_info();
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+    copy_cutsq_info(_fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    copy_cutsq_info(_fix->get_double_buffers());
+  else
+    copy_cutsq_info(_fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void NPairIntel::copy_cutsq_info(IntelBuffers<flt_t,acc_t> *buffers) {
+  int tp1 = atom->ntypes + 1;
+  int use_ghost_cut = 0;
+  if (cutneighghostsq)
+    use_ghost_cut = 1;
+  buffers->set_ntypes(tp1, use_ghost_cut);
+  
+  flt_t **cutneighsqb = buffers->get_cutneighsq();
+  for (int i = 1; i <= atom->ntypes; i++)
+    for (int j = 1; j <= atom->ntypes; j++)
+      cutneighsqb[i][j] = cutneighsq[i][j];
+
+  flt_t **cutneighghostsqb;
+  if (use_ghost_cut) {
+    cutneighghostsqb = buffers->get_cutneighghostsq();
+    for (int i = 1; i <= atom->ntypes; i++)
+      for (int j = 1; j <= atom->ntypes; j++)
+        cutneighghostsqb[i][j] = cutneighghostsq[i][j];
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  int tp1sq = tp1 * tp1;
+  flt_t * ocutneighsq = cutneighsqb[0];
+  #pragma offload_transfer target(mic:_cop) in(ocutneighsq: length(tp1sq))
+  if (use_ghost_cut) {
+    flt_t * ocutneighghostsq = cutneighghostsqb[0];
+    #pragma offload_transfer target(mic:_cop) \
+      in(ocutneighghostsq: length(tp1sq))
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
 template <class flt_t, class acc_t, int offload_noghost, int need_ic,
           int FULL, int TRI, int THREE>
-void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
+void NPairIntel::bin_newton(const int offload, NeighList *list,
                             IntelBuffers<flt_t,acc_t> *buffers,
                             const int astart, const int aend,
-                            const int /*offload_end*/) {
+                            const int offload_end) {
 
   if (aend-astart == 0) return;
 
@@ -71,7 +120,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
   const int pack_width = _fix->nbor_pack_width();
 
   const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
+  int * _noalias const intel_list = buffers->intel_list(list);
   const int e_nall = nall_t;
 
   const int molecular = atom->molecular;
@@ -94,8 +143,10 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
   const tagint * _noalias const tag = atom->tag;
 
   int * _noalias const ilist = list->ilist;
-  int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  int ** _noalias const firstneigh = list->firstneigh;
+  int * _noalias const numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh();
+ 
   const int nstencil = this->nstencil;
   const int * _noalias const stencil = this->stencil;
   const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@@ -168,6 +219,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
     in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
     in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(intel_list:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     out(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(ilist:length(0) alloc_if(0) free_if(0)) \
@@ -178,7 +230,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
     in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \
     in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
-    in(pack_width,special_bound) \
+    in(pack_width,special_bound)                                        \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(tag)
@@ -212,7 +264,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
 
     #if defined(_OPENMP)
     #pragma omp parallel default(none) \
-      shared(numneigh, overflow, nstencilp, binstart, binend)
+      shared(overflow, nstencilp, binstart, binend)
     #endif
     {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -246,7 +298,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
       const int obound = maxnbors * 3;
       #endif
       int ct = (ifrom + tid * 2) * maxnbors;
-      int *neighptr = firstneigh + ct;
+      int *neighptr = intel_list + ct;
       int *neighptr2;
       if (THREE) neighptr2 = neighptr;
 
@@ -605,12 +657,24 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
           if (n > maxnbors) *overflow = 1;
 
         ilist[i] = i;
-        cnumneigh[i] = ct;
+        firstneigh[i] = intel_list + ct;
         if (THREE) {
+          numneigh[i] = ns;
+          cnumneigh[i] = ct;
           #ifdef LMP_INTEL_3BODY_FAST
           cnumneigh[i] += lane;
+          #else
+          // Pad anyways just in case we have hybrid with 2-body and newton off
+          int pad_end = ns;
+          IP_PRE_neighbor_pad(pad_end, offload);
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+                  avg=INTEL_COMPILE_WIDTH/2
+          #endif
+          for ( ; ns < pad_end; ns++)
+            neighptr[n++] = e_nall;
           #endif
-          numneigh[i] = ns;
         } else {
           numneigh[i] = n;
           int pad_end = n;
@@ -631,7 +695,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
           if (lane == pack_width) {
             ct += max_chunk * pack_width;
             IP_PRE_edge_align(ct, sizeof(int));
-            neighptr = firstneigh + ct;
+            neighptr = intel_list + ct;
             neighptr2 = neighptr;
             max_chunk = 0;
             lane = 0;
@@ -647,7 +711,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
         {
           ct += n;
           //IP_PRE_edge_align(ct, sizeof(int));
-          neighptr = firstneigh + ct;
+          neighptr = intel_list + ct;
           if (THREE) neighptr2 = neighptr;
           if (ct + obound > list_size) {
             if (i < ito - 1) {
@@ -667,7 +731,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
       int ghost_offset = 0, nall_offset = e_nall;
       if (separate_buffers) {
         for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh[i];
           int jnum = numneigh[i];
           if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
           #if __INTEL_COMPILER+0 > 1499
@@ -712,7 +776,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
         }
 
         for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
+          int * _noalias jlist = firstneigh[i];
           int jnum = numneigh[i];
           if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
           int jj = 0;
@@ -739,13 +803,12 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
   if (offload) {
     _fix->stop_watch(TIME_OFFLOAD_LATENCY);
     _fix->start_watch(TIME_HOST_NEIGHBOR);
+    firstneigh[0] = intel_list;
     for (int n = 0; n < aend; n++) {
       ilist[n] = n;
       numneigh[n] = 0;
     }
   } else {
-    for (int i = astart; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
     if (separate_buffers) {
       _fix->start_watch(TIME_PACK);
       _fix->set_neighbor_host_sizes();
@@ -756,11 +819,6 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
       _fix->stop_watch(TIME_PACK);
     }
   }
-  #else
-  #pragma vector aligned
-  #pragma simd
-  for (int i = astart; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
   #endif
 }
 
diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h
index 55a529b2cb53a044b20a3a0f21f7ce9480ca6e89..e47687abea10424a5e052afdfd6b81b949e0574e 100644
--- a/src/USER-INTEL/npair_intel.h
+++ b/src/USER-INTEL/npair_intel.h
@@ -75,7 +75,8 @@ class NPairIntel : public NPair {
  public:
   NPairIntel(class LAMMPS *);
   ~NPairIntel();
-
+  virtual void copy_neighbor_info();
+  
   #ifdef _LMP_INTEL_OFFLOAD
   void grow_stencil();
   #endif
@@ -83,6 +84,9 @@ class NPairIntel : public NPair {
  protected:
   FixIntel *_fix;
 
+  template <class flt_t, class acc_t>
+  void copy_cutsq_info(IntelBuffers<flt_t,acc_t> *);
+  
   template <class flt_t, class acc_t, int, int, int, int, int>
   void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
                   const int, const int, const int offload_end = 0);
diff --git a/src/USER-INTEL/npair_skip_intel.cpp b/src/USER-INTEL/npair_skip_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3d2463a0b978d883c66a4c364427668b1b7a1719
--- /dev/null
+++ b/src/USER-INTEL/npair_skip_intel.cpp
@@ -0,0 +1,230 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_skip_intel.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "modify.h"
+#include "molecule.h"
+#include "neigh_request.h"
+#include "domain.h"
+#include "my_page.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+NPairSkipIntel::NPairSkipIntel(LAMMPS *lmp) : NPair(lmp) {
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  _fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  _inum_starts = new int[comm->nthreads];
+  _inum_counts = new int[comm->nthreads];
+  _full_props = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+NPairSkipIntel::~NPairSkipIntel() {
+  delete []_inum_starts;
+  delete []_inum_counts;
+  if (_full_props) delete []_full_props;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairSkipIntel::copy_neighbor_info()
+{
+  NPair::copy_neighbor_info();
+  if (_full_props) delete []_full_props;
+  _full_props = new int[neighbor->nlist];
+  for (int i = 0; i < neighbor->nlist; i++)
+    _full_props[i] = neighbor->requests[i]->full;
+}
+
+/* ----------------------------------------------------------------------
+   build skip list for subset of types from parent list
+   works for half and full lists
+   works for owned (non-ghost) list, also for ghost list
+   iskip and ijskip flag which atom types and type pairs to skip
+   if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
+------------------------------------------------------------------------- */
+
+template<class flt_t, int THREE>
+void NPairSkipIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh,
+                             int *numhalf_skip)
+{
+  const int nlocal = atom->nlocal;
+  const int e_nall = nlocal + atom->nghost;
+  const int * _noalias const type = atom->type;
+  int * _noalias const ilist = list->ilist;
+  int * _noalias const numneigh = list->numneigh;
+  int ** _noalias const firstneigh = (int ** const)list->firstneigh;
+  const int * _noalias const ilist_skip = list->listskip->ilist;
+  const int * _noalias const numneigh_skip = list->listskip->numneigh;
+  const int ** _noalias const firstneigh_skip =
+    (const int ** const)list->listskip->firstneigh;
+  const int * _noalias const iskip = list->iskip;
+  const int ** _noalias const ijskip = (const int ** const)list->ijskip;
+
+  int num_skip = list->listskip->inum;
+  if (list->ghost) num_skip += list->listskip->gnum;
+
+  int packthreads;
+  if (comm->nthreads > INTEL_HTHREADS && THREE==0)
+    packthreads = comm->nthreads;
+  else
+    packthreads = 1;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel if(packthreads > 1)
+  #endif
+  {
+    int tid, ifrom, ito;
+    IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads);
+
+    // each thread has its own page allocator
+    MyPage<int> &ipage = list->ipage[tid];
+    ipage.reset();
+
+    int my_inum = ifrom;
+    _inum_starts[tid] = ifrom;
+    
+    // loop over parent full list
+    for (int ii = ifrom; ii < ito; ii++) {
+      const int i = ilist_skip[ii];
+      const int itype = type[i];
+      if (iskip[itype]) continue;
+
+      int n = 0;
+      int *neighptr = ipage.vget();
+
+      // loop over parent non-skip list
+
+      const int * _noalias const jlist = firstneigh_skip[i];
+      const int jnum = numneigh_skip[i];
+
+      if (THREE) {
+        const int jnumhalf = numhalf_skip[ii];
+        for (int jj = 0; jj < jnumhalf; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
+        }
+        numhalf[my_inum] = n; 
+
+        for (int jj = jnumhalf; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int joriginal = jlist[jj];
+          const int j = joriginal & NEIGHMASK;
+          if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
+        }
+      }
+
+      ilist[my_inum++] = i;
+      firstneigh[i] = neighptr;
+      numneigh[i] = n;
+
+      int pad_end = n;
+      IP_PRE_neighbor_pad(pad_end, 0);
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+              avg=INTEL_COMPILE_WIDTH/2
+      #endif
+      for ( ; n < pad_end; n++)
+        neighptr[n] = e_nall;
+
+      ipage.vgot(n);
+      if (ipage.status())
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+
+    int last_inum = 0, loop_end;
+    _inum_counts[tid] = my_inum;
+  }
+  int inum = _inum_counts[0];
+  for (int tid = 1; tid < packthreads; tid++) {
+    for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) {
+      if (THREE) numhalf[inum] = numhalf[i];
+      ilist[inum++] = ilist[i];
+    }
+  }
+  list->inum = inum;
+
+  if (THREE && num_skip > 0) {
+    int * const list_start = firstneigh[ilist[0]];
+    for (int ii = 0; ii < inum; ii++) {
+      int i = ilist[ii];
+      cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start);
+    }
+  }
+  if (list->ghost) {
+    int num = 0;
+    int my_inum = list->inum;
+    for (int i = 0; i < my_inum; i++)
+      if (ilist[i] < nlocal) num++;
+      else break;
+    list->inum = num;
+    list->gnum = my_inum - num;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void NPairSkipIntel::build(NeighList *list)
+{
+  if (_fix->three_body_neighbor()==0 ||
+      _full_props[list->listskip->index] == 0) {
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
+      build_t<float,0>(list, 0, 0, 0);
+    else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+      build_t<double,0>(list, 0, 0, 0);
+    else
+      build_t<float,0>(list, 0, 0, 0);
+  } else {
+    int *nhalf, *cnumneigh, *nhalf_skip, *u;
+    if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      _fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh);
+      build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
+    } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      _fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh);
+      build_t<double,1>(list, nhalf, cnumneigh, nhalf_skip);
+    } else {
+      _fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
+      _fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh);
+      build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
+    }
+  }
+}
diff --git a/src/USER-INTEL/npair_skip_intel.h b/src/USER-INTEL/npair_skip_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3277708d5c4188ebeef2197a18270ae561bacef
--- /dev/null
+++ b/src/USER-INTEL/npair_skip_intel.h
@@ -0,0 +1,69 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(skip/intel,
+           NPairSkipIntel,
+           NP_SKIP | NP_HALF | NP_FULL |
+           NP_NSQ | NP_BIN | NP_MULTI |
+           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)
+
+NPairStyle(skip/ghost/intel,
+           NPairSkipIntel,
+           NP_SKIP | NP_HALF | NP_FULL |
+           NP_NSQ | NP_BIN | NP_MULTI |
+           NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
+
+#else
+
+#ifndef LMP_NPAIR_SKIP_INTEL_H
+#define LMP_NPAIR_SKIP_INTEL_H
+
+#include "npair.h"
+#include "fix_intel.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+
+namespace LAMMPS_NS {
+
+class NPairSkipIntel : public NPair {
+ public:
+  NPairSkipIntel(class LAMMPS *);
+  ~NPairSkipIntel();
+  virtual void copy_neighbor_info();
+  void build(class NeighList *);
+  
+ protected:
+  FixIntel *_fix;
+  int *_inum_starts, *_inum_counts, *_full_props;
+
+  template<class flt_t, int THREE>
+  void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self explanatory.
+
+*/
diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp
index 601144a3c03ae6a14f8cb2cb697cf19d7cdb020e..198f8798fa29e35e00429022cc37566edd6e88cb 100644
--- a/src/USER-INTEL/pair_airebo_intel.cpp
+++ b/src/USER-INTEL/pair_airebo_intel.cpp
@@ -97,6 +97,13 @@ struct LAMMPS_NS::PairAIREBOIntelParam {
 
 namespace {
 
+struct NeighListLMPAIREBO {
+  int * num; /* num_all */
+  int * num_half; /* num_all */
+  int * offset; /* num_all */
+  int ** entries; /* num_all * num_neighs_per_atom */
+};
+
 struct NeighListAIREBO {
   int * num; /* num_all */
   int * num_half; /* num_all */
@@ -125,7 +132,7 @@ struct KernelArgsAIREBOT {
   int neigh_from_atom, neigh_to_atom;
   int rebuild_flag;
   flt_t skin;
-  struct NeighListAIREBO neigh_lmp;
+  struct NeighListLMPAIREBO neigh_lmp;
   struct NeighListAIREBO neigh_rebo;
   PairAIREBOIntelParam<flt_t,acc_t> params;
   struct AtomAIREBOT<flt_t> * x; /* num_all */
@@ -176,6 +183,11 @@ void PairAIREBOIntel::init_style()
   PairAIREBO::init_style();
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
+  const int nrequest = neighbor->nrequest;
+  for (int i = 0; i < nrequest; ++i)
+    if (neighbor->requests[i]->skip)
+      error->all(FLERR, "Cannot yet use airebo/intel with hybrid.");
+
   int ifix = modify->find_fix("package_intel");
   if (ifix < 0)
     error->all(FLERR,
@@ -399,8 +411,7 @@ void PairAIREBOIntel::eval(
   ATOM_T * _noalias const x = buffers->get_x(offload);
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
   tagint * const tag = atom->tag;
 
   const int ntypes = atom->ntypes + 1;
@@ -441,7 +452,6 @@ void PairAIREBOIntel::eval(
 
   #pragma offload target(mic:_cop) if(offload) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
@@ -472,10 +482,8 @@ void PairAIREBOIntel::eval(
                               f_stride, x, 0/*q*/);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EVFLAG)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -508,8 +516,7 @@ void PairAIREBOIntel::eval(
       args.skin = skin;
       args.neigh_lmp.num = const_cast<int*>(numneigh);
       args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
-      args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
-      args.neigh_lmp.entries = const_cast<int*>(firstneigh);
+      args.neigh_lmp.entries = const_cast<int**>(firstneigh);
       args.neigh_rebo.num = REBO_numneigh;
       args.neigh_rebo.num_half = REBO_num_skin;
       args.neigh_rebo.offset = REBO_cnumneigh;
@@ -543,18 +550,14 @@ void PairAIREBOIntel::eval(
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
     if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = oecoul;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -578,28 +581,20 @@ template<class flt_t, class acc_t>
 void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
   int tp1 = atom->ntypes + 1;
 
-  buffers->set_ntypes(tp1,1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
-  flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
-
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-        cut = cutghost[i][j] + neighbor->skin;
-        cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
-  flt_t * ocutneighsq = cutneighsq[0];
   size_t VL = 512 / 8 / sizeof(flt_t);
   int ntypes = tp1;
   int tp1sq = tp1 * tp1;
@@ -607,7 +602,6 @@ void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
   // it might not be freed if this method is called more than once
   int * map = this->map;
   #pragma offload_transfer target(mic:_cop) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
     in(map: length(tp1) alloc_if(1) free_if(0))
   #endif
 
@@ -1580,7 +1574,7 @@ void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
     ka->nC[i] = 0;
     ka->nH[i] = 0;
     for (int j = 0; j < ka->neigh_lmp.num[i]; j++) {
-      int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j];
+      int ji = ka->neigh_lmp.entries[i][j];
       flt_t delx = ka->x[i].x - ka->x[ji].x;
       flt_t dely = ka->x[i].y - ka->x[ji].y;
       flt_t delz = ka->x[i].z - ka->x[ji].z;
@@ -2168,7 +2162,7 @@ void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
                                    int morseflag) {
   AtomAIREBOT<flt_t> * x = ka->x;
   int jj;
-  int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+  int * neighs = ka->neigh_lmp.entries[i];
   int jnum = ka->neigh_lmp.num_half[i];
   for (jj = 0; jj < jnum; jj++) {
     int j = neighs[jj];
@@ -2332,7 +2326,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
     int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] :
       ka->neigh_rebo.num_half[i];
     int * neighs = ka->rebuild_flag ?
-      &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] :
+      ka->neigh_lmp.entries[i] :
       &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
     int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
     int n = 0;
@@ -4554,7 +4548,7 @@ static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
     fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
     fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
 
-    int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+    int * neighs = ka->neigh_lmp.entries[i];
     int jnum = ka->neigh_lmp.num_half[i];
 
     bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);
diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h
index 2d32925f68f6731acb516b22ea03d788fd0a1287..8a319536de119f6f02ac2a42b90f493e1c389f33 100644
--- a/src/USER-INTEL/pair_airebo_intel.h
+++ b/src/USER-INTEL/pair_airebo_intel.h
@@ -107,4 +107,10 @@ E: Cannot open AIREBO potential file %s
 The specified AIREBO potential file cannot be opened.  Check that the
 path and name are correct.
 
+E: Cannot yet use airebo/intel with hybrid.
+
+Pair style airebo/intel cannot currently be used as part of a hybrid
+pair style (with the exception of hybrid/overlay). 
+
+
 */
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index 39fa9014db45238da0f50c04eb171843610d2979..3954d559e1472b6fe808cb2fa28c683e28140a6f 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -144,8 +144,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
@@ -182,7 +181,6 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
@@ -204,8 +202,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)     
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -231,8 +231,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -361,13 +361,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -375,6 +372,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -439,19 +438,16 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -483,12 +479,10 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
   C_CUT_T * c_cut = fc.c_cut[0];
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
-    in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))   \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index fe4d408a1333be088bb49dc200d0292ba7c1ef7d..059413c3d94c1e1d73ba7eee56417f9f07e3554e 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -144,8 +144,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
@@ -206,7 +205,6 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
     in(rho_inv:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
@@ -230,8 +228,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -265,8 +265,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -437,13 +437,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -451,6 +448,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -523,19 +522,16 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -591,15 +587,13 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
     in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
-    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index 8ce3d121e0b6f0cfd18149bf111e27b403db4b51..0e0bd0f56fdf8977f70e38cf78ad8886de5da488 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -135,8 +135,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_lj = fc.special_lj;
   const C_FORCE_T * _noalias const c_force = fc.c_force[0];
@@ -167,7 +166,6 @@ void PairBuckIntel::eval(const int offload, const int vflag,
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0))      \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(ilist:length(0) alloc_if(0) free_if(0)) \
@@ -188,8 +186,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl =  (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -214,8 +214,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -324,13 +324,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = (acc_t)0;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -338,6 +334,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -396,19 +394,16 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -434,12 +429,10 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * special_lj = fc.special_lj;
   C_FORCE_T * c_force = fc.c_force[0];
   C_ENERGY_T * c_energy = fc.c_energy[0];
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj: length(4) alloc_if(0) free_if(0)) \
-    in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))   \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
index 7c33ed21d35cb130b4271e62cf9b98cfe77e2132..9baa64e8e05984afd58aa55115d6ca2c2b911dac 100644
--- a/src/USER-INTEL/pair_dpd_intel.cpp
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -173,8 +173,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
   const FC_PACKED1_T * _noalias const param = fc.param[0];
   const flt_t * _noalias const special_lj = fc.special_lj;
   int * _noalias const rngi_thread = fc.rngi;
@@ -204,8 +203,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -233,10 +234,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
 
       flt_t icut, a0, gamma, sigma;
       if (ONETYPE) {
-        icut = param[3].icut;
-        a0 = param[3].a0;
-        gamma = param[3].gamma;
-        sigma = param[3].sigma;
+        icut = param[_onetype].icut;
+        a0 = param[_onetype].a0;
+        gamma = param[_onetype].gamma;
+        sigma = param[_onetype].sigma;
       }
       for (int ii = iifrom; ii < iito; ii += iip) {
         const int i = ilist[ii];
@@ -248,8 +249,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
           parami = param + ptr_off;
         }
 
-        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
@@ -379,13 +380,9 @@ void PairDPDIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -393,6 +390,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -503,32 +502,26 @@ void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
                                     IntelBuffers<flt_t,acc_t> *buffers)
 {
   _onetype = 0;
-  if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
 
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
+  int mytypes = 0;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
-        cut = init_one(i,j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-        double icut = 1.0 / cut;
-        fc.param[i][j].icut = fc.param[j][i].icut = icut;
-      } else {
-        cut = init_one(i,j);
-        double icut = 1.0 / cut;
-        fc.param[i][j].icut = fc.param[j][i].icut = icut;
+        mytypes++;
+        _onetype = i * tp1 + j;
       }
+      double cut = init_one(i,j);
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
+      double icut = 1.0 / cut;
+      fc.param[i][j].icut = fc.param[j][i].icut = icut;
     }
   }
-
+  if (mytypes > 1 || atom->molecular) _onetype = 0;
+  
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
     fc.special_lj[0] = 1.0;
diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp
index ce9ede69d680684a6af9a896187d24fb425ed52b..95d0272d33361609d0372138811b407faa59f26e 100644
--- a/src/USER-INTEL/pair_eam_intel.cpp
+++ b/src/USER-INTEL/pair_eam_intel.cpp
@@ -193,8 +193,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
   const FC_PACKED1_T * _noalias const rhor_spline_f = fc.rhor_spline_f;
   const FC_PACKED1_T * _noalias const rhor_spline_e = fc.rhor_spline_e;
   const FC_PACKED2_T * _noalias const z2r_spline_t = fc.z2r_spline_t;
@@ -243,8 +242,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -293,8 +294,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           itype = x[i].w;
           rhor_ioff = istride * itype;
         }
-        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         const flt_t xtmp = x[i].x;
@@ -450,8 +451,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 
       if (tid == 0)
         comm->forward_comm_pair(this);
-      if (NEWTON_PAIR)
-        memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       #if defined(_OPENMP)
       #pragma omp barrier
@@ -469,8 +469,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           rhor_ioff = istride * itype;
           scale_fi = scale_f + itype*ntypes;
         }
-        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
@@ -597,11 +597,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      ev_global[0] = oevdwl;
-      ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
@@ -610,6 +606,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -682,19 +680,16 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
 
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,nr,nrho,memory,_cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i,j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -733,13 +728,13 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
           fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
           fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
           fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
-          fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
-          fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
-          fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
-          fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
-          fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
-          fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
-          fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
+          fc.z2r_spline_t[joff + k].a=z2r_spline[type2z2r[j][i]][k][0];
+          fc.z2r_spline_t[joff + k].b=z2r_spline[type2z2r[j][i]][k][1];
+          fc.z2r_spline_t[joff + k].c=z2r_spline[type2z2r[j][i]][k][2];
+          fc.z2r_spline_t[joff + k].d=z2r_spline[type2z2r[j][i]][k][3];
+          fc.z2r_spline_t[joff + k].e=z2r_spline[type2z2r[j][i]][k][4];
+          fc.z2r_spline_t[joff + k].f=z2r_spline[type2z2r[j][i]][k][5];
+          fc.z2r_spline_t[joff + k].g=z2r_spline[type2z2r[j][i]][k][6];
         }
       }
     }
@@ -754,7 +749,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                  const int nr, const int nrho,
                                                  Memory *memory,
                                                  const int cop) {
-  if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
+  if (ntypes != _ntypes || nr + 1 > _nr || nrho + 1 > _nrho) {
     if (_ntypes > 0) {
       _memory->destroy(rhor_spline_f);
       _memory->destroy(rhor_spline_e);
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index 30941a7594810f8bb7037ee162fc5a361a69c0fa..51524355d5cf8b56b48316a6aad86b109ccca2d9 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -228,8 +228,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
   const flt_t * _noalias const special_lj = fc.special_lj;
 
   const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
@@ -283,7 +282,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
     in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(quat:length(nall+1) alloc_if(0) free_if(0)) \
@@ -337,9 +335,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     #endif
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0.0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     if (NEWTON_PAIR == 0) f_start[1].w = 0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -372,8 +372,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
         const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
-        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         const flt_t xtmp = x[i].x;
@@ -833,13 +833,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         f_start[1].w = ierror;
     } // omp
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
         ov0 *= (acc_t)-0.5;
         ov1 *= (acc_t)-0.5;
         ov2 *= (acc_t)-0.5;
@@ -847,6 +843,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)-0.5;
         ov5 *= (acc_t)-0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -914,19 +912,16 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
   if (mthreads < buffers->get_off_threads())
     mthreads = buffers->get_off_threads();
   fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i,j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -963,14 +958,12 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
   FC_PACKED1_T *oijc = fc.ijc[0];
   FC_PACKED2_T *olj34 = fc.lj34[0];
   FC_PACKED3_T *oic = fc.ic;
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   if (oijc != NULL && oic != NULL) {
     #pragma offload_transfer target(mic:_cop) \
       in(special_lj: length(4) alloc_if(0) free_if(0)) \
       in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
-      in(oic: length(tp1) alloc_if(0) free_if(0)) \
-      in(ocutneighsq: length(tp1sq))
+      in(oic: length(tp1) alloc_if(0) free_if(0))
   }
   #endif
 }
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
index 8e6395c3885cecf42540f303ed278996b2656ed1..0b6ac3ffa56d45333631c5bd4e01c2e08d8fd7bf 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -138,8 +138,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
@@ -186,7 +185,6 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
@@ -212,8 +210,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -248,8 +248,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -403,16 +403,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
-      }
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
-      if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -420,6 +414,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -487,22 +483,19 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   if (cut_lj > cut_coul)
     error->all(FLERR,
          "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -540,12 +533,10 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * special_coul = fc.special_coul;
   flt_t * cutsq = fc.cutsq[0];
   LJ_T * lj = fc.lj[0];
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
-    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index a01a4688a582036b6a0498b460dfbcb3691f28ba..753a9afdd9e4ede4977f50021645abb5b58b6eb8 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -28,7 +28,7 @@
 #include "suffix.h"
 using namespace LAMMPS_NS;
 
-#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec2_t
 #define TABLE_T typename ForceConst<flt_t>::table_t
 
 /* ---------------------------------------------------------------------- */
@@ -142,8 +142,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
@@ -207,7 +206,6 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
@@ -232,8 +230,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -268,8 +268,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -376,8 +376,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
           if (rsq < cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
-            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
-            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
+            flt_t eps4 = lji[jtype].x;
+            flt_t sigp6 = lji[jtype].y;
+            flt_t lj4 = eps4 * sigp6;
+            flt_t lj3 = lj4 * sigp6;
+            flt_t lj2 = (flt_t)6.0 * lj4;
+            flt_t lj1 = (flt_t)12.0 * lj3;
+            forcelj = r6inv * (lj1 * r6inv - lj2);
+            if (EFLAG) evdwl = r6inv*(lj3 * r6inv - lj4);
 
             #ifdef INTEL_VMASK
             if (rsq > cut_lj_innersq) {
@@ -397,8 +403,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
                 }
                 #endif
               } else {
-                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
-                    lji[jtype].w);
+                const flt_t philj = r6inv * (lj3 * r6inv - lj4);
                 #ifndef INTEL_VMASK
                 if (rsq > cut_lj_innersq)
                 #endif
@@ -463,16 +468,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
-      }
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
-      if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -480,6 +479,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -552,22 +553,19 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   if (cut_lj > cut_coul)
     error->all(FLERR,
          "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -591,10 +589,13 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 
   for (int i = 1; i < tp1; i++) {
     for (int j = 1; j < tp1; j++) {
-      fc.lj[i][j].x = lj1[i][j];
-      fc.lj[i][j].y = lj2[i][j];
-      fc.lj[i][j].z = lj3[i][j];
-      fc.lj[i][j].w = lj4[i][j];
+      if (i <= j) {
+        fc.lj[i][j].x = epsilon[i][j] * 4.0;
+        fc.lj[i][j].y = pow(sigma[i][j],6.0);
+      } else {
+        fc.lj[i][j].x = epsilon[j][i] * 4.0;
+        fc.lj[i][j].y = pow(sigma[j][i],6.0);
+      }
       fc.cutsq[i][j] = cutsq[i][j];
     }
   }
@@ -623,14 +624,12 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
-    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
   #endif
 }
 
@@ -647,7 +646,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
-      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
@@ -687,7 +686,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       flt_t * ospecial_lj = special_lj;
       flt_t * ospecial_coul = special_coul;
       flt_t * ocutsq = cutsq[0];
-      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
       table_t * otable = table;
       flt_t * oetable = etable;
       flt_t * odetable = detable;
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
index 1b13d784971667c7f68b48afbe2e057013e0cb5e..006674507889077820cd843a20b1eedfc8da08e8 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -69,7 +69,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
     flt_t cut_lj_innersq;
     table_t *table;
     flt_t *etable, *detable, *ctable, *dctable;
-    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
+    typename IntelBuffers<flt_t,flt_t>::vec2_t **lj;
 
     ForceConst() : _ntypes(0), _ntable(0) {}
     ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index ab0b5b3d55e65d75e0fb2acd076182931990e151..35ed9061ce0be04d05f9c522e46fd9cdcd01b30e 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -141,8 +141,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
 
   const flt_t * _noalias const special_coul = fc.special_coul;
   const flt_t * _noalias const special_lj = fc.special_lj;
@@ -201,7 +200,6 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
     in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
@@ -225,8 +223,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
                               f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -260,8 +260,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int   * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
@@ -433,16 +433,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
         oevdwl *= (acc_t)0.5;
         oecoul *= (acc_t)0.5;
-      }
-      ev_global[0] = oevdwl;
-      ev_global[1] = oecoul;
-    }
-    if (vflag) {
-      if (NEWTON_PAIR == 0) {
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -450,6 +444,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -522,19 +518,16 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
 
   fc.set_ntypes(tp1, ntable, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -587,14 +580,12 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * detable = fc.detable;
   flt_t * ctable = fc.ctable;
   flt_t * dctable = fc.dctable;
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   #pragma offload_transfer target(mic:_cop) \
     in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
     in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
     in(table: length(ntable) alloc_if(0) free_if(0)) \
-    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index c973639709e079e2f731f8eb903282ff8bcb8275..94133a7f4771e349e61e365aca9f0fce1670e4b4 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -150,8 +150,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int ** _noalias const firstneigh = (const int **)list->firstneigh;
   const flt_t * _noalias const special_lj = fc.special_lj;
   const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
   const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
@@ -180,8 +179,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (NEWTON_PAIR == 0 && inum != nlocal)
+      memset(f_start, 0, f_stride * sizeof(FORCE_T));
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
@@ -201,12 +202,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 
       flt_t cutsq, lj1, lj2, lj3, lj4, offset;
       if (ONETYPE) {
-        cutsq = ljc12o[3].cutsq;
-        lj1 = ljc12o[3].lj1;
-        lj2 = ljc12o[3].lj2;
-        lj3 = lj34[3].lj3;
-        lj4 = lj34[3].lj4;
-        offset = ljc12o[3].offset;
+        cutsq = ljc12o[_onetype].cutsq;
+        lj1 = ljc12o[_onetype].lj1;
+        lj2 = ljc12o[_onetype].lj2;
+        lj3 = lj34[_onetype].lj3;
+        lj4 = lj34[_onetype].lj4;
+        offset = ljc12o[_onetype].offset;
       }
       for (int ii = iifrom; ii < iito; ii += iip) {
         const int i = ilist[ii];
@@ -219,8 +220,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
           ljc12oi = ljc12o + ptr_off;
           lj34i = lj34 + ptr_off;
         }
-        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        int jnum = numneigh[ii];
+        const int * _noalias const jlist = firstneigh[i];
+        int jnum = numneigh[i];
         IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
@@ -338,13 +339,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
-      ev_global[0] = oevdwl;
-      ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
+    if (EFLAG || vflag) {
       if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
         ov0 *= (acc_t)0.5;
         ov1 *= (acc_t)0.5;
         ov2 *= (acc_t)0.5;
@@ -352,6 +349,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         ov4 *= (acc_t)0.5;
         ov5 *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -414,25 +413,26 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                       IntelBuffers<flt_t,acc_t> *buffers)
 {
   _onetype = 0;
-  if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
 
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
+  int mytypes = 0;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
+      double cut;
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
         cut = init_one(i,j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+        mytypes++;
+        _onetype = i * tp1 + j;
+      } else {
+        cut = 0.0;
       }
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
+  if (mytypes > 1 || atom->molecular) _onetype = 0;
 
   for (int i = 0; i < 4; i++) {
     fc.special_lj[i] = force->special_lj[i];
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index aff8ba88a74971f7ee1e4e7f824334af0f056330..9e00516087cb3711f43d37927b6c98448bbc5b35 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -130,37 +130,37 @@ void PairSWIntel::compute(int eflag, int vflag,
   if (_onetype) {
     if (_spq) {
       if (eflag) {
-        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (eflag) {
-        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
     if (_spq) {
       if (eflag) {
-        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (eflag) {
-        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
@@ -173,7 +173,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
-                       const int aend, const int /*pad_width*/)
+                       const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -185,10 +185,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   const int * _noalias const ilist = list->ilist;
-  const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
+
+  int *nhalf, *cnum;
+  buffers->get_list_data3(list, nhalf, cnum);
+  const int * _noalias const numneighhalf = nhalf;
+  const int * _noalias const cnumneigh = cnum;
 
   const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
   const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@@ -206,6 +209,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
+  const int onetype = _onetype;
+  const int onetype3 = _onetype3;
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
@@ -235,8 +240,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
-    in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
-    in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
+    in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload,onetype3) \
+    in(astart,nlocal,f_stride,minlocal,separate_flag,onetype) \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -251,8 +256,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@@ -278,24 +283,24 @@ void PairSWIntel::eval(const int offload, const int vflag,
       flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
       flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       if (ONETYPE) {
-        cutsq = p2[3].cutsq;
-        cut = p2f[3].cut;
-        sigma = p2f[3].sigma;
-        c1 = p2f2[3].c1;
-        c2 = p2f2[3].c2;
-        c3 = p2f2[3].c3;
-        c4 = p2f2[3].c4;
-        sigma_gamma = p2[3].sigma_gamma;
-        costheta = p3[7].costheta;
-        lambda_epsilon = p3[7].lambda_epsilon;
-        lambda_epsilon2 = p3[7].lambda_epsilon2;
+        cutsq = p2[onetype].cutsq;
+        cut = p2f[onetype].cut;
+        sigma = p2f[onetype].sigma;
+        c1 = p2f2[onetype].c1;
+        c2 = p2f2[onetype].c2;
+        c3 = p2f2[onetype].c3;
+        c4 = p2f2[onetype].c4;
+        sigma_gamma = p2[onetype].sigma_gamma;
+        costheta = p3[onetype3].costheta;
+        lambda_epsilon = p3[onetype3].lambda_epsilon;
+        lambda_epsilon2 = p3[onetype3].lambda_epsilon2;
         if (SPQ == 0) {
-          powerp = p2f[3].powerp;
-          powerq = p2f[3].powerq;
+          powerp = p2f[onetype].powerp;
+          powerq = p2f[onetype].powerq;
         }
         if (EFLAG) {
-          c5 = p2e[3].c5;
-          c6 = p2e[3].c6;
+          c5 = p2e[onetype].c5;
+          c6 = p2e[onetype].c6;
         }
       }
 
@@ -312,7 +317,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
         }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[ii];
-        const int jnum = numneigh[ii];
+        const int jnum = numneigh[i];
         const int jnumhalf = numneighhalf[ii];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
@@ -541,11 +546,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -585,7 +588,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
-                       const int aend, const int pad_width)
+                       const int aend)
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
   typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
@@ -601,10 +604,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
   const int * _noalias const ilist = list->ilist;
-  const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
+
+  int *nhalf, *cnum;
+  buffers->get_list_data3(list, nhalf, cnum);
+  const int * _noalias const numneighhalf = nhalf;
+  const int * _noalias const cnumneigh = cnum;
 
   const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
   const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@@ -654,7 +660,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
-    in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
+    in(astart,nlocal,f_stride,minlocal,separate_flag) \
     in(ccache_stride3)                                          \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
@@ -670,8 +676,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
     #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@@ -701,34 +707,38 @@ void PairSWIntel::eval(const int offload, const int vflag,
       SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6;
       SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       if (ONETYPE) {
-        cutsq = SIMD_set(p2[3].cutsq);
-        cut = SIMD_set(p2f[3].cut);
-        sigma = SIMD_set(p2f[3].sigma);
-        c1 = SIMD_set(p2f2[3].c1);
-        c2 = SIMD_set(p2f2[3].c2);
-        c3 = SIMD_set(p2f2[3].c3);
-        c4 = SIMD_set(p2f2[3].c4);
-        sigma_gamma = SIMD_set(p2[3].sigma_gamma);
-        costheta = SIMD_set(p3[7].costheta);
-        lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
-        lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
+        cutsq = SIMD_set(p2[_onetype].cutsq);
+        cut = SIMD_set(p2f[_onetype].cut);
+        sigma = SIMD_set(p2f[_onetype].sigma);
+        c1 = SIMD_set(p2f2[_onetype].c1);
+        c2 = SIMD_set(p2f2[_onetype].c2);
+        c3 = SIMD_set(p2f2[_onetype].c3);
+        c4 = SIMD_set(p2f2[_onetype].c4);
+        sigma_gamma = SIMD_set(p2[_onetype].sigma_gamma);
+        costheta = SIMD_set(p3[_onetype3].costheta);
+        lambda_epsilon = SIMD_set(p3[_onetype3].lambda_epsilon);
+        lambda_epsilon2 = SIMD_set(p3[_onetype3].lambda_epsilon2);
         if (SPQ == 0) {
-          powerp = SIMD_set(p2f[3].powerp);
-          powerq = SIMD_set(p2f[3].powerq);
+          powerp = SIMD_set(p2f[_onetype].powerp);
+          powerq = SIMD_set(p2f[_onetype].powerq);
         }
         if (EFLAG) {
-          c5 = SIMD_set(p2e[3].c5);
-          c6 = SIMD_set(p2e[3].c6);
+          c5 = SIMD_set(p2e[_onetype].c5);
+          c6 = SIMD_set(p2e[_onetype].c6);
         }
       }
 
-      const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
-                                        144,160,176,192,208,224,240);
       acc_t * const dforce = &(f[0].x);
       for (int i = iifrom; i < iito; i += iip) {
         SIMD_int ilistv = SIMD_load(ilist + i);
         SIMD_int goffset = ilistv * 16;
-        SIMD_mask imask = ilistv < iito;
+        SIMD_mask imask;
+        if (swidth == 16)
+          imask = 0xFFFF;
+        else
+          imask = 0xFF;
+        const int rem = iito - i;
+        if (rem < swidth) imask = imask >> (swidth - rem);
         SIMD_flt_t xtmp, ytmp, ztmp;
         SIMD_int itype, itype_offset;
 
@@ -741,11 +751,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
         #ifdef LMP_INTEL_3BODY_FAST
         const int* ng = firstneigh + cnumneigh[i] - swidth;
+        const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
         #else
         SIMD_int ng = SIMD_load(cnumneigh + i);
+        const SIMD_int jnum = SIMD_gatherz(imask, numneigh, ilistv);
         ng = ng - 1;
         #endif
-        const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
         const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
         const int jnum_max = SIMD_max(jnum);
 
@@ -1051,11 +1062,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0.0;
-    }
-    if (vflag) {
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -1162,26 +1171,22 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
 
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i,j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
   _onetype = 0;
-  if (atom->ntypes == 1) _onetype = 1;
-
   _spq = 1;
+  int mytypes = 0;
   for (int ii = 0; ii < tp1; ii++) {
     int i = map[ii];
     for (int jj = 0; jj < tp1; jj++) {
@@ -1231,6 +1236,9 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
           fc.p3[ii][jj][kk].lambda_epsilon = 0;
           fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
         } else {
+          mytypes++;
+          _onetype = ii * tp1 + jj;
+          _onetype3 = ii * tp1 * tp1 + jj * tp1 + kk;
           int ijkparam = elem2param[i][j][k];
           fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
           fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
@@ -1239,12 +1247,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
       }
     }
   }
-
-  _host_pad = 1;
-  _offload_pad = 1;
-
-  if (INTEL_NBOR_PAD > 1)
-    _host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  if (mytypes > 1) _onetype = 0;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop < 0) return;
@@ -1253,16 +1256,11 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
   FC_PACKED1p2_T *op2f2 = fc.p2f2[0];
   FC_PACKED2_T *op2e = fc.p2e[0];
   FC_PACKED3_T *op3 = fc.p3[0][0];
-  flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   int tp1cu = tp1sq * tp1;
-  if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
-      op3 != NULL && ocutneighsq != NULL) {
-    #pragma offload_transfer target(mic:_cop) \
-      in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))     \
-      in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
-      in(ocutneighsq: length(tp1sq))
-  }
+  #pragma offload_transfer target(mic:_cop) \
+    in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))     \
+    in(op3: length(tp1cu) alloc_if(0) free_if(0))
   #endif
 }
 
diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h
index ffcf9a6fb6b69cbfe9747d4b19752a546006ed7d..36e39e3e853d39a9c6a6ce6e4602b34358283471 100644
--- a/src/USER-INTEL/pair_sw_intel.h
+++ b/src/USER-INTEL/pair_sw_intel.h
@@ -49,13 +49,13 @@ class PairSWIntel : public PairSW {
   template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
-            const int astart, const int aend, const int pad_width);
+            const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
 
-  int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
+  int _ccache_stride, _spq, _onetype, _onetype3;
   #ifdef LMP_USE_AVXCD
   int _ccache_stride3;
   #endif
@@ -75,7 +75,7 @@ class PairSWIntel : public PairSW {
       flt_t c1, c2, c3, c4;
     } fc_packed1p2;
     typedef struct {
-      flt_t c5, c6;
+      flt_t c5, c6, d1, d2;
     } fc_packed2;
     typedef struct {
       flt_t costheta, lambda_epsilon, lambda_epsilon2, pad;
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index 584b3717842cfcd3b2219fd7419040a497582bb0..5d4c5f2cb677f3a5ec5e2d5ce88e6f4e83e4daa7 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -220,7 +220,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   static void kernel_step(
       int eatom, int vflag,
       const int * _noalias const numneigh,
-      const int * _noalias const cnumneigh,
+      iarr vcnumneigh,
       const int * _noalias const firstneigh,
       int ntypes,
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
@@ -241,7 +241,8 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
     const c_inner_t * _noalias const c_inner,
     const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
+    avec *vsevdwl, int compress_idx, int ii, int i, iarr js,
+    bvec vmask_repulsive
   );
 };
 
@@ -274,9 +275,12 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
 
   const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
-  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int * _noalias const numneighhalf = buffers->get_atombin();
-  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
+
+  int *nhalf, *cnum;
+  buffers->get_list_data3(list, nhalf, cnum);
+  const int * _noalias const numneighhalf = nhalf;
+  const int * _noalias const cnumneigh = cnum;
 
   typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
   typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
@@ -332,13 +336,13 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
     IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
                               f_stride, x, 0);
 
-    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EFLAG) oevdwl = oecoul = (acc_t)0;
-    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG || vflag)
+      oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
@@ -379,11 +383,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                         ov0, ov1, ov2, ov3, ov4, ov5);
 
-    if (EFLAG) {
+    if (EFLAG || vflag) {
       ev_global[0] = oevdwl;
       ev_global[1] = 0.0;
-    }
-    if (vflag) {
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -461,19 +463,16 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
   int tp1 = atom->ntypes + 1;
 
   fc.set_ntypes(tp1, memory, _cop);
-  buffers->set_ntypes(tp1);
-  flt_t **cutneighsq = buffers->get_cutneighsq();
 
   // Repeat cutsq calculation because done after call to init_style
-  double cut, cutneigh;
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
-      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+      double cut;
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
         cut = init_one(i, j);
-        cutneigh = cut + neighbor->skin;
-        cutsq[i][j] = cutsq[j][i] = cut*cut;
-        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
-      }
+      else
+        cut = 0.0;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
     }
   }
 
@@ -547,7 +546,6 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
   typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
   typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
   typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
-  flt_t * ocutneighsq = cutneighsq[0];
   size_t VL = 512 / 8 / sizeof(flt_t);
   int ntypes = tp1;
   int ntypes_pad = ntypes + VL - ntypes % VL;
@@ -557,8 +555,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
   #pragma offload_transfer target(mic:_cop) \
     in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
     in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
-    in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
-    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+    in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0))
   #endif
 }
 
@@ -647,7 +644,7 @@ template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<int EFLAG>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     int eatom, int vflag,
-    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
+    const int * _noalias const numneigh, iarr vcnumneigh,
     const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
     const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
@@ -681,6 +678,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   // TDO: We could extract this from the driver routine
   ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
   ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
+  ivec vcnumneigh_i = v::int_load_vl(vcnumneigh);
   bvec vmask = v::mask_enable_lower(compress_idx);
   fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
   ivec vw_i = v_i0;
@@ -692,7 +690,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   fvec vrijsq = vdx_ij * vdx_ij + vdy_ij *  vdy_ij + vdz_ij * vdz_ij;
   fvec vrij = sqrt(vrijsq);
   ivec vis_orig = v::int_load_vl(is);
-  ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
   ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
   ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
 
@@ -930,6 +927,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl,
     int compress_idx,
+    int ii,
     int i,
     iarr js,
     bvec vmask_repulsive
@@ -970,7 +968,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   fvec vrijsq = vdx_ij * vdx_ij + vdy_ij *  vdy_ij + vdz_ij * vdz_ij;
   fvec vrij = sqrt(vrijsq);
 
-  int cnumneigh_i = cnumneigh[i];
+  int cnumneigh_i = cnumneigh[ii];
   int numneigh_i = numneigh[i];
   ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
   ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
@@ -1147,7 +1145,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
 ) {
   int compress_idx = 0;
   int ii, jj;
-  iarr is, js;
+  iarr is, js, vc;
   avec vsevdwl = v::acc_zero();
   ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
   ivec vj, v_NEIGHMASK(NEIGHMASK);
@@ -1166,7 +1164,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     flt_t y_i = x[i].y;
     flt_t z_i = x[i].z;
     int jlist_off_i = cnumneigh[ii];
-    int jnum = numneigh[ii];
+    int jnum = numneigh[i];
     for (jj = 0; jj < jnum; jj++) {
       int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
       int w_j = x[j].w;
@@ -1178,6 +1176,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
       if (rsq < cutsq) {
         is[compress_idx] = ii;
         js[compress_idx] = j;
+        vc[compress_idx] = jlist_off_i;
         if (jj < numneighhalf[ii])
           repulsive_flag[compress_idx] = 1;
         compress_idx += 1;
@@ -1187,7 +1186,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
           vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
           kernel_step<EFLAG>(
               eatom, vflag,
-              numneigh, cnumneigh, firstneigh, ntypes,
+              numneigh, vc, firstneigh, ntypes,
               x, c_inner, c_outer, f,
               &vsevdwl, compress_idx,
               is, js, vmask_repulsive
@@ -1203,7 +1202,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
               numneigh, cnumneigh, firstneigh, ntypes,
               x, c_inner, c_outer, f,
               &vsevdwl, compress_idx,
-              i, js, vmask_repulsive
+              ii, i, js, vmask_repulsive
           );
           compress_idx = 0;
           v::int_clear_arr(repulsive_flag);
@@ -1215,7 +1214,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
         IntelKernelTersoff::kernel_step<EFLAG>(
             eatom, vflag,
-            numneigh, cnumneigh, firstneigh, ntypes,
+            numneigh, vc, firstneigh, ntypes,
             x, c_inner, c_outer, f,
             &vsevdwl, compress_idx,
             is, js, vmask_repulsive
diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp
index e5540e637751bc758a347bcf00ee71aaf1c91e86..537a573f23598f30e7d570ce8542082d08dff4ac 100644
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@@ -581,6 +581,12 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
   else
     nthr = comm->nthreads;
 
+  if (fix->need_zero(0)) {
+    int zl = nlocal;
+    if (force->newton_pair) zl += atom->nghost;
+    memset(f, 0, zl * sizeof(FORCE_T));
+  }
+  
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(nlocal, nthr) if(!_use_lrt)
@@ -726,6 +732,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
   FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
   FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
 
+  if (fix->need_zero(0)) {
+    int zl = nlocal;
+    if (force->newton_pair) zl += atom->nghost;
+    memset(f, 0, zl * sizeof(FORCE_T));
+  }
+  
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(nlocal, nthr) if(!_use_lrt)
diff --git a/src/memory.cpp b/src/memory.cpp
index 7a23a230791c51f078b1195e7867dfb2d91c4739..971de3dce6cca6771d2cba400ef922eec2e6557d 100644
--- a/src/memory.cpp
+++ b/src/memory.cpp
@@ -79,7 +79,9 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name)
 
 #if defined(LMP_USE_TBB_ALLOCATOR)
   ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN);
-#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN)
+#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \
+      defined(__INTEL_COMPILER)
+
   ptr = realloc(ptr, nbytes);
   uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;
   if (offset) {
diff --git a/src/pair.h b/src/pair.h
index 844bc0cdc75c12983d98da16273b70c1257044a8..c1a9e6eef8df3c181a2284f0770ea602266a48cb 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -207,6 +207,10 @@ class Pair : protected Pointers {
 
   typedef union {int i; float f;} union_int_float_t;
 
+  // Accessor for the user-intel package to determine virial calc for hybrid
+
+  inline int fdotr_is_set() const { return vflag_fdotr; }
+
  protected:
   int vflag_fdotr;
   int maxeatom,maxvatom;