From 2fe0eabc0988feda90ac85ef1331dcd83ad028ac Mon Sep 17 00:00:00 2001
From: Michael Brown <michael.w.brown@intel.com>
Date: Fri, 22 Jun 2018 01:52:55 -0700
Subject: [PATCH] Intel Package: Bug fix when using LRT with minimize. Bug fix
 for virial calculation when using GNU compilers. Most of framework for adding
 hybrid support is now in place.

---
 src/USER-INTEL/fix_intel.h                    |   3 +-
 src/USER-INTEL/intel_preprocess.h             |  53 +++-
 src/USER-INTEL/npair_full_bin_ghost_intel.cpp | 195 ++++++------
 src/USER-INTEL/npair_intel.cpp                | 287 ++++++++++--------
 src/USER-INTEL/pair_buck_coul_cut_intel.cpp   |  10 +-
 src/USER-INTEL/pair_buck_coul_long_intel.cpp  |  10 +-
 src/USER-INTEL/pair_buck_intel.cpp            |  10 +-
 src/USER-INTEL/pair_dpd_intel.cpp             |   9 +-
 src/USER-INTEL/pair_eam_intel.cpp             |  29 +-
 src/USER-INTEL/pair_gayberne_intel.cpp        |  28 +-
 .../pair_lj_charmm_coul_charmm_intel.cpp      |  11 +-
 .../pair_lj_charmm_coul_long_intel.cpp        |  11 +-
 .../pair_lj_cut_coul_long_intel.cpp           |  10 +-
 src/USER-INTEL/pair_lj_cut_intel.cpp          |  10 +-
 src/USER-INTEL/pair_sw_intel.cpp              |  44 +--
 src/USER-INTEL/pair_tersoff_intel.cpp         |  15 +-
 src/USER-INTEL/verlet_lrt_intel.cpp           |   2 +-
 src/USER-INTEL/verlet_lrt_intel.h             |  22 +-
 18 files changed, 450 insertions(+), 309 deletions(-)

diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index d7093e79bb..79fb23ab3d 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -82,7 +82,8 @@ class FixIntel : public Fix {
   }
   inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
   inline int lrt() {
-    if (force->kspace_match("pppm/intel", 0)) return _lrt;
+    if (force->kspace_match("pppm/intel", 0) && update->whichflag == 1) 
+      return _lrt;
     else return 0;
   }
   inline int pppm_table() {
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 29cc45f755..2f1b02a49f 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -134,7 +134,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_HTHREADS 2
 #endif
 
-#define IP_PRE_get_stride(stride, n, datasize, torque)  \
+#if INTEL_DATA_ALIGN > 1
+
+#define IP_PRE_edge_align(n, esize)                                     \
+  {                                                                     \
+    const int pad_mask = ~static_cast<int>(INTEL_DATA_ALIGN/esize-1);   \
+    n = (n + INTEL_DATA_ALIGN / esize - 1) & pad_mask;                  \
+  }
+
+#else
+  
+#define IP_PRE_edge_align(n, esize)                                     \
+
+#endif
+
+#define IP_PRE_get_stride(stride, n, datasize, torque)          \
   {                                                             \
     int blength = n;                                            \
     if (torque) blength *= 2;                                   \
@@ -303,7 +317,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
   {                                                             \
     tid = 0;                                                    \
     ifrom = 0;                                                  \
-    ip = 1;                                                     \
+    ip = vecsize;                                               \
     ito = inum;                                                 \
   }
 
@@ -316,7 +330,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
   acc_t *f_scalar = &f_start[0].x;                                      \
   flt_t *x_scalar = &pos[minlocal].x;                                   \
   int f_stride4 = f_stride * 4;                                         \
-  _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);                         \
+  _alignvar(acc_t ovv[16],64);                                          \
   int vwidth;                                                           \
   if (sizeof(acc_t) == sizeof(double))                                  \
     vwidth = INTEL_COMPILE_WIDTH/2;                                     \
@@ -516,6 +530,22 @@ inline double MIC_Wtime() {
   return time;
 }
 
+#define IP_PRE_neighbor_pad(jnum, offload)                              \
+{                                                                       \
+  const int opad_mask = ~static_cast<int>(INTEL_MIC_NBOR_PAD *          \
+                                          sizeof(float) /               \
+                                          sizeof(flt_t) - 1);           \
+  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
+                                          sizeof(float) /               \
+                                          sizeof(flt_t) - 1);           \
+  if (offload && INTEL_MIC_NBOR_PAD > 1)                                \
+    jnum = (jnum + INTEL_MIC_NBOR_PAD * sizeof(float) /                 \
+            sizeof(flt_t) - 1) & opad_mask;                             \
+  else if (INTEL_NBOR_PAD > 1)                                          \
+    jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) /                     \
+            sizeof(flt_t) - 1) & pad_mask;                              \
+}
+
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)                      \
 {                                                                       \
@@ -644,6 +674,23 @@ inline double MIC_Wtime() {
 
 #else
 
+#if INTEL_NBOR_PAD > 1
+
+#define IP_PRE_neighbor_pad(jnum, offload)                              \
+{                                                                       \
+  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
+                                         sizeof(float) /                \
+                                         sizeof(flt_t) - 1);            \
+  jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) /                       \
+          sizeof(flt_t) - 1) & pad_mask;                                \
+}
+
+#else
+
+#define IP_PRE_neighbor_pad(jnum, offload)
+
+#endif
+
 #define MIC_Wtime MPI_Wtime
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
index 24d5076c34..ae961e84b5 100644
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -112,7 +112,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
   if (pend-pstart == 0) return;
 
   const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
   int nall_t = nall;
   const int aend = nall;
 
@@ -207,6 +206,17 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
   const int mbinz = this->mbinz;
   const int * const stencilxyz = &this->stencilxyz[0][0];
 
+  int sb = 1;
+  if (special_flag[1] == 0) {
+    sb = 2;
+    if (special_flag[2] == 0) {
+      sb = 3;
+      if (special_flag[3] == 0)
+        sb = 4;
+    }
+  }
+  const int special_bound = sb;
+
   #ifdef _LMP_INTEL_OFFLOAD
   const int * _noalias const binhead = this->binhead;
   const int * _noalias const bins = this->bins;
@@ -230,7 +240,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
     in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
     in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
-    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
+    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\
     in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
     in(stencilxyz:length(3*nstencil)) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
@@ -287,8 +297,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
       int e_ito = ito;
       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
 
-      int which;
-
       int pack_offset = maxnbors;
       int ct = (ifrom + tid * 2) * maxnbors;
       int *neighptr = firstneigh + ct;
@@ -418,41 +426,109 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
           }
         } // for u
 
+        if (molecular && i < nlocal) {
+          int alln = n;
+          n = 0;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #ifdef LMP_INTEL_NBOR_COMPAT
+          #pragma ivdep
+          #else
+          #pragma simd
+          #endif
+          #endif
+          for (int u = 0; u < alln; u++) {
+            int which;
+            int addme = 1;
+            int j = neighptr[u];
+            if (need_ic && j < 0) {
+              which = 0;
+              j = -j - 1;
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            if (which) {
+              j = j ^ (which << SBBITS);
+              if (which < special_bound) addme = 0;
+            }
+            #ifdef LMP_INTEL_NBOR_COMPAT
+            if (addme) neighptr2[n++] = j;
+            #else
+            neighptr2[n++] = j;
+            #endif
+          }
+          alln = n2;
+          n2 = maxnbors * 2;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #ifdef LMP_INTEL_NBOR_COMPAT
+          #pragma ivdep
+          #else
+          #pragma simd
+          #endif
+          #endif
+          for (int u = n2; u < alln; u++) {
+            int which;
+            int addme = 1;
+            int j = neighptr[u];
+            if (need_ic && j < 0) {
+              which = 0;
+              j = -j - 1;
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            if (which) {
+              j = j ^ (which << SBBITS);
+              if (which < special_bound) addme = 0;
+            }
+            #ifdef LMP_INTEL_NBOR_COMPAT
+            if (addme) neighptr2[n2++] = j;
+            #else
+            neighptr2[n2++] = j;
+            #endif
+          }
+        }
+       
         #ifndef _LMP_INTEL_OFFLOAD
         if (exclude) {
           int alln = n;
           n = maxnbors;
-          for (int u = pack_offset; u < alln; u++) {
-            const int j = neighptr[u];
-            int pj = j;
-            if (need_ic)
-              if (pj < 0) pj = -j - 1;
-            const int jtype = x[pj].w;
-            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-            neighptr[n++] = j;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma ivdep
+          #endif
+          for (int u = n; u < alln; u++) {
+            int addme = 1;
+            const int js = neighptr[u];
+            const int j = js & NEIGHMASK;
+            const int jtype = x[j].w;
+            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
+            if (addme) neighptr2[n++] = js;
           }
           alln = n2;
           n2 = maxnbors * 2;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma ivdep
+          #endif
           for (int u = n2; u < alln; u++) {
-            const int j = neighptr[u];
-            int pj = j;
-            if (need_ic)
-              if (pj < 0) pj = -j - 1;
-            const int jtype = x[pj].w;
-            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-            neighptr[n2++] = j;
+            int addme = 1;
+            const int js = neighptr[u];
+            const int j = js & NEIGHMASK;
+            const int jtype = x[j].w;
+            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
+            if (addme) neighptr2[n2++] = js;
           }
         }
         #endif
+        
         int ns = n - maxnbors;
         int alln = n;
         atombin[i] = ns;
         n = 0;
         for (int u = maxnbors; u < alln; u++)
-          neighptr[n++] = neighptr[u];
+          neighptr[n++] = neighptr2[u];
         ns += n2 - maxnbors * 2;
         for (int u = maxnbors * 2; u < n2; u++)
-          neighptr[n++] = neighptr[u];
+          neighptr[n++] = neighptr2[u];
         if (ns > maxnbors) *overflow = 1;
 
         ilist[i] = i;
@@ -460,9 +536,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
         numneigh[i] = ns;
 
         ct += ns;
-        const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-        const int edge = ct & (alignb - 1);
-        if (edge) ct += alignb - edge;
+        IP_PRE_edge_align(ct, sizeof(int));
         neighptr = firstneigh + ct;
         if (ct + obound > list_size) {
           if (i < ito - 1) {
@@ -477,84 +551,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
           numneigh[i] = 0;
 
       #ifdef _LMP_INTEL_OFFLOAD
-      int ghost_offset = 0, nall_offset = e_nall;
       if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          #if __INTEL_COMPILER+0 > 1499
-          #pragma vector aligned
-          #pragma simd
-          #endif
-          for (int jj = 0; jj < jnum; jj++) {
-            int j = jlist[jj];
-            if (need_ic && j < 0) j = -j - 1;
-          }
-        }
-
         overflow[LMP_LOCAL_MIN] = 0;
         overflow[LMP_LOCAL_MAX] = nlocal - 1;
         overflow[LMP_GHOST_MIN] = nlocal;
         overflow[LMP_GHOST_MAX] = e_nall - 1;
-
-        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
-        if (nghost < 0) nghost = 0;
-        if (offload) {
-          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-        } else {
-          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-          nall_offset = nlocal + nghost;
-        }
-      } // if separate_buffers
-      #endif
-
-      if (molecular) {
-        int ito_m = ito;
-        if (ito >= nlocal) ito_m = nlocal;
-        for (int i = ifrom; i < ito_m; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma vector aligned
-          #pragma simd
-          #endif
-          for (int jj = 0; jj < jnum; jj++) {
-            const int j = jlist[jj];
-            if (need_ic && j < 0) {
-              which = 0;
-              jlist[jj] = -j - 1;
-            } else
-              ofind_special(which, special, nspecial, i, tag[j]);
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j >= nlocal) {
-              if (j == e_nall)
-                jlist[jj] = nall_offset;
-              else if (which)
-                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-              else jlist[jj]-=ghost_offset;
-            } else
-            #endif
-            if (which) jlist[jj] = j ^ (which << SBBITS);
-          }
-        } // for i
-      } // if molecular
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          int jj = 0;
-          #pragma vector aligned
-          #pragma simd
-          for (jj = 0; jj < jnum; jj++) {
-            if (jlist[jj] >= nlocal) {
-              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-              else jlist[jj] -= ghost_offset;
-            }
-          }
-        }
       }
       #endif
     } // end omp
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index d3d2745aee..355c8db199 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -62,19 +62,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
   if (aend-astart == 0) return;
 
   const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
   int nall_t = nall;
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_noghost && offload) nall_t = atom->nlocal;
-  if (THREE == 0 && offload) {
-    if (INTEL_MIC_NBOR_PAD > 1)
-      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  } else
   #endif
-    if (THREE == 0 && INTEL_NBOR_PAD > 1)
-      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  const int pad_width = pad;
+  
   const int pack_width = _fix->nbor_pack_width();
 
   const ATOM_T * _noalias const x = buffers->get_x();
@@ -150,6 +143,17 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
   int * _noalias const ncachetag = buffers->get_ncachetag();
   const int ncache_stride = buffers->ncache_stride();
 
+  int sb = 1;
+  if (special_flag[1] == 0) {
+    sb = 2;
+    if (special_flag[2] == 0) {
+      sb = 3;
+      if (special_flag[3] == 0)
+        sb = 4;
+    }
+  }
+  const int special_bound = sb;
+  
   #ifdef _LMP_INTEL_OFFLOAD
   const int * _noalias const binhead = this->binhead;
   const int * _noalias const bins = this->bins;
@@ -172,9 +176,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
     in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
     in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
-    in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
+    in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \
     in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
-    in(pack_width) \
+    in(pack_width,special_bound) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
     signal(tag)
@@ -226,19 +230,25 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       ifrom += astart;
       ito += astart;
       int e_ito = ito;
+      #ifdef LMP_INTEL_3BODY_FAST
       if (THREE && ito == num) {
         int imod = ito & (pack_width - 1);
         if (imod) e_ito += pack_width - imod;
       }
+      #endif
       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
 
-      int which;
-
-      int pack_offset = maxnbors;
-      if (THREE) pack_offset *= pack_width;
+      #ifdef LMP_INTEL_3BODY_FAST
+      const int pack_offset = maxnbors * pack_width;
+      const int obound = pack_offset + maxnbors * 2;
+      #else
+      const int pack_offset = 0;
+      const int obound = maxnbors * 3;
+      #endif
       int ct = (ifrom + tid * 2) * maxnbors;
       int *neighptr = firstneigh + ct;
-      const int obound = pack_offset + maxnbors * 2;
+      int *neighptr2;
+      if (THREE) neighptr2 = neighptr;
 
       const int toffs = tid * ncache_stride;
       flt_t * _noalias const tx = ncachex + toffs;
@@ -256,10 +266,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 
       // loop over all atoms in other bins in stencil, store every pair
       int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
+      #ifdef LMP_INTEL_3BODY_FAST
       if (THREE) {
         lane = 0;
         max_chunk = 0;
       }
+      #endif
       for (int i = ifrom; i < ito; i++) {
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
@@ -298,9 +310,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           if (FULL == 0 || TRI == 1) {
             icount = 0;
             istart = ncount;
-            const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-            int nedge = istart & (alignb - 1);
-            if (nedge) istart + (alignb - nedge);
+            IP_PRE_edge_align(istart, sizeof(int));
             itx = tx + istart;
             ity = ty + istart;
             itz = tz + istart;
@@ -383,11 +393,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 
         // ---------------------- Loop over other bins
 
-        int n2, *neighptr2;
+        int n2;
         if (THREE) {
+          #ifdef LMP_INTEL_3BODY_FAST
           n = pack_offset;
+          #endif
           n2 = pack_offset + maxnbors;
-          neighptr2 = neighptr;
         }
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
@@ -461,49 +472,133 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           }
         } // for u
 
+        if (molecular) {
+          if (!THREE) neighptr2 = neighptr;
+          int alln = n;
+
+          n = pack_offset;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #ifdef LMP_INTEL_NBOR_COMPAT
+          #pragma ivdep
+          #else
+          #pragma simd
+          #endif
+          #endif
+          for (int u = n; u < alln; u++) {
+            int which;
+            int addme = 1;
+            int j = neighptr[u];
+            if (need_ic && j < 0) {
+              which = 0;
+              j = -j - 1;
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            
+            if (which) {
+              j = j ^ (which << SBBITS);
+              if (which < special_bound) addme = 0;
+            }
+            #ifdef LMP_INTEL_NBOR_COMPAT
+            if (addme) neighptr2[n++] = j;
+            #else
+            neighptr2[n++]=j;
+            #endif
+          }
+
+          if (THREE) {
+            alln = n2;
+            n2 = pack_offset + maxnbors;
+            
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma vector aligned
+            #ifdef LMP_INTEL_NBOR_COMPAT
+            #pragma ivdep
+            #else
+            #pragma simd
+            #endif
+            #endif
+            for (int u = n2; u < alln; u++) {
+              int which;
+              int addme = 1;
+              int j = neighptr[u];
+              if (need_ic && j < 0) {
+                which = 0;
+                j = -j - 1;
+              } else
+                ofind_special(which, special, nspecial, i, tag[j]);
+              if (which) {
+                j = j ^ (which << SBBITS);
+                if (which < special_bound) addme = 0;
+              }
+              #ifdef LMP_INTEL_NBOR_COMPAT
+              if (addme) neighptr2[n2++] = j;
+              #else
+              neighptr2[n2++]=j;
+              #endif
+            }
+          }
+        }
+        
         #ifndef _LMP_INTEL_OFFLOAD
         if (exclude) {
+          neighptr2 = neighptr;
           int alln = n;
-          if (THREE) n = pack_offset;
-          else n = 0;
-          for (int u = pack_offset; u < alln; u++) {
-            const int j = neighptr[u];
-            int pj = j;
-            if (need_ic)
-              if (pj < 0) pj = -j - 1;
-            const int jtype = x[pj].w;
-            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-            neighptr[n++] = j;
+          n = pack_offset;
+
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma ivdep
+          #endif
+          for (int u = n; u < alln; u++) {
+            int addme = 1;
+            const int js = neighptr[u];
+            const int j = js & NEIGHMASK;
+            const int jtype = x[j].w;
+            if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
+            if (addme) neighptr2[n++] = js;
           }
           if (THREE) {
             alln = n2;
             n2 = pack_offset + maxnbors;
-            for (int u = pack_offset + maxnbors; u < alln; u++) {
-              const int j = neighptr[u];
-              int pj = j;
-              if (need_ic)
-                if (pj < 0) pj = -j - 1;
-              const int jtype = x[pj].w;
-              if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-              neighptr[n2++] = j;
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma vector aligned
+            #pragma ivdep
+            #endif
+            for (int u = n2; u < alln; u++) {
+              int addme = 1;
+              const int js = neighptr[u];
+              const int j = js & NEIGHMASK;
+              const int jtype = x[j].w;
+              if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0;
+              if (addme) neighptr2[n2++] = js;
             }
           }
         }
         #endif
+
         int ns;
         if (THREE) {
           int alln = n;
           ns = n - pack_offset;
           atombin[i] = ns;
+          ns += n2 - pack_offset - maxnbors;
+
+          #ifdef LMP_INTEL_3BODY_FAST
           n = lane;
           for (int u = pack_offset; u < alln; u++) {
-            neighptr[n] = neighptr[u];
+            neighptr[n] = neighptr2[u];
             n += pack_width;
           }
-          ns += n2 - pack_offset - maxnbors;
+          #endif
+          
           for (int u = pack_offset + maxnbors; u < n2; u++) {
-            neighptr[n] = neighptr[u];
+            #ifdef LMP_INTEL_3BODY_FAST
+            neighptr[n] = neighptr2[u];
             n += pack_width;
+            #else
+            neighptr[n++] = neighptr2[u];
+            #endif
           }
           if (ns > maxnbors) *overflow = 1;
         } else
@@ -512,34 +607,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
         ilist[i] = i;
         cnumneigh[i] = ct;
         if (THREE) {
+          #ifdef LMP_INTEL_3BODY_FAST
           cnumneigh[i] += lane;
+          #endif
           numneigh[i] = ns;
         } else {
-          int edge = n & (pad_width - 1);
-          if (edge) {
-            const int pad_end = n + (pad_width - edge);
-            #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
-            #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
-                    avg=INTEL_COMPILE_WIDTH/2
-            #endif
-            for ( ; n < pad_end; n++)
-              neighptr[n] = e_nall;
-          }
           numneigh[i] = n;
+          int pad_end = n;
+          IP_PRE_neighbor_pad(pad_end, offload);
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+                  avg=INTEL_COMPILE_WIDTH/2
+          #endif
+          for ( ; n < pad_end; n++)
+            neighptr[n] = e_nall;
         }
 
+        #ifdef LMP_INTEL_3BODY_FAST
         if (THREE) {
           if (ns > max_chunk) max_chunk = ns;
           lane++;
           if (lane == pack_width) {
             ct += max_chunk * pack_width;
-            const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-            const int edge = ct & (alignb - 1);
-            if (edge) ct += alignb - edge;
+            IP_PRE_edge_align(ct, sizeof(int));
             neighptr = firstneigh + ct;
+            neighptr2 = neighptr;
             max_chunk = 0;
-            pack_offset = maxnbors * pack_width;
             lane = 0;
             if (ct + obound > list_size) {
               if (i < ito - 1) {
@@ -548,12 +642,13 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
               }
             }
           }
-        } else {
+        } else
+        #endif
+        {
           ct += n;
-          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = ct & (alignb - 1);
-          if (edge) ct += alignb - edge;
+          //IP_PRE_edge_align(ct, sizeof(int));
           neighptr = firstneigh + ct;
+          if (THREE) neighptr2 = neighptr;
           if (ct + obound > list_size) {
             if (i < ito - 1) {
               *overflow = 1;
@@ -573,14 +668,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       if (separate_buffers) {
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
+          int jnum = numneigh[i];
+          if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
           #if __INTEL_COMPILER+0 > 1499
           #pragma vector aligned
           #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
           #endif
           for (int jj = 0; jj < jnum; jj++) {
-            int j = jlist[jj];
-            if (need_ic && j < 0) j = -j - 1;
+            const int j = jlist[jj] & NEIGHMASK;
             if (j < nlocal) {
               if (j < vlmin) vlmin = j;
               if (j > vlmax) vlmax = j;
@@ -615,72 +710,20 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
           nall_offset = nlocal + nghost;
         }
-      } // if separate_buffers
-      #endif
 
-      if (molecular) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-
-          if (THREE) {
-            const int trip = jnum * pack_width;
-            for (int jj = 0; jj < trip; jj+=pack_width) {
-              const int j = jlist[jj];
-              if (need_ic && j < 0) {
-                which = 0;
-                jlist[jj] = -j - 1;
-              } else
-                ofind_special(which, special, nspecial, i, tag[j]);
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (j >= nlocal) {
-                if (j == e_nall)
-                  jlist[jj] = nall_offset;
-                else if (which)
-                  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-                else jlist[jj]-=ghost_offset;
-              } else
-              #endif
-              if (which) jlist[jj] = j ^ (which << SBBITS);
-            }
-          } else {
-            #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
-            #pragma simd
-            #endif
-            for (int jj = 0; jj < jnum; jj++) {
-              const int j = jlist[jj];
-              if (need_ic && j < 0) {
-                which = 0;
-                jlist[jj] = -j - 1;
-              } else
-                ofind_special(which, special, nspecial, i, tag[j]);
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (j >= nlocal) {
-                if (j == e_nall)
-                  jlist[jj] = nall_offset;
-                else if (which)
-                  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-                else jlist[jj]-=ghost_offset;
-              } else
-              #endif
-              if (which) jlist[jj] = j ^ (which << SBBITS);
-            }
-          }
-        } // for i
-      } // if molecular
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
+          int jnum = numneigh[i];
+          if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
           int jj = 0;
           #pragma vector aligned
           #pragma simd
           for (jj = 0; jj < jnum; jj++) {
-            if (jlist[jj] >= nlocal) {
-              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-              else jlist[jj] -= ghost_offset;
+            const int which = jlist[jj] >> SBBITS & 3;
+            const int j = jlist[jj] & NEIGHMASK;
+            if (j >= nlocal) {
+              if (j == e_nall) jlist[jj] = nall_offset;
+              else jlist[jj] = (j - ghost_offset) ^ (which << SBBITS);
             }
           }
         }
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index 71aad95bc5..39fa9014db 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -142,6 +142,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -185,6 +186,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
@@ -221,15 +223,17 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index bba8a7b5e7..fe4d408a13 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -142,6 +142,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -209,6 +210,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
@@ -255,15 +257,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
         const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index f5dde26393..8ce3d121e0 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -133,6 +133,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -169,6 +170,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
@@ -205,14 +207,16 @@ void PairBuckIntel::eval(const int offload, const int vflag,
       FORCE_T * _noalias const f = f_start + foff;
       if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl,  sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
index cb8c06cacc..7c33ed21d3 100644
--- a/src/USER-INTEL/pair_dpd_intel.cpp
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -171,6 +171,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
   lmp_vt *v = (lmp_vt *)atom->v[0];
   const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -237,7 +238,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
         gamma = param[3].gamma;
         sigma = param[3].sigma;
       }
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias parami;
         if (!ONETYPE) {
@@ -246,8 +248,9 @@ void PairDPDIntel::eval(const int offload, const int vflag,
           parami = param + ptr_off;
         }
 
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp
index 94799cdba2..201277e68d 100644
--- a/src/USER-INTEL/pair_eam_intel.cpp
+++ b/src/USER-INTEL/pair_eam_intel.cpp
@@ -164,8 +164,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     memory->destroy(rho);
     memory->destroy(fp);
     nmax = atom->nmax;
-    int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN;
-    if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t);
+    IP_PRE_edge_align(nmax, sizeof(acc_t));
     if (NEWTON_PAIR)
       memory->create(rho,nmax*comm->nthreads,"pair:rho");
     else
@@ -192,6 +191,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -286,14 +286,16 @@ void PairEAMIntel::eval(const int offload, const int vflag,
         rhor_joff = rhor_ioff + _onetype * jstride;
         frho_ioff = fstride * _onetype;
       }
-      for (int i = iifrom; i < iito; ++i) {
+      for (int ii = iifrom; ii < iito; ++ii) {
+        const int i = ilist[ii];
         int itype, rhor_ioff;
         if (!ONETYPE) {
           itype = x[i].w;
           rhor_ioff = istride * itype;
         }
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
@@ -411,7 +413,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       #pragma vector aligned
       #pragma simd reduction(+:tevdwl)
       #endif
-      for (int i = iifrom; i < iito; ++i) {
+      for (int ii = iifrom; ii < iito; ++ii) {
+        const int i = ilist[ii];
         int itype;
         if (!ONETYPE) itype = x[i].w;
         flt_t p = rho[i]*frdrho + (flt_t)1.0;
@@ -457,7 +460,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       // compute forces on each atom
       // loop over neighbors of my atoms
 
-      for (int i = iifrom; i < iito; ++i) {
+      for (int ii = iifrom; ii < iito; ++ii) {
+        const int i = ilist[ii];
         int itype, rhor_ioff;
         const flt_t * _noalias scale_fi;
         if (!ONETYPE) {
@@ -465,8 +469,9 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           rhor_ioff = istride * itype;
           scale_fi = scale_f + itype*ntypes;
         }
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
@@ -761,14 +766,12 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
     if (ntypes > 0) {
       _cop = cop;
       _nr = nr + 1;
-      int edge = (_nr * sizeof(flt_t)) % INTEL_DATA_ALIGN;
-      if (edge) _nr += (INTEL_DATA_ALIGN - edge) / sizeof(flt_t);
+      IP_PRE_edge_align(_nr, sizeof(flt_t));
       memory->create(rhor_spline_f,ntypes*ntypes*_nr,"fc.rhor_spline_f");
       memory->create(rhor_spline_e,ntypes*ntypes*_nr,"fc.rhor_spline_e");
       memory->create(z2r_spline_t,ntypes*ntypes*_nr,"fc.z2r_spline_t");
       _nrho = nrho + 1;
-      edge = (_nrho * sizeof(flt_t)) % INTEL_DATA_ALIGN;
-      if (edge) _nrho += (INTEL_DATA_ALIGN - edge) / sizeof(flt_t);
+      IP_PRE_edge_align(_nrho, sizeof(flt_t));
       memory->create(frho_spline_f,ntypes*_nrho,"fc.frho_spline_f");
       memory->create(frho_spline_e,ntypes*_nrho,"fc.frho_spline_e");
       memory->create(scale_f,ntypes,ntypes,"fc.scale_f");
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index 1f05ad0efc..30941a7594 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -226,7 +226,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   }
   #endif
 
-  //  const int * _noalias const ilist = list->ilist;
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -287,6 +287,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(quat:length(nall+1) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
     in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
@@ -364,15 +365,16 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
 
       int ierror = 0;
-      for (int i = iifrom; i < iito; i += iip) {
-        // const int i = ilist[ii];
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
         const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
 
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
@@ -428,15 +430,13 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           } else
             multiple_forms = true;
         }
-        const int edge = packed_j & (pad_width - 1);
-        if (edge) {
-          const int packed_end = packed_j + (pad_width - edge);
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count min=1, max=15, avg=8
-          #endif
-          for ( ; packed_j < packed_end; packed_j++)
-            jlist_form[packed_j] = nall;
-        }
+        int packed_end = packed_j;
+        IP_PRE_neighbor_pad(packed_end, offload);
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count min=1, max=15, avg=8
+        #endif
+        for ( ; packed_j < packed_end; packed_j++)
+          jlist_form[packed_j] = nall;
 
         // -------------------------------------------------------------
 
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
index 7485395bef..8e6395c388 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -136,6 +136,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -189,6 +190,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
@@ -238,16 +240,17 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
-      for (int i = iifrom; i < iito; i += iip) {
-        //        const int i = ilist[ii];
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index 25cca59714..a01a4688a5 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -140,6 +140,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -210,6 +211,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
@@ -258,16 +260,17 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
-      for (int i = iifrom; i < iito; i += iip) {
-        //        const int i = ilist[ii];
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const flt_t * _noalias const cutsqi = cutsq + ptr_off;
         const LJ_T * _noalias const lji = lj + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index cb7381270d..ab0b5b3d55 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -139,6 +139,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   ATOM_T * _noalias const x = buffers->get_x(offload);
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -204,6 +205,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
@@ -250,15 +252,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
 
-        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int   * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index b16f6230cc..c973639709 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -148,6 +148,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const firstneigh = buffers->firstneigh(list);
@@ -207,7 +208,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         lj4 = lj34[3].lj4;
         offset = ljc12o[3].offset;
       }
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias ljc12oi;
         const FC_PACKED2_T * _noalias lj34i;
@@ -217,9 +219,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
           ljc12oi = ljc12o + ptr_off;
           lj34i = lj34 + ptr_off;
         }
-
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        int jnum = numneigh[ii];
+        IP_PRE_neighbor_pad(jnum, offload);
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 421de91ee9..d1d52270be 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -42,7 +42,6 @@
 #include "suffix.h"
 
 #ifdef LMP_USE_AVXCD
-#define OUTER_CHUNK 1
 #include "intel_simd.h"
 using namespace ip_simd;
 #endif
@@ -185,6 +184,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
@@ -230,6 +230,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
@@ -298,7 +299,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
         }
       }
 
-      for (int i = iifrom; i < iito; i += iip) {
+      for (int ii = iifrom; ii < iito; ii += iip) {
+        const int i = ilist[ii];
         int itype, itype_offset;
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
@@ -309,9 +311,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
           itype_offset = itype * ntypes;
         }
 
-        const int * _noalias const jlist = firstneigh + cnumneigh[i];
-        const int jnum = numneigh[i];
-        const int jnumhalf = numneighhalf[i];
+        const int * _noalias const jlist = firstneigh + cnumneigh[ii];
+        const int jnum = numneigh[ii];
+        const int jnumhalf = numneighhalf[ii];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl;
@@ -346,9 +348,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
           }
         }
 
-        int ejrem = ejnum & (pad_width - 1);
-        if (ejrem) ejrem = pad_width - ejrem;
-        const int ejnum_pad = ejnum + ejrem;
+        int ejnum_pad = ejnum;
+        IP_PRE_neighbor_pad(ejnum_pad, offload);
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+                avg=INTEL_COMPILE_WIDTH/2
+        #endif
         for (int jj = ejnum; jj < ejnum_pad; jj++) {
           tdelx[jj] = (flt_t)0.0;
           tdely[jj] = (flt_t)0.0;
@@ -594,6 +600,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
   IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
 
   ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneighhalf = buffers->get_atombin();
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
@@ -641,6 +648,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
@@ -714,24 +722,24 @@ void PairSWIntel::eval(const int offload, const int vflag,
         }
       }
 
-      SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
       const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
                                         144,160,176,192,208,224,240);
-      ilist = ilist + iifrom;
       acc_t * const dforce = &(f[0].x);
       for (int i = iifrom; i < iito; i += iip) {
-        SIMD_mask imask = ilist < iito;
+        SIMD_int ilistv = SIMD_load(ilist + i);
+        SIMD_int goffset = ilistv * 16;
+        SIMD_mask imask = ilistv < iito;
         SIMD_flt_t xtmp, ytmp, ztmp;
         SIMD_int itype, itype_offset;
 
         if (ONETYPE)
-          SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp);
+          SIMD_atom_gather(imask, &(x[0].x), goffset, xtmp, ytmp, ztmp);
         else {
-          SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype);
+          SIMD_atom_gather(imask, &(x[0].x), goffset, xtmp, ytmp, ztmp, itype);
           itype_offset = itype * ntypes;
         }
 
-        #ifdef OUTER_CHUNK
+        #ifdef LMP_INTEL_3BODY_FAST
         const int* ng = firstneigh + cnumneigh[i] - swidth;
         #else
         SIMD_int ng = SIMD_load(cnumneigh + i);
@@ -765,7 +773,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
         for (int jj = 0; jj < jnum_max; jj++) {
           SIMD_mask jmask = jj < jnum;
 
-          #ifdef OUTER_CHUNK
+          #ifdef LMP_INTEL_3BODY_FAST
           ng += swidth;
           SIMD_int j = SIMD_load(ng);
           #else
@@ -1025,15 +1033,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
           }
         } // for jj second loop
 
-        SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
+        SIMD_iforce_update(imask, &(f[0].x), goffset, fxtmp, fytmp, fztmp,
                            EFLAG, eatom, fwtmp);
         if (is_same<flt_t,acc_t>::value == 0) {
           imask = imask >> 8;
-          SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
+          SIMD_int goffset2 = _mm512_shuffle_i32x4(goffset, goffset, 238);
+          SIMD_iforce_update(imask, &(f[0].x), goffset2, fxtmp2, fytmp2,
                              fztmp2, EFLAG, eatom, fwtmp2);
         }
         if (EFLAG) oevdwl += SIMD_sum(sevdwl);
-        ilist = ilist + iip;
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index c772546928..584b371784 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -208,6 +208,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
       const int * _noalias const cnumneigh,
       const int * _noalias const firstneigh, int ntypes,
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
+      const int * _noalias const ilist,
       const c_inner_t * _noalias const c_inner,
       const c_outer_t * _noalias const c_outer,
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
@@ -271,6 +272,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
   tagint * _noalias tag = this->atom->tag;
   flt_t * _noalias const q = buffers->get_q(offload);
 
+  const int * _noalias const ilist = list->ilist;
   const int * _noalias const numneigh = list->numneigh;
   const int * _noalias const cnumneigh = buffers->cnumneigh(list);
   const int * _noalias const numneighhalf = buffers->get_atombin();
@@ -311,6 +313,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
     in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
     in(f_stride,nlocal,minlocal,separate_flag,offload) \
@@ -349,8 +352,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
       {
         acc_t sevdwl;
         sevdwl = 0.;
-        #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
-          firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl
+        #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, \
+                     cnumneigh, firstneigh, ntypes, x, ilist, c_inner, \
+                     c_outer, f, &sevdwl
         // Pick the variable i algorithm under specific conditions
         // do use scalar algorithm with very short vectors
         int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
@@ -1135,6 +1139,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     const int * _noalias const cnumneigh,
     const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
+    const int * _noalias const ilist,
     const c_inner_t * _noalias const c_inner,
     const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
@@ -1155,12 +1160,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   for (ii = iifrom; ii < iito; ii++) {
     // Right now this loop is scalar, to allow for the compiler to do
     //  its prefetching magic.
-    int i = ii;
+    int i = ilist[ii];
     int w_i = x[i].w;
     flt_t x_i = x[i].x;
     flt_t y_i = x[i].y;
     flt_t z_i = x[i].z;
-    int jlist_off_i = cnumneigh[i];
+    int jlist_off_i = cnumneigh[ii];
     int jnum = numneigh[ii];
     for (jj = 0; jj < jnum; jj++) {
       int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
@@ -1173,7 +1178,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
       if (rsq < cutsq) {
         is[compress_idx] = ii;
         js[compress_idx] = j;
-        if (jj < numneighhalf[i])
+        if (jj < numneighhalf[ii])
           repulsive_flag[compress_idx] = 1;
         compress_idx += 1;
       }
diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index 962202e228..5dd0da1939 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -157,7 +157,7 @@ void VerletLRTIntel::setup(int flag)
   pthread_create(&_kspace_thread, &_kspace_attr,
                  &VerletLRTIntel::k_launch_loop, this);
   #elif defined(_LMP_INTEL_LRT_11)
-  std::thread kspace_thread;
+  std::thread _kspace_thread;
   if (kspace_compute_flag)
     _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag,
                                                                  vflag); });
diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h
index 7380cc6376..6c44ba7e57 100644
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@@ -24,14 +24,20 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
 #include "pppm_intel.h"
 
 #ifdef LMP_INTEL_USELRT
-#ifdef LMP_INTEL_LRT11
-#define _LMP_INTEL_LRT_11
-#include <thread>
-#else
-#define _LMP_INTEL_LRT_PTHREAD
-#include <pthread.h>
-#endif
-
+  #if defined(LMP_INTEL_LRT11) || defined(__APPLE__)
+    #if __cplusplus > 199711L
+      #define _LMP_INTEL_LRT_11
+      #include <thread>
+    #else
+      #undef LMP_INTEL_USELRT
+      #ifdef LMP_INTEL_LRT11
+        #error C++11 support required for LMP_INTEL_LRT11 define
+      #endif
+    #endif
+  #else
+    #define _LMP_INTEL_LRT_PTHREAD
+    #include <pthread.h>
+  #endif
 #endif
 
 namespace LAMMPS_NS {
-- 
GitLab