diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index f5bd66aebaf7ec6c640f1fbe08fb5e844a4ca8ea..155e29e367241c02943c06eb674ddb7fffbb1cd2 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -106,6 +106,8 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
+For simulations on higher node counts, add "processors * * * grid 
+numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
 :ule
@@ -392,6 +394,10 @@ hybrid intel omp"_suffix.html command can also be used within the
 input script to automatically append the "omp" suffix to styles when
 USER-INTEL styles are not available.
 
+NOTE: For simulations on higher node counts, add "processors * * * 
+grid numa"_processors.html" to the beginning of the input script for
+better scalability.
+
 When running on many nodes, performance might be better when using
 fewer OpenMP threads and more MPI tasks. This will depend on the
 simulation and the machine. Using the "verlet/split"_run_style.html
diff --git a/doc/src/read_data.txt b/doc/src/read_data.txt
index bd602eee5a9cdc39aaf9e6f51b83bc5e7be2cef8..6785eb10660861626bf0e65e82eb09ade46a9a0b 100644
--- a/doc/src/read_data.txt
+++ b/doc/src/read_data.txt
@@ -14,7 +14,7 @@ read_data file keyword args ... :pre
 
 file = name of data file to read in :ulb,l
 zero or more keyword/arg pairs may be appended :l
-keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {group} or {nocoeff} or {fix} :l
+keyword = {add} or {offset} or {shift} or {extra/atom/types} or {extra/bond/types} or {extra/angle/types} or {extra/dihedral/types} or {extra/improper/types} or {extra/bond/per/atom} or {extra/angle/per/atom} or {extra/dihedral/per/atom} or {extra/improper/per/atom} or {group} or {nocoeff} or {fix} :l
   {add} arg = {append} or {Nstart} or {merge}
     append = add new atoms with IDs appended to current IDs
     Nstart = add new atoms with IDs starting with Nstart
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index b06f76c90ddf993a74011180e07bd61f4dcc634a..637fc0d06e888c409733ee722437ef33ab286ac0 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -748,7 +748,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
       if (eatom) {
         double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
         #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[ii].x;
@@ -762,7 +763,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
         }
       } else {
         #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
         #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[ii].x;
@@ -778,7 +780,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
       if (eatom) {
         double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
         #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[i].x;
@@ -788,7 +791,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
         }
       } else {
         #if defined(LMP_SIMD_COMPILER)
-        #pragma novector
+        #pragma vector aligned
+	#pragma ivdep
         #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[i].x;
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 135309fe44557b8d63b420602405212421adae7a..7a7640a20366fc6410a61ae71e3e01ec7d92b7d2 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -172,6 +172,10 @@ class IntelBuffers {
 
   inline void thr_pack(const int ifrom, const int ito, const int ago) {
     if (ago == 0) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = lmp->atom->x[i][0];
         _x[i].y = lmp->atom->x[i][1];
@@ -179,9 +183,17 @@ class IntelBuffers {
         _x[i].w = lmp->atom->type[i];
       }
       if (lmp->atom->q != NULL)
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
         for (int i = ifrom; i < ito; i++)
           _q[i] = lmp->atom->q[i];
     } else {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = lmp->atom->x[i][0];
         _x[i].y = lmp->atom->x[i][1];
@@ -204,7 +216,10 @@ class IntelBuffers {
                            const int offset, const bool dotype = false) {
     double ** x = lmp->atom->x + offset;
     if (dotype == false) {
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = x[i][0];
         _x[i].y = x[i][1];
@@ -212,7 +227,10 @@ class IntelBuffers {
       }
     } else {
       int *type = lmp->atom->type + offset;
-      #pragma vector nontemporal
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma vector aligned
+      #pragma ivdep
+      #endif
       for (int i = ifrom; i < ito; i++) {
         _x[i].x = x[i][0];
         _x[i].y = x[i][1];
@@ -225,6 +243,10 @@ class IntelBuffers {
   inline void thr_pack_host(const int ifrom, const int ito,
                             const int offset) {
     double ** x = lmp->atom->x + offset;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma ivdep
+    #endif
     for (int i = ifrom; i < ito; i++) {
       _host_x[i].x = x[i][0];
       _host_x[i].y = x[i][1];
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 1f6225dc440188a80371a32b59ca0f0e90a40c18..e256dd1abc93f5d59d657697c668b597367d5631 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -68,7 +68,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
-#define INTEL_P3M_MAXORDER 7
+#define INTEL_P3M_MAXORDER 8
 #define INTEL_P3M_ALIGNED_MAXORDER 8
 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
 #define INTEL_P3M_TABLE 1
@@ -248,6 +248,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 
 #else
 
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
+  {                                                             \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+  }
+
 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
   {                                                             \
     tid = 0;                                                    \
@@ -293,6 +299,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     ito = inum;                                                 \
   }
 
+#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum,      \
+                                nthreads, vecsize)              \
+  {                                                             \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+    ip = vecsize;                                               \
+  }
+
 #endif
 
 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp
index ec5f5150c2546c1fbdc1bfb74096c27d9e0321e5..1269579ff4f26addaa08d00c19ff433626d5a089 100644
--- a/src/USER-INTEL/pppm_disp_intel.cpp
+++ b/src/USER-INTEL/pppm_disp_intel.cpp
@@ -885,21 +885,22 @@ void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR z0 = fdelvolinv * q[i];
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n*nix*niy + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int mzy = m*nix + mz;
           FFT_SCALAR x0 = y0*rho[1][m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mzyx = l + mzy;
             my_density[mzyx] += x0*rho[0][l];
           }
@@ -1034,21 +1035,22 @@ void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR z0 = fdelvolinv * B[type];
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n*nix*niy + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int mzy = m*nix + mz;
           FFT_SCALAR x0 = y0*rho[1][m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mzyx = l + mzy;
             my_density[mzyx] += x0*rho[0][l];
           }
@@ -1181,21 +1183,22 @@ void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR z0 = fdelvolinv;
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int my = m + nysum;
           FFT_SCALAR x0 = y0*rho[1][m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l + nxsum;
             FFT_SCALAR w = x0*rho[0][l];
             density_brick_a0[mz][my][mx] += w*B[7*type];
@@ -1314,21 +1317,22 @@ void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR z0 = fdelvolinv;
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n*nix*niy + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int mzy = m*nix + mz;
           FFT_SCALAR x0 = y0*rho[1][m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mzyx = l + mzy;
             FFT_SCALAR w0 = x0*rho[0][l];
             for(int k = 0; k < nsplit; k++)
@@ -1462,21 +1466,22 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
       _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n+nzsum;
         FFT_SCALAR z0 = rho2[n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int my = m+nysum;
           FFT_SCALAR y0 = z0*rho1[m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l+nxsum;
             FFT_SCALAR x0 = y0*rho0[l];
               ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
@@ -1490,12 +1495,11 @@ void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR ekx, eky, ekz;
       ekx = eky = ekz = ZEROF;
 
-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx += ekx_arr[l];
-          eky += eky_arr[l];
-          ekz += ekz_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+        ekx += ekx_arr[l];
+	eky += eky_arr[l];
+	ekz += ekz_arr[l];
+      }
 
       // convert E-field to force
 
@@ -1643,12 +1647,12 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
 
       particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n + nzsum;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int my = m + nysum;
@@ -1656,9 +1660,10 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
           FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
           FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l + nxsum;
             ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
             eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
@@ -1668,9 +1673,9 @@ void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
         particle_ekx[i] += ekx[l];
         particle_eky[i] += eky[l];
         particle_ekz[i] += ekz[l];
@@ -1809,21 +1814,22 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
       _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n+nzsum;
         FFT_SCALAR z0 = rho2[n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int my = m+nysum;
           FFT_SCALAR y0 = z0*rho1[m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l+nxsum;
             FFT_SCALAR x0 = y0*rho0[l];
               ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
@@ -1837,12 +1843,11 @@ void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR ekx, eky, ekz;
       ekx = eky = ekz = ZEROF;
 
-
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx += ekx_arr[l];
-          eky += eky_arr[l];
-          ekz += ekz_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+        ekx += ekx_arr[l];
+	eky += eky_arr[l];
+	ekz += ekz_arr[l];
+      }
 
       // convert E-field to force
 
@@ -1985,12 +1990,12 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
 
       particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n + nzsum;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int my = m + nysum;
@@ -1998,9 +2003,10 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
           FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
           FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l + nxsum;
             ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
             eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
@@ -2010,9 +2016,9 @@ void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
         particle_ekx[i] += ekx[l];
         particle_eky[i] += eky[l];
         particle_ekz[i] += ekz[l];
@@ -2168,21 +2174,22 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
 
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n+nzsum;
         FFT_SCALAR z0 = rho2[n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int my = m+nysum;
           FFT_SCALAR y0 = z0*rho1[m];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l+nxsum;
             FFT_SCALAR x0 = y0*rho0[l];
               ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
@@ -2221,29 +2228,29 @@ void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
       ekx5 = eky5 = ekz5 = ZEROF;
       ekx6 = eky6 = ekz6 = ZEROF;
 
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          ekx0 += ekx0_arr[l];
-          eky0 += eky0_arr[l];
-          ekz0 += ekz0_arr[l];
-          ekx1 += ekx1_arr[l];
-          eky1 += eky1_arr[l];
-          ekz1 += ekz1_arr[l];
-          ekx2 += ekx2_arr[l];
-          eky2 += eky2_arr[l];
-          ekz2 += ekz2_arr[l];
-          ekx3 += ekx3_arr[l];
-          eky3 += eky3_arr[l];
-          ekz3 += ekz3_arr[l];
-          ekx4 += ekx4_arr[l];
-          eky4 += eky4_arr[l];
-          ekz4 += ekz4_arr[l];
-          ekx5 += ekx5_arr[l];
-          eky5 += eky5_arr[l];
-          ekz5 += ekz5_arr[l];
-          ekx6 += ekx6_arr[l];
-          eky6 += eky6_arr[l];
-          ekz6 += ekz6_arr[l];
-        }
+      for (int l = 0; l < order; l++) {
+	ekx0 += ekx0_arr[l];
+	eky0 += eky0_arr[l];
+	ekz0 += ekz0_arr[l];
+	ekx1 += ekx1_arr[l];
+	eky1 += eky1_arr[l];
+	ekz1 += ekz1_arr[l];
+	ekx2 += ekx2_arr[l];
+	eky2 += eky2_arr[l];
+	ekz2 += ekz2_arr[l];
+	ekx3 += ekx3_arr[l];
+	eky3 += eky3_arr[l];
+	ekz3 += ekz3_arr[l];
+	ekx4 += ekx4_arr[l];
+	eky4 += eky4_arr[l];
+	ekz4 += ekz4_arr[l];
+	ekx5 += ekx5_arr[l];
+	eky5 += eky5_arr[l];
+	ekz5 += ekz5_arr[l];
+	ekx6 += ekx6_arr[l];
+	eky6 += eky6_arr[l];
+	ekz6 += ekz6_arr[l];
+      }
 
       // convert D-field to force
 
@@ -2439,12 +2446,12 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
       particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order_6; n++) {
         int mz = n + nzsum;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order_6; m++) {
           int my = m + nysum;
@@ -2452,9 +2459,10 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
           FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
           FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
           #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #pragma simd
           #endif
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          for (int l = 0; l < order; l++) {
             int mx = l + nxsum;
             FFT_SCALAR x0 = drho[0][l] * ekx_p;
             FFT_SCALAR y0 = rho[0][l] * eky_p;
@@ -2486,9 +2494,9 @@ void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
         particle_ekx0[i] += ekx0[l];
         particle_eky0[i] += eky0[l];
         particle_ekz0[i] += ekz0[l];
@@ -2681,21 +2689,22 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
 
       for (int k = 0; k < nsplit; k++) {
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int n = 0; n < order_6; n++) {
           int mz = n+nzsum;
           FFT_SCALAR z0 = rho2[n];
           #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #endif
           for (int m = 0; m < order_6; m++) {
             int my = m+nysum;
             FFT_SCALAR y0 = z0*rho1[m];
             #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
             #pragma simd
             #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
               int mx = l+nxsum;
               FFT_SCALAR x0 = y0*rho0[l];
               ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
@@ -2716,13 +2725,13 @@ void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
         ekx[k] = eky[k] = ekz[k] = ZEROF;
       }
 
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-          for (int k = 0; k < nsplit; k++) {
-            ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-            eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-            ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-          }
-        }
+      for (int l = 0; l < order; l++) {
+	for (int k = 0; k < nsplit; k++) {
+	  ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	  eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	  ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+	}
+      }
 
       // convert E-field to force
 
@@ -2867,12 +2876,12 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
       for (int k = 0; k < nsplit; k++) {
         particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int n = 0; n < order_6; n++) {
           int mz = n + nzsum;
           #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
+          #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
           #endif
           for (int m = 0; m < order_6; m++) {
             int my = m + nysum;
@@ -2880,9 +2889,10 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
             FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
             FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
             #if defined(LMP_SIMD_COMPILER)
+            #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
             #pragma simd
             #endif
-            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            for (int l = 0; l < order; l++) {
               int mx = l + nxsum;
               ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
                 u_brick_none[k][mz][my][mx];
@@ -2903,9 +2913,9 @@ void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
         for (int k = 0; k < nsplit; k++) {
           ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
           eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp
index 8416b6f3a35cb74159af8131e92c5b0b04c09355..f1cfe591f2a1365ffc8e2f3b68c6cad1dc5cf0bf 100644
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@@ -149,13 +149,13 @@ void PPPMIntel::init()
     memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
     memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
     memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
-                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-                            "pppmintel:vdxy_brick");
+    create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
+		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+		    "pppmintel:vdxy_brick");
     memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
-                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-                            "pppmintel:vdz0_brick");
+    create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
+		    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+		    "pppmintel:vdz0_brick");
     memory->destroy(work3);
     memory->create(work3, 2*nfft_both, "pppmintel:work3");
 
@@ -555,13 +555,13 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR z0 = fdelvolinv * q[i];
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n*nix*niy + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int mzy = m*nix + mz;
@@ -708,13 +708,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
       _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n+nzsum;
         FFT_SCALAR z0 = rho2[n];
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int my = m+nysum;
@@ -742,13 +742,13 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
       ekx = eky = ekz = ZEROF;
 
       if (use_packing) {
-        for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
+        for (int l = 0; l < 2*order; l += 2) {
           ekx += ekxy_arr[l];
           eky += ekxy_arr[l+1];
           ekz += ekz0_arr[l];
         }
       } else {
-        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int l = 0; l < order; l++) {
           ekx += ekx_arr[l];
           eky += eky_arr[l];
           ekz += ekz_arr[l];
@@ -896,12 +896,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
       for (int n = 0; n < order; n++) {
         int mz = n + nzsum;
         #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
+        #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
         #endif
         for (int m = 0; m < order; m++) {
           int my = m + nysum;
@@ -921,9 +921,9 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       }
 
       #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
+      #pragma loop_count min(2), max(INTEL_P3M_ALIGNED_MAXORDER), avg(7)
       #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      for (int l = 0; l < order; l++){
         particle_ekx[i] += ekx[l];
         particle_eky[i] += eky[l];
         particle_ekz[i] += ekz[l];
@@ -1240,6 +1240,73 @@ void PPPMIntel::pack_buffers()
   fix->stop_watch(TIME_PACK);
 }
 
+/* ----------------------------------------------------------------------
+   Allocate density_brick with extra padding for vector writes
+------------------------------------------------------------------------- */
+
+void PPPMIntel::allocate()
+{
+  PPPM::allocate();
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+		  nxlo_out,nxhi_out,"pppm:density_brick");
+
+  if (differentiation_flag == 1) {
+    memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
+    create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+	            nxlo_out,nxhi_out,"pppm:u_brick");
+  } else {
+    memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
+    create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+	            nxlo_out,nxhi_out,"pppm:vdx_brick");
+    create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+	            nxlo_out,nxhi_out,"pppm:vdy_brick");
+    create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+	            nxlo_out,nxhi_out,"pppm:vdz_brick");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Create 3D-offset allocation with extra padding for vector writes
+------------------------------------------------------------------------- */
+
+FFT_SCALAR *** PPPMIntel::create3d_offset(FFT_SCALAR ***&array, int n1lo, 
+	                                  int n1hi, int n2lo, int n2hi, 
+	                                  int n3lo, int n3hi,
+	                                  const char *name)
+{
+  int n1 = n1hi - n1lo + 1;
+  int n2 = n2hi - n2lo + 1;
+  int n3 = n3hi - n3lo + 1;
+
+  bigint nbytes = ((bigint) sizeof(FFT_SCALAR)) * n1*n2*n3 + 
+    INTEL_P3M_ALIGNED_MAXORDER*2;
+  FFT_SCALAR *data = (FFT_SCALAR *) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR *)) * n1*n2;
+  FFT_SCALAR **plane = (FFT_SCALAR **) memory->smalloc(nbytes,name);
+  nbytes = ((bigint) sizeof(FFT_SCALAR **)) * n1;
+  array = (FFT_SCALAR ***) memory->smalloc(nbytes,name);
+
+  bigint m;
+  bigint n = 0;
+  for (int i = 0; i < n1; i++) {
+    m = ((bigint) i) * n2;
+    array[i] = &plane[m];
+    for (int j = 0; j < n2; j++) {
+      plane[m+j] = &data[n];
+      n += n3;
+    }
+  }
+
+  m = ((bigint) n1) * n2;
+  for (bigint i = 0; i < m; i++) array[0][i] -= n3lo;
+  for (int i = 0; i < n1; i++) array[i] -= n2lo;
+  array -= n1lo;
+  return array;
+}
+
 /* ----------------------------------------------------------------------
    Returns 0 if Intel optimizations for PPPM ignored due to offload
 ------------------------------------------------------------------------- */
diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h
index e152486b29b38e2a9cfd279089dd54d06b8465e3..5bffabe0e5f2e3ed07e22aa436ffa706d8dae9da 100644
--- a/src/USER-INTEL/pppm_intel.h
+++ b/src/USER-INTEL/pppm_intel.h
@@ -74,9 +74,10 @@ class PPPMIntel : public PPPM {
   int _use_base;
   #endif
 
-    template<class flt_t, class acc_t>
-    void test_function(IntelBuffers<flt_t,acc_t> *buffers);
+  virtual void allocate();
 
+  template<class flt_t, class acc_t>
+  void test_function(IntelBuffers<flt_t,acc_t> *buffers);
 
   void precompute_rho();
   template<class flt_t, class acc_t>
@@ -120,6 +121,8 @@ class PPPMIntel : public PPPM {
       fieldforce_ad<flt_t,acc_t,0>(buffers);
     }
   }
+  FFT_SCALAR ***create3d_offset(FFT_SCALAR ***&, int, int, int,
+				int, int, int, const char *name);
 };
 
 }