From 3c329d170791699f01f500213c48b57cb7cff38d Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 19 Jun 2017 13:23:01 -0400
Subject: [PATCH] massive whitespace cleanup in USER-INTEL

Changes:
- removed DOS/Windows text format carriage return characters (^M)
- replaced tabs with spaces (tabs are evil!!)
- removed trailing whitespace
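
The tool used for this sweep is not recorded here; as a minimal sketch, an
equivalent transformation could be scripted along the lines below. The
src/USER-INTEL location, the .cpp/.h suffix filter, and the 8-column tab
stop are assumptions inferred from the hunks that follow (e.g. a single
leading tab becoming eight spaces), not part of the original commit.

  import os

  TABSTOP = 8  # assumed; matches the 8-space indentation in the hunks below

  def clean(text):
      out = []
      for line in text.split('\n'):
          line = line.replace('\r', '')    # drop DOS/Windows ^M characters
          line = line.expandtabs(TABSTOP)  # tabs -> spaces at 8-column stops
          out.append(line.rstrip())        # drop trailing whitespace
      return '\n'.join(out)

  for root, dirs, files in os.walk('src/USER-INTEL'):  # assumed tree location
      for name in files:
          if not name.endswith(('.cpp', '.h')):
              continue
          path = os.path.join(root, name)
          with open(path, 'r', newline='') as fobj:  # newline='' keeps \r visible
              old = fobj.read()
          new = clean(old)
          if new != old:
              with open(path, 'w', newline='') as fobj:
                  fobj.write(new)

Rewriting only files that actually change keeps timestamps (and rebuilds)
for untouched files intact.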
---
 src/USER-INTEL/angle_charmm_intel.cpp         |   64 +-
 src/USER-INTEL/angle_charmm_intel.h           |    4 +-
 src/USER-INTEL/angle_harmonic_intel.cpp       |   64 +-
 src/USER-INTEL/angle_harmonic_intel.h         |    4 +-
 src/USER-INTEL/bond_fene_intel.cpp            |   64 +-
 src/USER-INTEL/bond_fene_intel.h              |    4 +-
 src/USER-INTEL/bond_harmonic_intel.cpp        |   54 +-
 src/USER-INTEL/bond_harmonic_intel.h          |    4 +-
 src/USER-INTEL/dihedral_charmm_intel.cpp      |  382 +-
 src/USER-INTEL/dihedral_charmm_intel.h        |    6 +-
 src/USER-INTEL/dihedral_harmonic_intel.cpp    |  134 +-
 src/USER-INTEL/dihedral_harmonic_intel.h      |    6 +-
 src/USER-INTEL/dihedral_opls_intel.cpp        |  138 +-
 src/USER-INTEL/dihedral_opls_intel.h          |    4 +-
 src/USER-INTEL/fix_intel.cpp                  |  194 +-
 src/USER-INTEL/fix_intel.h                    |   46 +-
 src/USER-INTEL/fix_nh_intel.cpp               |   74 +-
 src/USER-INTEL/fix_nh_intel.h                 |    2 +-
 src/USER-INTEL/fix_nve_asphere_intel.cpp      |   40 +-
 src/USER-INTEL/fix_nve_intel.cpp              |   46 +-
 src/USER-INTEL/improper_cvff_intel.cpp        |  116 +-
 src/USER-INTEL/improper_cvff_intel.h          |    4 +-
 src/USER-INTEL/improper_harmonic_intel.cpp    |   74 +-
 src/USER-INTEL/improper_harmonic_intel.h      |    4 +-
 src/USER-INTEL/intel_buffers.cpp              |   48 +-
 src/USER-INTEL/intel_buffers.h                |   46 +-
 src/USER-INTEL/intel_intrinsics.h             |  166 +-
 src/USER-INTEL/intel_preprocess.h             | 1194 ++--
 src/USER-INTEL/intel_simd.h                   |  994 +--
 src/USER-INTEL/math_extra_intel.h             |  698 +-
 src/USER-INTEL/nbin_intel.cpp                 |   20 +-
 src/USER-INTEL/npair_full_bin_intel.cpp       |   36 +-
 src/USER-INTEL/npair_full_bin_intel.h         |    2 +-
 .../npair_half_bin_newton_intel.cpp           |   12 +-
 .../npair_half_bin_newton_tri_intel.cpp       |   12 +-
 src/USER-INTEL/npair_intel.cpp                |  684 +-
 src/USER-INTEL/npair_intel.h                  |    4 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.cpp   |  138 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.h     |    6 +-
 src/USER-INTEL/pair_buck_coul_long_intel.cpp  |  230 +-
 src/USER-INTEL/pair_buck_coul_long_intel.h    |    6 +-
 src/USER-INTEL/pair_buck_intel.cpp            |  136 +-
 src/USER-INTEL/pair_buck_intel.h              |    8 +-
 src/USER-INTEL/pair_eam_intel.cpp             |  510 +-
 src/USER-INTEL/pair_eam_intel.h               |    8 +-
 src/USER-INTEL/pair_gayberne_intel.cpp        |  470 +-
 .../pair_lj_charmm_coul_long_intel.cpp        |  280 +-
 .../pair_lj_charmm_coul_long_intel.h          |    6 +-
 .../pair_lj_cut_coul_long_intel.cpp           |  246 +-
 src/USER-INTEL/pair_lj_cut_coul_long_intel.h  |    6 +-
 src/USER-INTEL/pair_lj_cut_intel.cpp          |  198 +-
 .../pair_lj_long_coul_long_intel.cpp          |  100 +-
 src/USER-INTEL/pair_lj_long_coul_long_intel.h |   78 +-
 src/USER-INTEL/pair_sw_intel.cpp              | 1078 +--
 src/USER-INTEL/pair_sw_intel.h                |    2 +-
 src/USER-INTEL/pair_tersoff_intel.cpp         |  208 +-
 src/USER-INTEL/pair_tersoff_intel.h           |    6 +-
 src/USER-INTEL/pppm_disp_intel.cpp            | 6068 ++++++++---------
 src/USER-INTEL/pppm_disp_intel.h              |  476 +-
 src/USER-INTEL/pppm_intel.cpp                 |  126 +-
 src/USER-INTEL/pppm_intel.h                   |    4 +-
 src/USER-INTEL/verlet_lrt_intel.cpp           |   36 +-
 62 files changed, 7939 insertions(+), 7939 deletions(-)

diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp
index 0c493646e3..d55afd4742 100644
--- a/src/USER-INTEL/angle_charmm_intel.cpp
+++ b/src/USER-INTEL/angle_charmm_intel.cpp
@@ -37,7 +37,7 @@ typedef struct { int a,b,c,t;  } int4_t;
 
 /* ---------------------------------------------------------------------- */
 
-AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) 
+AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
@@ -74,8 +74,8 @@ void AngleCharmmIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void AngleCharmmIntel::compute(int eflag, int vflag,
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -83,14 +83,14 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -103,9 +103,9 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void AngleCharmmIntel::eval(const int vflag, 
-			    IntelBuffers<flt_t,acc_t> *buffers,
-			    const ForceConst<flt_t> &fc)
+void AngleCharmmIntel::eval(const int vflag,
+                            IntelBuffers<flt_t,acc_t> *buffers,
+                            const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->nanglelist;
@@ -133,7 +133,7 @@ void AngleCharmmIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)	\
+    shared(f_start,f_stride,fc) \
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -148,7 +148,7 @@ void AngleCharmmIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int4_t * _noalias const anglelist = 
+    const int4_t * _noalias const anglelist =
       (int4_t *) neighbor->anglelist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -246,35 +246,35 @@ void AngleCharmmIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= f1x + f3x;
-	  f[i2].y -= f1y + f3y;
-	  f[i2].z -= f1z + f3z;
+          f[i2].y -= f1y + f3y;
+          f[i2].z -= f1z + f3z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
-        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, 
-                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, 
-                              dely1, delz1, delx2, dely2, delz2, seangle, 
-                              f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, 
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
+                              dely1, delz1, delx2, dely2, delz2, seangle,
+                              f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
                               sv4, sv5);
-	#else
-        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, 
-                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, 
-                              dely1, delz1, delx2, dely2, delz2, oeangle, 
-                              f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, 
+        #else
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2,
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1,
+                              dely1, delz1, delx2, dely2, delz2, oeangle,
+                              f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
                               ov4, ov5);
         #endif
       }
@@ -282,8 +282,8 @@ void AngleCharmmIntel::eval(const int vflag,
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oeangle += seangle;
     if (VFLAG && vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
-	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
@@ -291,7 +291,7 @@ void AngleCharmmIntel::eval(const int vflag,
   if (EFLAG) energy += oeangle;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -348,11 +348,11 @@ void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
-	                                             Memory *memory) {
+                                                     Memory *memory) {
   if (nangletypes != _nangletypes) {
     if (_nangletypes > 0)
       _memory->destroy(fc);
-    
+
     if (nangletypes > 0)
       _memory->create(fc,nangletypes,"anglecharmmintel.fc");
   }
diff --git a/src/USER-INTEL/angle_charmm_intel.h b/src/USER-INTEL/angle_charmm_intel.h
index a98007b3ef..342af31b8c 100644
--- a/src/USER-INTEL/angle_charmm_intel.h
+++ b/src/USER-INTEL/angle_charmm_intel.h
@@ -45,8 +45,8 @@ class AngleCharmmIntel : public AngleCharmm {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp
index 198431d552..47e0add690 100644
--- a/src/USER-INTEL/angle_harmonic_intel.cpp
+++ b/src/USER-INTEL/angle_harmonic_intel.cpp
@@ -37,7 +37,7 @@ typedef struct { int a,b,c,t;  } int4_t;
 
 /* ---------------------------------------------------------------------- */
 
-AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) 
+AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
@@ -74,8 +74,8 @@ void AngleHarmonicIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void AngleHarmonicIntel::compute(int eflag, int vflag,
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -83,14 +83,14 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -103,9 +103,9 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void AngleHarmonicIntel::eval(const int vflag, 
-			    IntelBuffers<flt_t,acc_t> *buffers,
-			    const ForceConst<flt_t> &fc)
+void AngleHarmonicIntel::eval(const int vflag,
+                            IntelBuffers<flt_t,acc_t> *buffers,
+                            const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->nanglelist;
@@ -133,7 +133,7 @@ void AngleHarmonicIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)	\
+    shared(f_start,f_stride,fc) \
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -148,7 +148,7 @@ void AngleHarmonicIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int4_t * _noalias const anglelist = 
+    const int4_t * _noalias const anglelist =
       (int4_t *) neighbor->anglelist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -228,35 +228,35 @@ void AngleHarmonicIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
-	if (NEWTON_BOND || i2 < nlocal) {
+        if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= f1x + f3x;
-	  f[i2].y -= f1y + f3y;
-	  f[i2].z -= f1z + f3z;
+          f[i2].y -= f1y + f3y;
+          f[i2].z -= f1z + f3z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
-	IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
-                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, 
-                              delz1, delx2, dely2, delz2, seangle, f, 
-                              NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, 
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
+                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
+                              delz1, delx2, dely2, delz2, seangle, f,
+                              NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4,
                               sv5);
         #else
-	IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
-                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, 
-                              delz1, delx2, dely2, delz2, oeangle, f, 
-                              NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, 
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
+                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1,
+                              delz1, delx2, dely2, delz2, oeangle, f,
+                              NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4,
                               ov5);
         #endif
       }
@@ -264,8 +264,8 @@ void AngleHarmonicIntel::eval(const int vflag,
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oeangle += seangle;
     if (VFLAG && vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
-	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
@@ -273,7 +273,7 @@ void AngleHarmonicIntel::eval(const int vflag,
   if (EFLAG) energy += oeangle;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -328,11 +328,11 @@ void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
-	                                             Memory *memory) {
+                                                     Memory *memory) {
   if (nangletypes != _nangletypes) {
     if (_nangletypes > 0)
       _memory->destroy(fc);
-    
+
     if (nangletypes > 0)
       _memory->create(fc,nangletypes,"anglecharmmintel.fc");
   }
diff --git a/src/USER-INTEL/angle_harmonic_intel.h b/src/USER-INTEL/angle_harmonic_intel.h
index 340ea4b974..301fc7cc06 100644
--- a/src/USER-INTEL/angle_harmonic_intel.h
+++ b/src/USER-INTEL/angle_harmonic_intel.h
@@ -45,8 +45,8 @@ class AngleHarmonicIntel : public AngleHarmonic {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp
index 430142a72a..bb96135b2d 100644
--- a/src/USER-INTEL/bond_fene_intel.cpp
+++ b/src/USER-INTEL/bond_fene_intel.cpp
@@ -33,7 +33,7 @@ typedef struct { int a,b,t;  } int3_t;
 
 /* ---------------------------------------------------------------------- */
 
-BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) 
+BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
@@ -70,8 +70,8 @@ void BondFENEIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void BondFENEIntel::compute(int eflag, int vflag,
-				IntelBuffers<flt_t,acc_t> *buffers,
-				const ForceConst<flt_t> &fc)
+                                IntelBuffers<flt_t,acc_t> *buffers,
+                                const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -79,14 +79,14 @@ void BondFENEIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -97,9 +97,9 @@ void BondFENEIntel::compute(int eflag, int vflag,
 }
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void BondFENEIntel::eval(const int vflag, 
-			 IntelBuffers<flt_t,acc_t> *buffers,
-			 const ForceConst<flt_t> &fc)
+void BondFENEIntel::eval(const int vflag,
+                         IntelBuffers<flt_t,acc_t> *buffers,
+                         const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nbondlist;
   if (inum == 0) return;
@@ -126,7 +126,7 @@ void BondFENEIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -141,7 +141,7 @@ void BondFENEIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int3_t * _noalias const bondlist = 
+    const int3_t * _noalias const bondlist =
       (int3_t *) neighbor->bondlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -176,7 +176,7 @@ void BondFENEIntel::eval(const int vflag,
       // if r -> r0, then rlogarg < 0.0 which is an error
       // issue a warning and reset rlogarg = epsilon
       // if r > 2*r0 something serious is wrong, abort
-      
+
       if (rlogarg < (flt_t)0.1) {
         char str[128];
         sprintf(str,"FENE bond too long: " BIGINT_FORMAT " "
@@ -186,18 +186,18 @@ void BondFENEIntel::eval(const int vflag,
         if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond");
         rlogarg = (flt_t)0.1;
       }
-      
+
       flt_t fbond = -k/rlogarg;
-      
+
       // force from LJ term
-      
+
       flt_t sr2,sr6;
       if (rsq < (flt_t)TWO_1_3*sigmasq) {
-	sr2 = sigmasq * irsq;
+        sr2 = sigmasq * irsq;
         sr6 = sr2 * sr2 * sr2;
         fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq;
       }
-      
+
       // energy
 
       flt_t ebond;
@@ -215,27 +215,27 @@ void BondFENEIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += delx*fbond;
-	  f[i1].y += dely*fbond;
-	  f[i1].z += delz*fbond;
+          f[i1].y += dely*fbond;
+          f[i1].z += delz*fbond;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= delx*fbond;
-	  f[i2].y -= dely*fbond;
-	  f[i2].z -= delz*fbond;
+          f[i2].y -= dely*fbond;
+          f[i2].z -= delz*fbond;
         }
-      } 
+      }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
-	IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, 
-                             delx, dely, delz, sebond, f, NEWTON_BOND, 
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
+                             delx, dely, delz, sebond, f, NEWTON_BOND,
                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
-	#else
-	IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, 
-                             delx, dely, delz, oebond, f, NEWTON_BOND, 
+        #else
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond,
+                             delx, dely, delz, oebond, f, NEWTON_BOND,
                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
-	#endif
+        #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -250,7 +250,7 @@ void BondFENEIntel::eval(const int vflag,
   if (EFLAG) energy += oebond;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -307,11 +307,11 @@ void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
-	                                              Memory *memory) {
+                                                      Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(fc);
-    
+
     if (nbondtypes > 0)
       _memory->create(fc,nbondtypes,"bondfeneintel.fc");
   }
diff --git a/src/USER-INTEL/bond_fene_intel.h b/src/USER-INTEL/bond_fene_intel.h
index d64f1e7254..89c3033096 100644
--- a/src/USER-INTEL/bond_fene_intel.h
+++ b/src/USER-INTEL/bond_fene_intel.h
@@ -45,8 +45,8 @@ class BondFENEIntel : public BondFENE {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp
index 1cccf5fe54..beb0ebcdda 100644
--- a/src/USER-INTEL/bond_harmonic_intel.cpp
+++ b/src/USER-INTEL/bond_harmonic_intel.cpp
@@ -33,7 +33,7 @@ typedef struct { int a,b,t;  } int3_t;
 
 /* ---------------------------------------------------------------------- */
 
-BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) 
+BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp)
 {
   suffix_flag |= Suffix::INTEL;
 }
@@ -70,8 +70,8 @@ void BondHarmonicIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void BondHarmonicIntel::compute(int eflag, int vflag,
-				IntelBuffers<flt_t,acc_t> *buffers,
-				const ForceConst<flt_t> &fc)
+                                IntelBuffers<flt_t,acc_t> *buffers,
+                                const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -79,14 +79,14 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -97,9 +97,9 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
 }
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void BondHarmonicIntel::eval(const int vflag, 
-			     IntelBuffers<flt_t,acc_t> *buffers,
-			     const ForceConst<flt_t> &fc)
+void BondHarmonicIntel::eval(const int vflag,
+                             IntelBuffers<flt_t,acc_t> *buffers,
+                             const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nbondlist;
   if (inum == 0) return;
@@ -126,7 +126,7 @@ void BondHarmonicIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -141,7 +141,7 @@ void BondHarmonicIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int3_t * _noalias const bondlist = 
+    const int3_t * _noalias const bondlist =
       (int3_t *) neighbor->bondlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -184,29 +184,29 @@ void BondHarmonicIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += delx*fbond;
-	  f[i1].y += dely*fbond;
-	  f[i1].z += delz*fbond;
+          f[i1].y += dely*fbond;
+          f[i1].z += delz*fbond;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x -= delx*fbond;
-	  f[i2].y -= dely*fbond;
-	  f[i2].z -= delz*fbond;
+          f[i2].y -= dely*fbond;
+          f[i2].z -= delz*fbond;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
-        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, 
-                             fbond, delx, dely, delz, sebond, f, 
-                             NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, 
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
+                             fbond, delx, dely, delz, sebond, f,
+                             NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3,
                              sv4, sv5);
-	#else
-        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, 
-                             fbond, delx, dely, delz, oebond, f, 
-                             NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, 
+        #else
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2,
+                             fbond, delx, dely, delz, oebond, f,
+                             NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3,
                              ov4, ov5);
-	#endif
+        #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -221,7 +221,7 @@ void BondHarmonicIntel::eval(const int vflag,
   if (EFLAG) energy += oebond;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -276,11 +276,11 @@ void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
-	                                              Memory *memory) {
+                                                      Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(fc);
-    
+
     if (nbondtypes > 0)
       _memory->create(fc,nbondtypes,"bondharmonicintel.fc");
   }
diff --git a/src/USER-INTEL/bond_harmonic_intel.h b/src/USER-INTEL/bond_harmonic_intel.h
index 0de844cddf..8fc04f432a 100644
--- a/src/USER-INTEL/bond_harmonic_intel.h
+++ b/src/USER-INTEL/bond_harmonic_intel.h
@@ -45,8 +45,8 @@ class BondHarmonicIntel : public BondHarmonic {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp
index df8834c283..715cef4d37 100644
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@@ -80,8 +80,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::compute(int eflag, int vflag,
-				  IntelBuffers<flt_t,acc_t> *buffers,
-				  const ForceConst<flt_t> &fc)
+                                  IntelBuffers<flt_t,acc_t> *buffers,
+                                  const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -95,14 +95,14 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -115,9 +115,9 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
 #ifndef LMP_USE_AVXCD_DHC
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void DihedralCharmmIntel::eval(const int vflag, 
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+void DihedralCharmmIntel::eval(const int vflag,
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
@@ -148,9 +148,9 @@ void DihedralCharmmIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
-	      opv0,opv1,opv2,opv3,opv4,opv5)
+              opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     #if defined(LMP_SIMD_COMPILER_TEST)
@@ -165,7 +165,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int5_t * _noalias const dihedrallist = 
+    const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
     const flt_t qqrd2e = force->qqrd2e;
 
@@ -180,7 +180,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     #if defined(LMP_SIMD_COMPILER_TEST)
     #pragma vector aligned
     #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
-                           sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) 
+                           sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5)
     for (int n = nfrom; n < nto; n++) {
     #endif
     for (int n = nfrom; n < nto; n += npl) {
@@ -204,7 +204,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
-      
+
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
@@ -244,25 +244,25 @@ void DihedralCharmmIntel::eval(const int vflag,
       // error check
       #ifndef LMP_SIMD_COMPILER_TEST
       if (c > PTOLERANCE || c < MTOLERANCE) {
-	int me = comm->me;
-
-	if (screen) {
-	  char str[128];
-	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT,
-		  me,tid,update->ntimestep,
-		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
-	  error->warning(FLERR,str,0);
-	  fprintf(screen,"  1st atom: %d %g %g %g\n",
-		  me,x[i1].x,x[i1].y,x[i1].z);
-	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
-		  me,x[i2].x,x[i2].y,x[i2].z);
-	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
-		  me,x[i3].x,x[i3].y,x[i3].z);
-	  fprintf(screen,"  4th atom: %d %g %g %g\n",
-		  me,x[i4].x,x[i4].y,x[i4].z);
-	}
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
       }
       #endif
 
@@ -279,19 +279,19 @@ void DihedralCharmmIntel::eval(const int vflag,
       ddf1 = df1 = (flt_t)0.0;
 
       for (int i = 0; i < m; i++) {
-	ddf1 = p*c - df1*s;
-	df1 = p*s + df1*c;
-	p = ddf1;
+        ddf1 = p*c - df1*s;
+        df1 = p*s + df1*c;
+        p = ddf1;
       }
 
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
-      
+
       if (m == 0) {
-	p = (flt_t)1.0 + tcos_shift;
-	df1 = (flt_t)0.0;
+        p = (flt_t)1.0 + tcos_shift;
+        df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
@@ -334,12 +334,12 @@ void DihedralCharmmIntel::eval(const int vflag,
       const flt_t f3z = -sz2 - f4z;
 
       if (EFLAG || VFLAG) {
-	flt_t deng;
-	if (EFLAG) deng = tk * p;
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, 
-                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
+        flt_t deng;
+        if (EFLAG) deng = tk * p;
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3,
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
                               f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
-                              vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, 
+                              vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
@@ -349,15 +349,15 @@ void DihedralCharmmIntel::eval(const int vflag,
       #endif
       {
         if (NEWTON_BOND || i2 < nlocal) {
-	  f[i2].x += f2x;
-	  f[i2].y += f2y;
-	  f[i2].z += f2z;
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
-	  f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
       }
 
@@ -372,54 +372,54 @@ void DihedralCharmmIntel::eval(const int vflag,
       flt_t forcecoul;
       if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
       else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
-      const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - 
-				     fc.ljp[itype][jtype].lj2);
+      const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv -
+                                     fc.ljp[itype][jtype].lj2);
       const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
 
       if (NEWTON_BOND || i1 < nlocal) {
-	f1x += delx*fpair;
-	f1y += dely*fpair;
-	f1z += delz*fpair;
+        f1x += delx*fpair;
+        f1y += dely*fpair;
+        f1z += delz*fpair;
       }
       if (NEWTON_BOND || i4 < nlocal) {
-	f4x -= delx*fpair;
-	f4y -= dely*fpair;
-	f4z -= delz*fpair;
+        f4x -= delx*fpair;
+        f4y -= dely*fpair;
+        f4z -= delz*fpair;
       }
 
       if (EFLAG || VFLAG) {
-	flt_t ev_pre = (flt_t)0;
-	if (NEWTON_BOND || i1 < nlocal)
-	  ev_pre += (flt_t)0.5;
-	if (NEWTON_BOND || i4 < nlocal)
-	  ev_pre += (flt_t)0.5;
-
-	if (EFLAG) {
-	  flt_t ecoul, evdwl;
-	  ecoul = tweight * forcecoul;
-	  evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - 
-				     fc.ljp[itype][jtype].lj4);
-	  secoul += ev_pre * ecoul;
-	  sevdwl += ev_pre * evdwl;
-	  if (eatom) {
-	    evdwl *= (flt_t)0.5;
-	    evdwl += (flt_t)0.5 * ecoul;
-	    if (NEWTON_BOND || i1 < nlocal)
-	      f[i1].w += evdwl;
-	    if (NEWTON_BOND || i4 < nlocal)
-	      f[i4].w += evdwl;
-	  }
-	}
-	//	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
-	//				   delx, dely, delz);
-	if (VFLAG && vflag) {
-	  spv0 += ev_pre * delx * delx * fpair;
-	  spv1 += ev_pre * dely * dely * fpair;
-	  spv2 += ev_pre * delz * delz * fpair;
-	  spv3 += ev_pre * delx * dely * fpair;
-	  spv4 += ev_pre * delx * delz * fpair;
-	  spv5 += ev_pre * dely * delz * fpair;
-	}                                                                    
+        flt_t ev_pre = (flt_t)0;
+        if (NEWTON_BOND || i1 < nlocal)
+          ev_pre += (flt_t)0.5;
+        if (NEWTON_BOND || i4 < nlocal)
+          ev_pre += (flt_t)0.5;
+
+        if (EFLAG) {
+          flt_t ecoul, evdwl;
+          ecoul = tweight * forcecoul;
+          evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv -
+                                     fc.ljp[itype][jtype].lj4);
+          secoul += ev_pre * ecoul;
+          sevdwl += ev_pre * evdwl;
+          if (eatom) {
+            evdwl *= (flt_t)0.5;
+            evdwl += (flt_t)0.5 * ecoul;
+            if (NEWTON_BOND || i1 < nlocal)
+              f[i1].w += evdwl;
+            if (NEWTON_BOND || i4 < nlocal)
+              f[i4].w += evdwl;
+          }
+        }
+        //            IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+        //                                 delx, dely, delz);
+        if (VFLAG && vflag) {
+          spv0 += ev_pre * delx * delx * fpair;
+          spv1 += ev_pre * dely * dely * fpair;
+          spv2 += ev_pre * delz * delz * fpair;
+          spv3 += ev_pre * delx * dely * fpair;
+          spv4 += ev_pre * delx * delz * fpair;
+          spv5 += ev_pre * dely * delz * fpair;
+        }
       }
 
       // apply force to each of 4 atoms
@@ -428,15 +428,15 @@ void DihedralCharmmIntel::eval(const int vflag,
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
-	  f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
-	  f[i4].x += f4x;
-	  f[i4].y += f4y;
-	  f[i4].z += f4z;
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
         }
       }
     } // for n
@@ -447,7 +447,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     }
     if (VFLAG && vflag) {
       ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-      opv0 += spv0; opv1 += spv1; opv2 += spv2; 
+      opv0 += spv0; opv1 += spv1; opv2 += spv2;
       opv3 += spv3; opv4 += spv4; opv5 += spv5;
     }
   } // omp parallel
@@ -485,9 +485,9 @@ authors for more details.
 ------------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void DihedralCharmmIntel::eval(const int vflag, 
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+void DihedralCharmmIntel::eval(const int vflag,
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
@@ -522,20 +522,20 @@ void DihedralCharmmIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
-	      opv0,opv1,opv2,opv3,opv4,opv5)
+              opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
     int nfrom, npl, nto, tid;
     IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
-			     swidth);
+                             swidth);
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int * _noalias const dihedrallist = 
+    const int * _noalias const dihedrallist =
       (int *) neighbor->dihedrallist[0];
     const flt_t * _noalias const weight = &(fc.weight[0]);
     const flt_t * _noalias const x_f = &(x[0].x);
@@ -574,7 +574,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     }
 
     SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
-				 55, 60, 65, 70, 75) + (nfrom * 5);
+                                 55, 60, 65, 70, 75) + (nfrom * 5);
     const int nto5 = nto * 5;
     const int nlocals4 = nlocal << 4;
     const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
@@ -618,7 +618,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       const SIMD_flt_t vb2zm = z2 - z3;
 
       // 3rd bond
-      
+
       SIMD_flt_t x4, y4, z4;
       SIMD_int jtype;
 
@@ -664,7 +664,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       const SIMD_flt_t ptol = SIMD_set(PTOLERANCE);
       const SIMD_flt_t ntol = SIMD_set(MTOLERANCE);
       if (c > ptol || c < ntol)
-	if (screen)
+        if (screen)
           error->warning(FLERR,"Dihedral problem.");
 
       c = SIMD_set(c, c > one, one);
@@ -678,14 +678,14 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_flt_t p(one);
       SIMD_flt_t ddf1(szero);
       SIMD_flt_t df1(szero);
-      
+
       const int m_max = SIMD_max(m);
 
       for (int i = 0; i < m_max; i++) {
-	const SIMD_mask my_m = i < m;
-	ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
-	df1 = SIMD_set(df1, my_m, p*s + df1*c);
-	p = SIMD_set(p, my_m, ddf1);
+        const SIMD_mask my_m = i < m;
+        ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s);
+        df1 = SIMD_set(df1, my_m, p*s + df1*c);
+        p = SIMD_set(p, my_m, ddf1);
       }
 
       SIMD_flt_t multf;
@@ -694,7 +694,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 = df1 * multf;
       p = p + one;
-      
+
       SIMD_mask mzero = (m == SIMD_set((int)0));
       p = SIMD_set(p, mzero, one + tcos_shift);
       df1 = SIMD_set(df1, mzero, szero);
@@ -740,40 +740,40 @@ void DihedralCharmmIntel::eval(const int vflag,
 
       SIMD_flt_t qdeng;
       if (EFLAG || VFLAG) {
-	SIMD_flt_t ev_pre;
-	if (NEWTON_BOND) ev_pre = one;
-	else {
-	  ev_pre = szero;
-	  const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
-	  ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
-	  ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
-	  ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
-	  ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
-	}
-	SIMD_zero_masked(nmask, ev_pre);
-	if (EFLAG) {
-	  const SIMD_flt_t deng = tk * p;
-	  sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
-	  if (eatom) {
-	    qdeng = deng * SIMD_set((flt_t)0.25);
-	    SIMD_mask newton_mask;
-	    if (NEWTON_BOND) newton_mask = nmask;
-	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
-	    SIMD_flt_t ieng = qdeng;
-	    SIMD_jeng_update(newton_mask, featom, i2, ieng);
-	    ieng = qdeng;
-	    if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
-	    SIMD_jeng_update(newton_mask, featom, i3, ieng);
-	  }
-	}
-	if (VFLAG && vflag) {
+        SIMD_flt_t ev_pre;
+        if (NEWTON_BOND) ev_pre = one;
+        else {
+          ev_pre = szero;
+          const SIMD_flt_t quarter = SIMD_set((flt_t)0.25);
+          ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter);
+          ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter);
+          ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter);
+          ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter);
+        }
+        SIMD_zero_masked(nmask, ev_pre);
+        if (EFLAG) {
+          const SIMD_flt_t deng = tk * p;
+          sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng);
+          if (eatom) {
+            qdeng = deng * SIMD_set((flt_t)0.25);
+            SIMD_mask newton_mask;
+            if (NEWTON_BOND) newton_mask = nmask;
+            if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4);
+            SIMD_flt_t ieng = qdeng;
+            SIMD_jeng_update(newton_mask, featom, i2, ieng);
+            ieng = qdeng;
+            if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4);
+            SIMD_jeng_update(newton_mask, featom, i3, ieng);
+          }
+        }
+        if (VFLAG && vflag) {
           sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
-	  sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
-	  sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
-	  sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
-	  sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
-	  sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
-	}
+          sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
+          sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
+          sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y));
+          sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z));
+          sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z));
+        }
       }
 
       SIMD_mask newton_mask;
@@ -809,27 +809,27 @@ void DihedralCharmmIntel::eval(const int vflag,
       f4z = f4z - delz * fpair;
 
       if (EFLAG || VFLAG) {
-	SIMD_flt_t ev_pre;
-	if (NEWTON_BOND) ev_pre = one;
-	else {
-	  ev_pre = szero;
+        SIMD_flt_t ev_pre;
+        if (NEWTON_BOND) ev_pre = one;
+        else {
+          ev_pre = szero;
           const SIMD_flt_t half = SIMD_set((flt_t)0.5);
           ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half);
           ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half);
-	}
-	SIMD_zero_masked(nmask, ev_pre);
-
-	if (EFLAG) {
-	  const SIMD_flt_t ecoul = tweight * forcecoul;
-	  const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
-	  const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
-	  SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
-	  secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
-	  sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
-	  if (eatom) {
- 	    const SIMD_flt_t half = SIMD_set((flt_t)0.5);
-	    evdwl = evdwl * half;
-	    evdwl = evdwl + half * ecoul + qdeng;
+        }
+        SIMD_zero_masked(nmask, ev_pre);
+
+        if (EFLAG) {
+          const SIMD_flt_t ecoul = tweight * forcecoul;
+          const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype);
+          const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype);
+          SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4);
+          secoul = SIMD_ev_add(secoul, ev_pre * ecoul);
+          sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl);
+          if (eatom) {
+            const SIMD_flt_t half = SIMD_set((flt_t)0.5);
+            evdwl = evdwl * half;
+            evdwl = evdwl + half * ecoul + qdeng;
 
             if (NEWTON_BOND) newton_mask = nmask;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4);
@@ -838,16 +838,16 @@ void DihedralCharmmIntel::eval(const int vflag,
             ieng = evdwl;
             if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4);
             SIMD_jeng_update(newton_mask, featom, i4, ieng);
-	  }
-	}
-	if (VFLAG && vflag) {
+          }
+        }
+        if (VFLAG && vflag) {
           spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
-	  spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
-	  spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
-	  spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
-	  spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
-	  spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
-	}                                                                    
+          spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
+          spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
+          spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair);
+          spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair);
+          spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair);
+        }
       }
 
       if (NEWTON_BOND) newton_mask = nmask;
@@ -863,17 +863,17 @@ void DihedralCharmmIntel::eval(const int vflag,
       oevdwl += SIMD_sum(sevdwl);
     }
     if (VFLAG && vflag) {
-      ov0 += SIMD_sum(sv0); 
-      ov1 += SIMD_sum(sv1); 
-      ov2 += SIMD_sum(sv2); 
-      ov3 += SIMD_sum(sv3); 
-      ov4 += SIMD_sum(sv4); 
+      ov0 += SIMD_sum(sv0);
+      ov1 += SIMD_sum(sv1);
+      ov2 += SIMD_sum(sv2);
+      ov3 += SIMD_sum(sv3);
+      ov4 += SIMD_sum(sv4);
       ov5 += SIMD_sum(sv5);
-      opv0 += SIMD_sum(spv0); 
-      opv1 += SIMD_sum(spv1); 
-      opv2 += SIMD_sum(spv2); 
-      opv3 += SIMD_sum(spv3); 
-      opv4 += SIMD_sum(spv4); 
+      opv0 += SIMD_sum(spv0);
+      opv1 += SIMD_sum(spv1);
+      opv2 += SIMD_sum(spv2);
+      opv3 += SIMD_sum(spv3);
+      opv4 += SIMD_sum(spv4);
       opv5 += SIMD_sum(spv5);
     }
   } // omp parallel
@@ -933,7 +933,7 @@ void DihedralCharmmIntel::init_style()
 
 template <class flt_t, class acc_t>
 void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
-	                                   IntelBuffers<flt_t,acc_t> *buffers)
+                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
 
   const int tp1 = atom->ntypes + 1;
@@ -944,10 +944,10 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
   if (weightflag) {
     for (int i = 0; i < tp1; i++) {
       for (int j = 0; j < tp1; j++) {
-	fc.ljp[i][j].lj1 = lj14_1[i][j];
-	fc.ljp[i][j].lj2 = lj14_2[i][j];
-	fc.ljp[i][j].lj3 = lj14_3[i][j];
-	fc.ljp[i][j].lj4 = lj14_4[i][j];
+        fc.ljp[i][j].lj1 = lj14_1[i][j];
+        fc.ljp[i][j].lj2 = lj14_2[i][j];
+        fc.ljp[i][j].lj3 = lj14_3[i][j];
+        fc.ljp[i][j].lj4 = lj14_4[i][j];
       }
     }
   }
@@ -965,8 +965,8 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
-            	                                        const int nbondtypes,
-	                                                Memory *memory) {
+                                                        const int nbondtypes,
+                                                        Memory *memory) {
   if (npairtypes != _npairtypes) {
     if (_npairtypes > 0)
       _memory->destroy(ljp);
@@ -979,7 +979,7 @@ void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
       _memory->destroy(bp);
       _memory->destroy(weight);
     }
-    
+
     if (nbondtypes > 0) {
       _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
       _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
diff --git a/src/USER-INTEL/dihedral_charmm_intel.h b/src/USER-INTEL/dihedral_charmm_intel.h
index 292faea9f9..d80b32c8ac 100644
--- a/src/USER-INTEL/dihedral_charmm_intel.h
+++ b/src/USER-INTEL/dihedral_charmm_intel.h
@@ -44,8 +44,8 @@ class DihedralCharmmIntel : public DihedralCharmm {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
@@ -58,7 +58,7 @@ class DihedralCharmmIntel : public DihedralCharmm {
   class ForceConst {
    public:
     typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
-    typedef struct { flt_t cos_shift, sin_shift, k; 
+    typedef struct { flt_t cos_shift, sin_shift, k;
                      int multiplicity; } fc_packed3;
 
     fc_packed1 **ljp;
diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp
index 94130f4355..196b024fa6 100644
--- a/src/USER-INTEL/dihedral_harmonic_intel.cpp
+++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp
@@ -69,8 +69,8 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::compute(int eflag, int vflag,
-				  IntelBuffers<flt_t,acc_t> *buffers,
-				  const ForceConst<flt_t> &fc)
+                                  IntelBuffers<flt_t,acc_t> *buffers,
+                                  const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -79,14 +79,14 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -97,9 +97,9 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
 }
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void DihedralHarmonicIntel::eval(const int vflag, 
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+void DihedralHarmonicIntel::eval(const int vflag,
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
@@ -127,7 +127,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -142,7 +142,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int5_t * _noalias const dihedrallist = 
+    const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -175,7 +175,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
-      
+
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
@@ -207,25 +207,25 @@ void DihedralHarmonicIntel::eval(const int vflag,
       // error check
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
-	int me = comm->me;
-
-	if (screen) {
-	  char str[128];
-	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT,
-		  me,tid,update->ntimestep,
-		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
-	  error->warning(FLERR,str,0);
-	  fprintf(screen,"  1st atom: %d %g %g %g\n",
-		  me,x[i1].x,x[i1].y,x[i1].z);
-	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
-		  me,x[i2].x,x[i2].y,x[i2].z);
-	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
-		  me,x[i3].x,x[i3].y,x[i3].z);
-	  fprintf(screen,"  4th atom: %d %g %g %g\n",
-		  me,x[i4].x,x[i4].y,x[i4].z);
-	}
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
       }
       #endif
 
@@ -242,19 +242,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
       ddf1 = df1 = (flt_t)0.0;
 
       for (int i = 0; i < m; i++) {
-	ddf1 = p*c - df1*s;
-	df1 = p*s + df1*c;
-	p = ddf1;
+        ddf1 = p*c - df1*s;
+        df1 = p*s + df1*c;
+        p = ddf1;
       }
 
       p = p*tcos_shift + df1*tsin_shift;
       df1 = df1*tcos_shift - ddf1*tsin_shift;
       df1 *= -m;
       p += (flt_t)1.0;
-      
+
       if (m == 0) {
-	p = (flt_t)1.0 + tcos_shift;
-	df1 = (flt_t)0.0;
+        p = (flt_t)1.0 + tcos_shift;
+        df1 = (flt_t)0.0;
       }
 
       const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
@@ -297,20 +297,20 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t f3z = -sz2 - f4z;
 
       if (EFLAG || VFLAG) {
-	flt_t deng;
-	if (EFLAG) deng = tk * p;
-	#ifdef LMP_INTEL_USE_SIMDOFF
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
-	                      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-	                      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
-	                      vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
-			      sv0, sv1, sv2, sv3, sv4, sv5);
+        flt_t deng;
+        if (EFLAG) deng = tk * p;
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
         #else
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
-	                      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-	                      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
-	                      vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
-			      ov0, ov1, ov2, ov3, ov4, ov5);
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
         #endif
       }
 
@@ -319,35 +319,35 @@ void DihedralHarmonicIntel::eval(const int vflag,
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
-	  f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
-	  f[i2].x += f2x;
-	  f[i2].y += f2y;
-	  f[i2].z += f2z;
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
-	  f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
-	  f[i4].x += f4x;
-	  f[i4].y += f4y;
-	  f[i4].z += f4z;
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
         }
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oedihedral += sedihedral;
     if (VFLAG && vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
-	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
@@ -395,7 +395,7 @@ void DihedralHarmonicIntel::init_style()
 
 template <class flt_t, class acc_t>
 void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
-	                                     IntelBuffers<flt_t,acc_t> *buffers)
+                                             IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(bp1,memory);
@@ -412,11 +412,11 @@ void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
-	                                                  Memory *memory) {
+                                                          Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(bp);
-    
+
     if (nbondtypes > 0)
       _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
   }
diff --git a/src/USER-INTEL/dihedral_harmonic_intel.h b/src/USER-INTEL/dihedral_harmonic_intel.h
index 41e3d20540..0a9cfaa042 100644
--- a/src/USER-INTEL/dihedral_harmonic_intel.h
+++ b/src/USER-INTEL/dihedral_harmonic_intel.h
@@ -44,8 +44,8 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
@@ -57,7 +57,7 @@ class DihedralHarmonicIntel : public DihedralHarmonic {
   template <class flt_t>
   class ForceConst {
    public:
-    typedef struct { flt_t cos_shift, sin_shift, k; 
+    typedef struct { flt_t cos_shift, sin_shift, k;
                      int multiplicity; } fc_packed1;
 
     fc_packed1 *bp;
diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp
index 3248a8bfc7..1abeba1d5e 100644
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -73,8 +73,8 @@ void DihedralOPLSIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::compute(int eflag, int vflag,
-				  IntelBuffers<flt_t,acc_t> *buffers,
-				  const ForceConst<flt_t> &fc)
+                                  IntelBuffers<flt_t,acc_t> *buffers,
+                                  const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -83,14 +83,14 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -101,9 +101,9 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
 }
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void DihedralOPLSIntel::eval(const int vflag, 
-			       IntelBuffers<flt_t,acc_t> *buffers,
-			       const ForceConst<flt_t> &fc)
+void DihedralOPLSIntel::eval(const int vflag,
+                               IntelBuffers<flt_t,acc_t> *buffers,
+                               const ForceConst<flt_t> &fc)
 
 {
   const int inum = neighbor->ndihedrallist;
@@ -131,7 +131,7 @@ void DihedralOPLSIntel::eval(const int vflag,
 
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
-    shared(f_start,f_stride,fc)		  \
+    shared(f_start,f_stride,fc)           \
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
@@ -146,7 +146,7 @@ void DihedralOPLSIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int5_t * _noalias const dihedrallist = 
+    const int5_t * _noalias const dihedrallist =
       (int5_t *) neighbor->dihedrallist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -179,7 +179,7 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t vb2zm = x[i2].z - x[i3].z;
 
       // 3rd bond
-      
+
       const flt_t vb3x = x[i4].x - x[i3].x;
       const flt_t vb3y = x[i4].y - x[i3].y;
       const flt_t vb3z = x[i4].z - x[i3].z;
@@ -209,7 +209,7 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
 
       flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm;
-      const flt_t r12c1 =  rb1 * rb2; 
+      const flt_t r12c1 =  rb1 * rb2;
       const flt_t c1mag = ctmp * r12c1;
 
       ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
@@ -240,25 +240,25 @@ void DihedralOPLSIntel::eval(const int vflag,
       // error check
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
-	int me = comm->me;
-
-	if (screen) {
-	  char str[128];
-	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT " "
-		  TAGINT_FORMAT " " TAGINT_FORMAT,
-		  me,tid,update->ntimestep,
-		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
-	  error->warning(FLERR,str,0);
-	  fprintf(screen,"  1st atom: %d %g %g %g\n",
-		  me,x[i1].x,x[i1].y,x[i1].z);
-	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
-		  me,x[i2].x,x[i2].y,x[i2].z);
-	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
-		  me,x[i3].x,x[i3].y,x[i3].z);
-	  fprintf(screen,"  4th atom: %d %g %g %g\n",
-		  me,x[i4].x,x[i4].y,x[i4].z);
-	}
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
       }
       #endif
 
@@ -283,14 +283,14 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim;
 
       flt_t p, pd;
-      p = fc.bp[type].k1*((flt_t)1.0 + c) + 
-	  fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + 
-	  fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
-	  fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ;
-      pd = fc.bp[type].k1 - 
-	   (flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
-	   (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - 
-	   (flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
+      p = fc.bp[type].k1*((flt_t)1.0 + c) +
+          fc.bp[type].k2*((flt_t)1.0 - cos_2phi) +
+          fc.bp[type].k3*((flt_t)1.0 + cos_3phi) +
+          fc.bp[type].k4*((flt_t)1.0 - cos_4phi);
+      pd = fc.bp[type].k1 -
+           (flt_t)2.0 * fc.bp[type].k2 * sin_2phim +
+           (flt_t)3.0 * fc.bp[type].k3 * sin_3phim -
+           (flt_t)4.0 * fc.bp[type].k4 * sin_4phim;
 
       flt_t edihed;
       if (EFLAG) edihed = p;
@@ -327,18 +327,18 @@ void DihedralOPLSIntel::eval(const int vflag,
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, 
-			      i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-			      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
-			      vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
-			      sv0, sv1, sv2, sv3, sv4, sv5);
-	#else
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, 
-			      i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-			      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
-			      vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
-			      ov0, ov1, ov2, ov3, ov4, ov5);
-	#endif
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3,
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+        #endif
       }
 
       #ifdef LMP_INTEL_USE_SIMDOFF
@@ -346,35 +346,35 @@ void DihedralOPLSIntel::eval(const int vflag,
       #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
-	  f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
-	  f[i2].x += f2x;
-	  f[i2].y += f2y;
-	  f[i2].z += f2z;
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
-	  f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
-	  f[i4].x += f4x;
-	  f[i4].y += f4y;
-	  f[i4].z += f4z;
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
         }
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
     if (EFLAG) oedihedral += sedihedral;
     if (VFLAG && vflag) {
-        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
-	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
     #endif
   } // omp parallel
@@ -422,7 +422,7 @@ void DihedralOPLSIntel::init_style()
 
 template <class flt_t, class acc_t>
 void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
-	                                     IntelBuffers<flt_t,acc_t> *buffers)
+                                             IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->ndihedraltypes + 1;
   fc.set_ntypes(bp1,memory);
@@ -439,11 +439,11 @@ void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
-	                                                  Memory *memory) {
+                                                          Memory *memory) {
   if (nbondtypes != _nbondtypes) {
     if (_nbondtypes > 0)
       _memory->destroy(bp);
-    
+
     if (nbondtypes > 0)
       _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
   }
diff --git a/src/USER-INTEL/dihedral_opls_intel.h b/src/USER-INTEL/dihedral_opls_intel.h
index ea0930f4b8..1080bfa6c3 100644
--- a/src/USER-INTEL/dihedral_opls_intel.h
+++ b/src/USER-INTEL/dihedral_opls_intel.h
@@ -44,8 +44,8 @@ class DihedralOPLSIntel : public DihedralOPLS {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index e132947750..b06f76c90d 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -96,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   _allow_separate_buffers = 1;
   _offload_ghost = -1;
   _lrt = 0;
-  
+
   int iarg = 4;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"omp") == 0) {
@@ -141,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
       else error->all(FLERR,"Illegal package intel command");
       iarg += 2;
     }
-  
+
     // undocumented options
 
     else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
@@ -179,7 +179,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
     _real_space_comm = MPI_COMM_WORLD;
     if (no_affinity == 0)
       if (set_host_affinity(nomp) != 0)
-	error->all(FLERR,"Could not set host affinity for offload tasks");
+        error->all(FLERR,"Could not set host affinity for offload tasks");
   }
 
   int max_offload_threads = 0, offload_cores = 0;
@@ -264,7 +264,7 @@ FixIntel::~FixIntel()
     double *time2 = off_watch_neighbor();
     int *overflow = get_off_overflow_flag();
     if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
-	overflow != NULL) {
+        overflow != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
     }
@@ -320,11 +320,11 @@ void FixIntel::init()
       if (strstr(hybrid->keywords[i], "/intel") != NULL)
         nstyles++;
       else
-	force->pair->no_virial_fdotr_compute = 1;
+        force->pair->no_virial_fdotr_compute = 1;
   }
   if (nstyles > 1)
     error->all(FLERR,
-	       "Currently, cannot use more than one intel style with hybrid.");
+               "Currently, cannot use more than one intel style with hybrid.");
 
   check_neighbor_intel();
   int off_mode = 0;
@@ -349,13 +349,13 @@ void FixIntel::setup(int vflag)
 {
   if (neighbor->style != BIN)
     error->all(FLERR,
-	    "Currently, neighbor style BIN must be used with Intel package.");
+            "Currently, neighbor style BIN must be used with Intel package.");
   if (neighbor->exclude_setting() != 0)
     error->all(FLERR,
-	    "Currently, cannot use neigh_modify exclude with Intel package.");
+            "Currently, cannot use neigh_modify exclude with Intel package.");
   if (vflag_atom)
    error->all(FLERR,
-	       "Cannot currently get per-atom virials with Intel package.");
+               "Cannot currently get per-atom virials with Intel package.");
   #ifdef _LMP_INTEL_OFFLOAD
   post_force(vflag);
   #endif
@@ -392,7 +392,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
     double *time2 = off_watch_neighbor();
     int *overflow = get_off_overflow_flag();
     if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
-	overflow != NULL) {
+        overflow != NULL) {
       #pragma offload_transfer target(mic:_cop)  \
         nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
         in(overflow:length(5) alloc_if(1) free_if(0))
@@ -407,7 +407,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
     error->warning(FLERR, "Unknown Intel Compiler Version\n");
     #else
     if (__INTEL_COMPILER_BUILD_DATE != 20131008 &&
-	__INTEL_COMPILER_BUILD_DATE < 20141023)
+        __INTEL_COMPILER_BUILD_DATE < 20141023)
       error->warning(FLERR, "Unsupported Intel Compiler.");
     #endif
     #if !defined(__INTEL_COMPILER)
@@ -438,24 +438,24 @@ void FixIntel::pair_init_check(const bool cdmessage)
   if (comm->me == 0) {
     if (screen) {
       fprintf(screen,
-	      "----------------------------------------------------------\n");
+              "----------------------------------------------------------\n");
       if (_offload_balance != 0.0) {
         fprintf(screen,"Using Intel Coprocessor with %d threads per core, ",
-		_offload_tpc);
+                _offload_tpc);
         fprintf(screen,"%d threads per task\n",_offload_threads);
       } else {
-	fprintf(screen,"Using Intel Package without Coprocessor.\n");
+        fprintf(screen,"Using Intel Package without Coprocessor.\n");
       }
       fprintf(screen,"Precision: %s\n",kmode);
       if (cdmessage) {
-	#ifdef LMP_USE_AVXCD
-	fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
-	#else
-	fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
-	#endif
+        #ifdef LMP_USE_AVXCD
+        fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
+        #else
+        fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
+        #endif
       }
       fprintf(screen,
-	      "----------------------------------------------------------\n");
+              "----------------------------------------------------------\n");
     }
   }
 }
@@ -464,7 +464,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
 
 void FixIntel::bond_init_check()
 {
-  if (_offload_balance != 0.0 && atom->molecular && 
+  if (_offload_balance != 0.0 && atom->molecular &&
       force->newton_pair != force->newton_bond)
     error->all(FLERR,
       "USER-INTEL package requires same setting for newton bond and non-bond.");
@@ -573,7 +573,7 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
   int o_range, f_stride;
   if (force->newton_pair)
     o_range = atom->nlocal + atom->nghost;
-  else		
+  else
     o_range = atom->nlocal;
   IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
 
@@ -588,18 +588,18 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
       _use_simd_pragma("vector aligned")
       _use_simd_pragma("simd")
       for (int n = 0; n < o_range; n++)
-	f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
+        f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
     } else if (_nthreads == 2) {
       _use_simd_pragma("vector aligned")
       _use_simd_pragma("simd")
       for (int n = 0; n < o_range; n++)
-	f_scalar[n] += f_scalar2[n];
+        f_scalar[n] += f_scalar2[n];
     } else {
       acc_t *f_scalar3 = f_scalar2 + f_stride4;
       _use_simd_pragma("vector aligned")
       _use_simd_pragma("simd")
       for (int n = 0; n < o_range; n++)
-	f_scalar[n] += f_scalar2[n] + f_scalar3[n];
+        f_scalar[n] += f_scalar2[n] + f_scalar3[n];
     }
   } else {
     #if defined(_OPENMP)
@@ -608,13 +608,13 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
     {
       int iifrom, iito, tid;
       IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
-				sizeof(acc_t));
+                                sizeof(acc_t));
 
       acc_t *f_scalar2 = f_scalar + f_stride4;
       for (int t = 1; t < _nthreads; t++) {
-	_use_simd_pragma("vector aligned")
-	_use_simd_pragma("simd")
-	for (int n = iifrom; n < iito; n++)
+        _use_simd_pragma("vector aligned")
+        _use_simd_pragma("simd")
+        for (int n = iifrom; n < iito; n++)
           f_scalar[n] += f_scalar2[n];
         f_scalar2 += f_stride4;
       }
@@ -648,33 +648,33 @@ template <class ft, class acc_t>
 void FixIntel::add_results(const ft * _noalias const f_in,
                            const acc_t * _noalias const ev_global,
                            const int eatom, const int vatom,
-			   const int offload) {
+                           const int offload) {
   start_watch(TIME_PACK);
   int f_length;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
     if (offload) {
       if (force->newton_pair) {
-	add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
-	const acc_t * _noalias const enull = 0;
-	int offset = _offload_nlocal;
-	if (atom->torque) offset *= 2;
-	add_oresults(f_in + offset, enull, eatom, vatom,
-		     _offload_min_ghost, _offload_nghost);
+        add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
+        const acc_t * _noalias const enull = 0;
+        int offset = _offload_nlocal;
+        if (atom->torque) offset *= 2;
+        add_oresults(f_in + offset, enull, eatom, vatom,
+                     _offload_min_ghost, _offload_nghost);
       } else
-	add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
+        add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
     } else {
       if (force->newton_pair) {
-	add_oresults(f_in, ev_global, eatom, vatom,
-		     _host_min_local, _host_used_local);
-	const acc_t * _noalias const enull = 0;
-	int offset = _host_used_local;
-	if (atom->torque) offset *= 2;
-	add_oresults(f_in + offset, enull, eatom,
-		     vatom, _host_min_ghost, _host_used_ghost);
+        add_oresults(f_in, ev_global, eatom, vatom,
+                     _host_min_local, _host_used_local);
+        const acc_t * _noalias const enull = 0;
+        int offset = _host_used_local;
+        if (atom->torque) offset *= 2;
+        add_oresults(f_in + offset, enull, eatom,
+                     vatom, _host_min_ghost, _host_used_ghost);
       } else {
-	int start = host_start_pair();
-	add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
+        int start = host_start_pair();
+        add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
       }
     }
     stop_watch(TIME_PACK);
@@ -685,9 +685,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
     start = 0;
     if (force->newton_pair) {
       if (_offload_noghost == 0)
-	f_length = atom->nlocal + atom->nghost;
+        f_length = atom->nlocal + atom->nghost;
       else
-	f_length = atom->nlocal;
+        f_length = atom->nlocal;
     } else
       f_length = offload_end_pair();
   } else {
@@ -714,9 +714,9 @@ void FixIntel::add_results(const ft * _noalias const f_in,
 
 template <class ft, class acc_t>
 void FixIntel::add_oresults(const ft * _noalias const f_in,
-			    const acc_t * _noalias const ev_global,
-			    const int eatom, const int vatom,
-			    const int out_offset, const int nall) {
+                            const acc_t * _noalias const ev_global,
+                            const int eatom, const int vatom,
+                            const int out_offset, const int nall) {
   lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
   if (atom->torque) {
     if (f_in[1].w)
@@ -744,12 +744,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
     if (atom->torque) {
       int ii = ifrom * 2;
       lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
-	out_offset;
+        out_offset;
       if (eatom) {
-	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
+        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
+        #pragma novector
+        #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[ii].x;
           f[i].y += f_in[ii].y;
@@ -762,8 +762,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
         }
       } else {
         #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
+        #pragma novector
+        #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[ii].x;
           f[i].y += f_in[ii].y;
@@ -776,10 +776,10 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
       }
     } else {
       if (eatom) {
-	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
+        double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
         #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
+        #pragma novector
+        #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[i].x;
           f[i].y += f_in[i].y;
@@ -788,8 +788,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
         }
       } else {
         #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
+        #pragma novector
+        #endif
         for (int i = ifrom; i < ito; i++) {
           f[i].x += f_in[i].x;
           f[i].y += f_in[i].y;
@@ -931,7 +931,7 @@ void FixIntel::output_timing_data() {
     balance_out[0] = _balance_pair;
     balance_out[1] = _balance_neighbor;
     MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
-	       0, _real_space_comm);
+               0, _real_space_comm);
     balance_in[0] /= size;
     balance_in[1] /= size;
 
@@ -958,25 +958,25 @@ void FixIntel::output_timing_data() {
                 balance_in[1]);
         fprintf(_tscreen, "  Offload Pair Balance      %f\n",
                 balance_in[0]);
-	fprintf(_tscreen, "  Offload Ghost Atoms       ");
-	if (_offload_noghost) fprintf(_tscreen,"No\n");
-	else fprintf(_tscreen,"Yes\n");
+        fprintf(_tscreen, "  Offload Ghost Atoms       ");
+        if (_offload_noghost) fprintf(_tscreen,"No\n");
+        else fprintf(_tscreen,"Yes\n");
         #ifdef TIME_BALANCE
         fprintf(_tscreen, "  Offload Imbalance Seconds %f\n",
                 timers[TIME_IMBALANCE]);
-	fprintf(_tscreen, "  Offload Min/Max Seconds   ");
-	for (int i = 0; i < NUM_ITIMERS; i++)
-	  fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
-	fprintf(_tscreen, "\n");
+        fprintf(_tscreen, "  Offload Min/Max Seconds   ");
+        for (int i = 0; i < NUM_ITIMERS; i++)
+          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
+        fprintf(_tscreen, "\n");
         #endif
-	double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
-	  timers[TIME_OFFLOAD_WAIT];
-	double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
-	  timers[TIME_OFFLOAD_PAIR];
-	double tt = MAX(ht,ct);
-	if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
-	  error->warning(FLERR,
-		 "Leaving a core free can improve performance for offload");
+        double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] +
+          timers[TIME_OFFLOAD_WAIT];
+        double ct = timers[TIME_OFFLOAD_NEIGHBOR] +
+          timers[TIME_OFFLOAD_PAIR];
+        double tt = MAX(ht,ct);
+        if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0)
+          error->warning(FLERR,
+                 "Leaving a core free can improve performance for offload");
       }
       fprintf(_tscreen, "------------------------------------------------\n");
     }
@@ -999,14 +999,14 @@ int FixIntel::get_ppn(int &node_rank) {
   node_name[name_length] = '\0';
   char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
   MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
-		MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
+                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
   int ppn = 0;
   node_rank = 0;
   for (int i = 0; i < nprocs; i++) {
     if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
       ppn++;
       if (i < rank)
-	node_rank++;
+        node_rank++;
     }
   }
 
@@ -1068,19 +1068,19 @@ void FixIntel::set_offload_affinity()
       kmp_create_affinity_mask(&mask);
       int proc = offload_threads * node_rank + tnum;
       #ifdef __AVX512F__
-      proc = (proc / offload_tpc) + (proc % offload_tpc) * 
-	     ((offload_cores) / 4);
+      proc = (proc / offload_tpc) + (proc % offload_tpc) *
+             ((offload_cores) / 4);
       proc += 68;
       #else
       if (offload_affinity_balanced)
-	proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
+        proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
       else
-	proc += (proc / 4) * (4 - offload_tpc) + 1;
+        proc += (proc / 4) * (4 - offload_tpc) + 1;
       #endif
       kmp_set_affinity_mask_proc(proc, &mask);
       if (kmp_set_affinity(&mask) != 0)
-	printf("Could not set affinity on rank %d thread %d to %d\n",
-	       node_rank, tnum, proc);
+        printf("Could not set affinity on rank %d thread %d to %d\n",
+               node_rank, tnum, proc);
     }
   }
 
@@ -1110,7 +1110,7 @@ int FixIntel::set_host_affinity(const int nomp)
   char cmd[512];
   char readbuf[INTEL_MAX_HOST_CORE_COUNT*5];
   sprintf(cmd, "lscpu -p | grep -v '#' |"
-	  "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
+          "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'");
   p = popen(cmd, "r");
   if (p == NULL) return -1;
   ncores = 0;
@@ -1147,7 +1147,7 @@ int FixIntel::set_host_affinity(const int nomp)
   if (subscription > ncores) {
     if (rank == 0)
       error->warning(FLERR,
-		     "More MPI tasks/OpenMP threads than available cores");
+                     "More MPI tasks/OpenMP threads than available cores");
     return 0;
   }
   if (subscription == ncores)
@@ -1173,10 +1173,10 @@ int FixIntel::set_host_affinity(const int nomp)
       int first = coi_cores + node_rank * mpi_cores;
       CPU_ZERO(&cpuset);
       for (int i = first; i < first + mpi_cores; i++)
-	CPU_SET(proc_list[i], &cpuset);
+        CPU_SET(proc_list[i], &cpuset);
       if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
-	fail = 1;
-	break;
+        fail = 1;
+        break;
       }
       plwp++;
     }
@@ -1189,13 +1189,13 @@ int FixIntel::set_host_affinity(const int nomp)
     buf1 = (float*) malloc(sizeof(float)*pragma_size);
 
     #pragma offload target (mic:0) mandatory \
-      in(buf1:length(pragma_size) alloc_if(1) free_if(0))	\
+      in(buf1:length(pragma_size) alloc_if(1) free_if(0))       \
       signal(&sig1)
     { buf1[0] = 0.0; }
     #pragma offload_wait target(mic:0) wait(&sig1)
 
     #pragma offload target (mic:0) mandatory \
-      out(buf1:length(pragma_size) alloc_if(0) free_if(1))	\
+      out(buf1:length(pragma_size) alloc_if(0) free_if(1))      \
       signal(&sig2)
     { buf1[0] = 1.0; }
     #pragma offload_wait target(mic:0) wait(&sig2)
@@ -1211,11 +1211,11 @@ int FixIntel::set_host_affinity(const int nomp)
 
       CPU_ZERO(&cpuset);
       for(int i=0; i<coi_cores; i++)
-	CPU_SET(proc_list[i], &cpuset);
+        CPU_SET(proc_list[i], &cpuset);
 
       if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) {
-	fail = 1;
-	break;
+        fail = 1;
+        break;
       }
     }
     pclose(p);
@@ -1228,7 +1228,7 @@ int FixIntel::set_host_affinity(const int nomp)
   if (screen && rank == 0) {
     if (coi_cores)
       fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n",
-	      mlwp, coi_cores);
+              mlwp, coi_cores);
     fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores);
   }
   if (fail) return -1;
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index 92d1311256..068e5ed890 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -72,7 +72,7 @@ class FixIntel : public Fix {
   inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
   inline int three_body_neighbor() { return _three_body_neighbor; }
   inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
-  
+
   inline int need_zero(const int tid) {
     if (_need_reduce == 0 && tid > 0) return 1;
     return 0;
@@ -84,11 +84,11 @@ class FixIntel : public Fix {
   }
   inline int pppm_table() {
     if (force->kspace_match("pppm/intel", 0) ||
-	force->kspace_match("pppm/disp/intel",0)) 
+        force->kspace_match("pppm/disp/intel",0))
       return INTEL_P3M_TABLE;
     else return 0;
   }
-  
+
 
  protected:
   IntelBuffers<float,float> *_single_buffers;
@@ -103,17 +103,17 @@ class FixIntel : public Fix {
   inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
                                const int eatom = 0, const int vatom = 0,
-			       const int rflag = 0);
+                               const int rflag = 0);
   inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
                                const int eatom = 0, const int vatom = 0,
-			       const int rflag = 0);
+                               const int rflag = 0);
   inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
                                float *ev_in, const int offload,
                                const int eatom = 0, const int vatom = 0,
-			       const int rflag = 0);
+                               const int rflag = 0);
   inline void get_buffern(const int offload, int &nlocal, int &nall,
-			  int &minlocal);
+                          int &minlocal);
 
   #ifdef _LMP_INTEL_OFFLOAD
   void post_force(int vflag);
@@ -213,13 +213,13 @@ class FixIntel : public Fix {
   inline void add_results(const ft * _noalias const f_in,
                           const acc_t * _noalias const ev_global,
                           const int eatom, const int vatom,
-			  const int offload);
+                          const int offload);
 
   template <class ft, class acc_t>
   inline void add_oresults(const ft * _noalias const f_in,
-			   const acc_t * _noalias const ev_global,
-			   const int eatom, const int vatom,
-			   const int out_offset, const int nall);
+                           const acc_t * _noalias const ev_global,
+                           const int eatom, const int vatom,
+                           const int out_offset, const int nall);
 
   int _offload_affinity_balanced, _offload_threads, _offload_tpc;
   #ifdef _LMP_INTEL_OFFLOAD
@@ -235,16 +235,16 @@ class FixIntel : public Fix {
 /* ---------------------------------------------------------------------- */
 
 void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
-			   int &minlocal) {
+                           int &minlocal) {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
     if (offload) {
       if (neighbor->ago != 0) {
-	nlocal = _offload_nlocal;
-	nall = _offload_nall;
+        nlocal = _offload_nlocal;
+        nall = _offload_nall;
       } else {
-	nlocal = atom->nlocal;
-	nall = nlocal + atom->nghost;
+        nlocal = atom->nlocal;
+        nall = nlocal + atom->nghost;
       }
       minlocal = 0;
     } else {
@@ -253,7 +253,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
       if (force->newton)
         minlocal = _host_min_local;
       else
-	minlocal = host_start_pair();
+        minlocal = host_start_pair();
     }
     return;
   }
@@ -271,7 +271,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
 void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                                 double *ev_in, const int offload,
                                 const int eatom, const int vatom,
-				const int rflag) {
+                                const int rflag) {
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload) {
     _off_results_eatom = eatom;
@@ -299,7 +299,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
 void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                                 double *ev_in, const int offload,
                                 const int eatom, const int vatom,
-				const int rflag) {
+                                const int rflag) {
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload) {
     _off_results_eatom = eatom;
@@ -361,12 +361,12 @@ int FixIntel::offload_end_neighbor() {
     if (atom->nlocal < 2)
       error->one(FLERR,"Too few atoms for load balancing offload");
     double granularity = 1.0 / atom->nlocal;
-    if (_balance_neighbor < granularity) 
+    if (_balance_neighbor < granularity)
       _balance_neighbor = granularity + 1e-10;
-    else if (_balance_neighbor > 1.0 - granularity) 
+    else if (_balance_neighbor > 1.0 - granularity)
       _balance_neighbor = 1.0 - granularity + 1e-10;
   }
-  return _balance_neighbor * atom->nlocal; 
+  return _balance_neighbor * atom->nlocal;
 }
 
 int FixIntel::offload_end_pair() {
@@ -517,7 +517,7 @@ The newton setting must be the same for both pairwise and bonded forces.
 
 E: Intel styles for bond/angle/dihedral/improper require intel pair style."
 
-You cannot use the USER-INTEL package for bond calculations without a 
+You cannot use the USER-INTEL package for bond calculations without a
 USER-INTEL supported pair style.
 
 E: Intel styles for kspace require intel pair style.
diff --git a/src/USER-INTEL/fix_nh_intel.cpp b/src/USER-INTEL/fix_nh_intel.cpp
index 3f76e53c1f..6e44b38ef1 100644
--- a/src/USER-INTEL/fix_nh_intel.cpp
+++ b/src/USER-INTEL/fix_nh_intel.cpp
@@ -45,7 +45,7 @@ typedef struct { double x,y,z; } dbl3_t;
    NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
  ---------------------------------------------------------------------- */
 
-FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : 
+FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) :
   FixNH(lmp, narg, arg)
 {
   _dtfm = 0;
@@ -118,12 +118,12 @@ void FixNHIntel::remap()
     #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & dilate_group_bit) {
-	const double d0 = x[i].x - b0;
-	const double d1 = x[i].y - b1;
-	const double d2 = x[i].z - b2;
-	x[i].x = hi0*d0 + hi5*d1 + hi4*d2;
-	x[i].y = hi1*d1 + hi3*d2;
-	x[i].z = hi2*d2;
+        const double d0 = x[i].x - b0;
+        const double d1 = x[i].y - b1;
+        const double d2 = x[i].z - b2;
+        x[i].x = hi0*d0 + hi5*d1 + hi4*d2;
+        x[i].y = hi1*d1 + hi3*d2;
+        x[i].z = hi2*d2;
       }
     }
   }
@@ -294,9 +294,9 @@ void FixNHIntel::remap()
     #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & dilate_group_bit) {
-	x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
-	x[i].y = h1*x[i].y + h3*x[i].z + nb1;
-	x[i].z = h2*x[i].z + nb2;
+        x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0;
+        x[i].y = h1*x[i].y + h3*x[i].z + nb1;
+        x[i].z = h2*x[i].z + nb2;
       }
     }
   }
@@ -318,7 +318,7 @@ void FixNHIntel::reset_dt()
   dto = dthalf;
 
   // If using respa, then remap is performed in innermost level
-  
+
   if (strstr(update->integrate_style,"respa"))
     dto = 0.5*step_respa[0];
 
@@ -329,7 +329,7 @@ void FixNHIntel::reset_dt()
     tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
 
   const int * const mask = atom->mask;
-  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : 
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
     atom->nlocal;
 
   if (nlocal > _nlocal_max) {
@@ -345,9 +345,9 @@ void FixNHIntel::reset_dt()
       const double * const rmass = atom->rmass;
       int n = 0;
       for (int i = 0; i < nlocal; i++) {
-	_dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
       }
     } else {
       const double * const mass = atom->mass;
@@ -364,29 +364,29 @@ void FixNHIntel::reset_dt()
       const double * const rmass = atom->rmass;
       int n = 0;
       for (int i = 0; i < nlocal; i++)
-	if (mask[i] & groupbit) {
-	  _dtfm[n++] = dtf / rmass[i];
-	  _dtfm[n++] = dtf / rmass[i];
-	  _dtfm[n++] = dtf / rmass[i];
+        if (mask[i] & groupbit) {
+          _dtfm[n++] = dtf / rmass[i];
+          _dtfm[n++] = dtf / rmass[i];
+          _dtfm[n++] = dtf / rmass[i];
         } else {
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	}
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+        }
     } else {
       const double * const mass = atom->mass;
       const int * const type = atom->type;
       int n = 0;
       for (int i = 0; i < nlocal; i++)
-	if (mask[i] & groupbit) {
-	  _dtfm[n++] = dtf / mass[type[i]];
-	  _dtfm[n++] = dtf / mass[type[i]];
-	  _dtfm[n++] = dtf / mass[type[i]];
+        if (mask[i] & groupbit) {
+          _dtfm[n++] = dtf / mass[type[i]];
+          _dtfm[n++] = dtf / mass[type[i]];
+          _dtfm[n++] = dtf / mass[type[i]];
         } else {
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	}
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+        }
     }
   }
 }
@@ -431,9 +431,9 @@ void FixNHIntel::nh_v_press()
     #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
-	v[i].x *= f0;
-	v[i].y *= f1;
-	v[i].z *= f2;
+        v[i].x *= f0;
+        v[i].y *= f1;
+        v[i].z *= f2;
       }
     }
   }
@@ -506,7 +506,7 @@ void FixNHIntel::nh_v_temp()
     #pragma simd
     #endif
     for (int i = 0; i < _nlocal3; i++)
-	v[i] *= factor_eta;
+        v[i] *= factor_eta;
   } else {
     #if defined(LMP_SIMD_COMPILER)
     #pragma vector aligned
@@ -514,12 +514,12 @@ void FixNHIntel::nh_v_temp()
     #endif
     for (int i = 0; i < _nlocal3; i++) {
       if (_dtfm[i] != 0.0)
-	v[i] *= factor_eta;
+        v[i] *= factor_eta;
     }
   }
 }
 
-double FixNHIntel::memory_usage() 
+double FixNHIntel::memory_usage()
 {
   return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double);
 }
diff --git a/src/USER-INTEL/fix_nh_intel.h b/src/USER-INTEL/fix_nh_intel.h
index 32ed6c8534..cc6ba8c481 100644
--- a/src/USER-INTEL/fix_nh_intel.h
+++ b/src/USER-INTEL/fix_nh_intel.h
@@ -35,7 +35,7 @@ class FixNHIntel : public FixNH {
   int _nlocal3, _nlocal_max;
 
   virtual void remap();
-  virtual void nve_x();      
+  virtual void nve_x();
   virtual void nve_v();
   virtual void nh_v_press();
   virtual void nh_v_temp();
diff --git a/src/USER-INTEL/fix_nve_asphere_intel.cpp b/src/USER-INTEL/fix_nve_asphere_intel.cpp
index 6563165454..8ad63f7326 100644
--- a/src/USER-INTEL/fix_nve_asphere_intel.cpp
+++ b/src/USER-INTEL/fix_nve_asphere_intel.cpp
@@ -36,7 +36,7 @@ using namespace FixConst;
 /* ---------------------------------------------------------------------- */
 
 FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) :
-  FixNVE(lmp, narg, arg) 
+  FixNVE(lmp, narg, arg)
 {
   _dtfm = 0;
   _nlocal3 = 0;
@@ -129,9 +129,9 @@ void FixNVEAsphereIntel::initial_integrate(int vflag)
     #endif
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
-	double *quat = bonus[ellipsoid[i]].quat;
-	ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
-			    _inertia1[i], _inertia2[i]);
+        double *quat = bonus[ellipsoid[i]].quat;
+        ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i],
+                            _inertia1[i], _inertia2[i]);
       }
     }
   }
@@ -168,7 +168,7 @@ void FixNVEAsphereIntel::reset_dt() {
   dtf = 0.5 * update->dt * force->ftm2v;
 
   const int * const mask = atom->mask;
-  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : 
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
     atom->nlocal;
 
   if (nlocal > _nlocal_max) {
@@ -211,27 +211,27 @@ void FixNVEAsphereIntel::reset_dt() {
     for (int i = 0; i < nlocal; i++) {
       if (mask[i] & groupbit) {
         _dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
-	double *shape = bonus[ellipsoid[i]].shape;
-	double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
-	if (idot != 0.0) idot = 1.0 / idot;
-	_inertia0[i] = idot;
-	idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
-	if (idot != 0.0) idot = 1.0 / idot;
-	_inertia1[i] = idot;
-	idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
-	if (idot != 0.0) idot = 1.0 / idot;
-	_inertia2[i] = idot;
+        _dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
+        double *shape = bonus[ellipsoid[i]].shape;
+        double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]);
+        if (idot != 0.0) idot = 1.0 / idot;
+        _inertia0[i] = idot;
+        idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]);
+        if (idot != 0.0) idot = 1.0 / idot;
+        _inertia1[i] = idot;
+        idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]);
+        if (idot != 0.0) idot = 1.0 / idot;
+        _inertia2[i] = idot;
       } else {
         _dtfm[n++] = 0.0;
-	_dtfm[n++] = 0.0;
-	_dtfm[n++] = 0.0;
+        _dtfm[n++] = 0.0;
+        _dtfm[n++] = 0.0;
       }
     }
   }
 }
-double FixNVEAsphereIntel::memory_usage() 
+double FixNVEAsphereIntel::memory_usage()
 {
   return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double);
 }
diff --git a/src/USER-INTEL/fix_nve_intel.cpp b/src/USER-INTEL/fix_nve_intel.cpp
index 3fb290b3ab..c0f6da06ae 100644
--- a/src/USER-INTEL/fix_nve_intel.cpp
+++ b/src/USER-INTEL/fix_nve_intel.cpp
@@ -29,7 +29,7 @@ using namespace FixConst;
 /* ---------------------------------------------------------------------- */
 
 FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) :
-  FixNVE(lmp, narg, arg) 
+  FixNVE(lmp, narg, arg)
 {
   _dtfm = 0;
   _nlocal3 = 0;
@@ -91,7 +91,7 @@ void FixNVEIntel::initial_integrate(int vflag)
     for (int i = 0; i < _nlocal3; i++) {
       if (_dtfm[i] != 0.0) {
         v[i] += _dtfm[i] * f[i];
-	x[i] += dtv * v[i];
+        x[i] += dtv * v[i];
       }
     }
   }
@@ -130,7 +130,7 @@ void FixNVEIntel::reset_dt() {
   dtf = 0.5 * update->dt * force->ftm2v;
 
   const int * const mask = atom->mask;
-  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : 
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst :
     atom->nlocal;
 
   if (nlocal > _nlocal_max) {
@@ -146,9 +146,9 @@ void FixNVEIntel::reset_dt() {
       const double * const rmass = atom->rmass;
       int n = 0;
       for (int i = 0; i < nlocal; i++) {
-	_dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
-	_dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
+        _dtfm[n++] = dtf / rmass[i];
       }
     } else {
       const double * const mass = atom->mass;
@@ -165,34 +165,34 @@ void FixNVEIntel::reset_dt() {
       const double * const rmass = atom->rmass;
       int n = 0;
       for (int i = 0; i < nlocal; i++)
-	if (mask[i] & groupbit) {
-	  _dtfm[n++] = dtf / rmass[i];
-	  _dtfm[n++] = dtf / rmass[i];
-	  _dtfm[n++] = dtf / rmass[i];
+        if (mask[i] & groupbit) {
+          _dtfm[n++] = dtf / rmass[i];
+          _dtfm[n++] = dtf / rmass[i];
+          _dtfm[n++] = dtf / rmass[i];
         } else {
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	}
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+        }
     } else {
       const double * const mass = atom->mass;
       const int * const type = atom->type;
       int n = 0;
       for (int i = 0; i < nlocal; i++)
-	if (mask[i] & groupbit) {
-	  _dtfm[n++] = dtf / mass[type[i]];
-	  _dtfm[n++] = dtf / mass[type[i]];
-	  _dtfm[n++] = dtf / mass[type[i]];
+        if (mask[i] & groupbit) {
+          _dtfm[n++] = dtf / mass[type[i]];
+          _dtfm[n++] = dtf / mass[type[i]];
+          _dtfm[n++] = dtf / mass[type[i]];
         } else {
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	  _dtfm[n++] = 0.0;
-	}
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+          _dtfm[n++] = 0.0;
+        }
     }
   }
 }
 
-double FixNVEIntel::memory_usage() 
+double FixNVEIntel::memory_usage()
 {
   return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double);
 }
diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp
index df13cd5d66..dc9765d913 100644
--- a/src/USER-INTEL/improper_cvff_intel.cpp
+++ b/src/USER-INTEL/improper_cvff_intel.cpp
@@ -42,7 +42,7 @@ typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
-ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : 
+ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) :
   ImproperCvff(lmp)
 {
   suffix_flag |= Suffix::INTEL;
@@ -80,8 +80,8 @@ void ImproperCvffIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void ImproperCvffIntel::compute(int eflag, int vflag,
-				    IntelBuffers<flt_t,acc_t> *buffers,
-				    const ForceConst<flt_t> &fc)
+                                    IntelBuffers<flt_t,acc_t> *buffers,
+                                    const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -89,14 +89,14 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -109,9 +109,9 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void ImproperCvffIntel::eval(const int vflag, 
-				 IntelBuffers<flt_t,acc_t> *buffers,
-				 const ForceConst<flt_t> &fc)
+void ImproperCvffIntel::eval(const int vflag,
+                                 IntelBuffers<flt_t,acc_t> *buffers,
+                                 const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nimproperlist;
   if (inum == 0) return;
@@ -153,7 +153,7 @@ void ImproperCvffIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int5_t * _noalias const improperlist = 
+    const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF_FIX
@@ -230,22 +230,22 @@ void ImproperCvffIntel::eval(const int vflag,
       #ifndef LMP_INTEL_USE_SIMDOFF_FIX
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
-	MPI_Comm_rank(world,&me);
-	if (screen) {
+        MPI_Comm_rank(world,&me);
+        if (screen) {
           char str[128];
-	  sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
+          sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
-	  error->warning(FLERR,str,0);
-	  fprintf(screen,"  1st atom: %d %g %g %g\n",
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
-	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
-	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
-	  fprintf(screen,"  4th atom: %d %g %g %g\n",
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
@@ -268,35 +268,35 @@ void ImproperCvffIntel::eval(const int vflag,
       {
         if (m == 2) {
           p = (flt_t)2.0*c*c;
-	  pd = (flt_t)2.0*c;
+          pd = (flt_t)2.0*c;
         } else if (m == 3) {
-	  const flt_t rc2 = c*c;
-	  p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
-	  pd = (flt_t)6.0*rc2 - (flt_t)1.5;
+          const flt_t rc2 = c*c;
+          p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
+          pd = (flt_t)6.0*rc2 - (flt_t)1.5;
         } else if (m == 4) {
           const flt_t rc2 = c*c;
-	  p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
-	  pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
+          p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
+          pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
         } else if (m == 6) {
           const flt_t rc2 = c*c;
-	  p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
-	  pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
+          p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
+          pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
         } else if (m == 1) {
-	  p = c + (flt_t)1.0;
-	  pd = (flt_t)0.5;
+          p = c + (flt_t)1.0;
+          pd = (flt_t)0.5;
         } else if (m == 5) {
-	  const flt_t rc2 = c*c;
-	  p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
-	  pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
+          const flt_t rc2 = c*c;
+          p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
+          pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
         } else if (m == 0) {
           p = (flt_t)2.0;
-	  pd = (flt_t)0.0;
+          pd = (flt_t)0.0;
         }
       }
 
       if (fc.fc[type].sign == -1) {
-	p = (flt_t)2.0 - p;
-	pd = -pd;
+        p = (flt_t)2.0 - p;
+        pd = -pd;
       }
 
       flt_t eimproper;
@@ -340,43 +340,43 @@ void ImproperCvffIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
         if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
-	  f[i2].y += f2y;
-	  f[i2].z += f2z;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
         }
 
-	if (NEWTON_BOND || i3 < nlocal) {
+        if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
-	  f[i4].y += f4y;
-	  f[i4].z += f4z;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF_FIX
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, 
-                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
-                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, 
-                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, 
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
-	#else
-	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, 
-                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
-                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, 
-                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, 
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
                               nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
-	#endif
+        #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF_FIX
@@ -390,7 +390,7 @@ void ImproperCvffIntel::eval(const int vflag,
   if (EFLAG) energy += oeimproper;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
 
 template <class flt_t, class acc_t>
 void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
-					     IntelBuffers<flt_t,acc_t> *buffers)
+                                             IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nimpropertypes + 1;
   fc.set_ntypes(bp1,memory);
@@ -444,11 +444,11 @@ void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
-	                                                  Memory *memory) {
+                                                          Memory *memory) {
   if (nimproper != _nimpropertypes) {
     if (_nimpropertypes > 0)
       _memory->destroy(fc);
-    
+
     if (nimproper > 0)
       _memory->create(fc,nimproper,"improperharmonicintel.fc");
   }
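
The m-dependent branches in the eval() hunk above encode 1 + cos(m*phi) as a
polynomial in c = cos(phi), with pd holding half the derivative dp/dc; this is
what lets the kernel avoid per-improper trigonometric calls. A small
standalone cross-check of those expansions (not LAMMPS code):

#include <cmath>
#include <cstdio>

// p_poly(m, c) mirrors the polynomial branches in the vectorized loop above.
static double p_poly(int m, double c) {
  const double rc2 = c * c;
  switch (m) {
    case 0: return 2.0;
    case 1: return c + 1.0;
    case 2: return 2.0 * rc2;
    case 3: return (4.0 * rc2 - 3.0) * c + 1.0;
    case 4: return 8.0 * (rc2 - 1.0) * rc2 + 2.0;
    case 5: return ((16.0 * rc2 - 20.0) * rc2 + 5.0) * c + 1.0;
    case 6: return ((32.0 * rc2 - 48.0) * rc2 + 18.0) * rc2;
    default: return 0.0;
  }
}

int main() {
  const double phi = 0.7;  // arbitrary test angle
  for (int m = 0; m <= 6; m++)
    std::printf("m=%d  poly=%.12f  1+cos(m*phi)=%.12f\n",
                m, p_poly(m, std::cos(phi)), 1.0 + std::cos(m * phi));
  return 0;
}
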
diff --git a/src/USER-INTEL/improper_cvff_intel.h b/src/USER-INTEL/improper_cvff_intel.h
index 95ccd8f9d2..cb5da25f99 100644
--- a/src/USER-INTEL/improper_cvff_intel.h
+++ b/src/USER-INTEL/improper_cvff_intel.h
@@ -45,8 +45,8 @@ class ImproperCvffIntel : public ImproperCvff {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp
index cc854091f5..fe0efca5ec 100644
--- a/src/USER-INTEL/improper_harmonic_intel.cpp
+++ b/src/USER-INTEL/improper_harmonic_intel.cpp
@@ -43,7 +43,7 @@ typedef struct { int a,b,c,d,t;  } int5_t;
 
 /* ---------------------------------------------------------------------- */
 
-ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : 
+ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) :
   ImproperHarmonic(lmp)
 {
   suffix_flag |= Suffix::INTEL;
@@ -81,8 +81,8 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void ImproperHarmonicIntel::compute(int eflag, int vflag,
-				    IntelBuffers<flt_t,acc_t> *buffers,
-				    const ForceConst<flt_t> &fc)
+                                    IntelBuffers<flt_t,acc_t> *buffers,
+                                    const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = 0;
@@ -90,14 +90,14 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
   if (evflag) {
     if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<0,1,1>(vflag, buffers, fc);
+        eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<0,1,0>(vflag, buffers, fc);
+        eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+        eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+        eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -110,9 +110,9 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
 /* ---------------------------------------------------------------------- */
 
 template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-void ImproperHarmonicIntel::eval(const int vflag, 
-				 IntelBuffers<flt_t,acc_t> *buffers,
-				 const ForceConst<flt_t> &fc)
+void ImproperHarmonicIntel::eval(const int vflag,
+                                 IntelBuffers<flt_t,acc_t> *buffers,
+                                 const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nimproperlist;
   if (inum == 0) return;
@@ -154,7 +154,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
     if (fix->need_zero(tid))
       memset(f, 0, f_stride * sizeof(FORCE_T));
 
-    const int5_t * _noalias const improperlist = 
+    const int5_t * _noalias const improperlist =
       (int5_t *) neighbor->improperlist[0];
 
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -221,22 +221,22 @@ void ImproperHarmonicIntel::eval(const int vflag,
       #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
-	MPI_Comm_rank(world,&me);
-	if (screen) {
+        MPI_Comm_rank(world,&me);
+        if (screen) {
           char str[128];
-	  sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
+          sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT " "
                   TAGINT_FORMAT " " TAGINT_FORMAT,
                   me,update->ntimestep,
                   atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
-	  error->warning(FLERR,str,0);
-	  fprintf(screen,"  1st atom: %d %g %g %g\n",
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
                   me,x[i1].x,x[i1].y,x[i1].z);
-	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
                   me,x[i2].x,x[i2].y,x[i2].z);
-	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
                   me,x[i3].x,x[i3].y,x[i3].z);
-	  fprintf(screen,"  4th atom: %d %g %g %g\n",
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
@@ -296,43 +296,43 @@ void ImproperHarmonicIntel::eval(const int vflag,
       {
         if (NEWTON_BOND || i1 < nlocal) {
           f[i1].x += f1x;
-	  f[i1].y += f1y;
-	  f[i1].z += f1z;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
         }
 
-	if (NEWTON_BOND || i2 < nlocal) {
+        if (NEWTON_BOND || i2 < nlocal) {
           f[i2].x += f2x;
-	  f[i2].y += f2y;
-	  f[i2].z += f2z;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
         }
 
         if (NEWTON_BOND || i3 < nlocal) {
           f[i3].x += f3x;
-	  f[i3].y += f3y;
-	  f[i3].z += f3z;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
         }
 
         if (NEWTON_BOND || i4 < nlocal) {
           f[i4].x += f4x;
-	  f[i4].y += f4y;
-	  f[i4].z += f4z;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
         }
       }
 
       if (EFLAG || VFLAG) {
         #ifdef LMP_INTEL_USE_SIMDOFF
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
-                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, 
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
                               f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
-                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, 
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND,
                               nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
         #else
         IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
-                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, 
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,
                               f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
-                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, 
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND,
                               nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
-	#endif
+        #endif
       }
     } // for n
     #ifdef LMP_INTEL_USE_SIMDOFF
@@ -346,7 +346,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
   if (EFLAG) energy += oeimproper;
   if (VFLAG && vflag) {
     virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
@@ -384,7 +384,7 @@ void ImproperHarmonicIntel::init_style()
 
 template <class flt_t, class acc_t>
 void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
-					     IntelBuffers<flt_t,acc_t> *buffers)
+                                             IntelBuffers<flt_t,acc_t> *buffers)
 {
   const int bp1 = atom->nimpropertypes + 1;
   fc.set_ntypes(bp1,memory);
@@ -399,11 +399,11 @@ void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
-	                                                  Memory *memory) {
+                                                          Memory *memory) {
   if (nimproper != _nimpropertypes) {
     if (_nimpropertypes > 0)
       _memory->destroy(fc);
-    
+
     if (nimproper > 0)
       _memory->create(fc,nimproper,"improperharmonicintel.fc");
   }
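
The NEWTON_BOND || i < nlocal guards in the accumulation hunk above implement
the usual bonded-force ownership rule: with newton_bond off, a rank adds
forces only to atoms it owns, while with it on, ghost atoms accumulate too and
are summed back by reverse communication. A scalar sketch of the guard, with
assumed names:

#include <cstdio>

struct dbl3_t { double x, y, z; };

// Accumulate one bonded-force contribution onto atom i. NEWTON_BOND is a
// compile-time flag, matching the template parameter used in eval().
template <int NEWTON_BOND>
static void add_force(dbl3_t *f, int i, int nlocal,
                      double fx, double fy, double fz) {
  if (NEWTON_BOND || i < nlocal) {
    f[i].x += fx; f[i].y += fy; f[i].z += fz;
  }
}

int main() {
  dbl3_t f[3] = {};
  add_force<0>(f, 2, 2, 1.0, 0.0, 0.0);  // ghost atom, newton off: skipped
  add_force<1>(f, 2, 2, 1.0, 0.0, 0.0);  // newton on: accumulated
  std::printf("fx[2] = %g\n", f[2].x);   // prints 1
  return 0;
}
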
diff --git a/src/USER-INTEL/improper_harmonic_intel.h b/src/USER-INTEL/improper_harmonic_intel.h
index 4e38383863..0b759b4e43 100644
--- a/src/USER-INTEL/improper_harmonic_intel.h
+++ b/src/USER-INTEL/improper_harmonic_intel.h
@@ -45,8 +45,8 @@ class ImproperHarmonicIntel : public ImproperHarmonic {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
-  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
-	    const ForceConst<flt_t> &fc);
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index bacc8a8bad..3664bc248b 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -71,8 +71,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
       if (ev_global != 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x:alloc_if(0) free_if(1)) \
-	  nocopy(f_start:alloc_if(0) free_if(1)) \
-	  nocopy(ev_global:alloc_if(0) free_if(1))
+          nocopy(f_start:alloc_if(0) free_if(1)) \
+          nocopy(ev_global:alloc_if(0) free_if(1))
       }
 
       if (q != 0) {
@@ -105,8 +105,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
-				       const int nthreads,
-				       const int offload_end)
+                                       const int nthreads,
+                                       const int offload_end)
 {
   free_buffers();
   _buf_size = static_cast<double>(nall) * 1.1 + 1;
@@ -151,15 +151,15 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
       if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
-	  nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
-	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+          nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
+          nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     } else {
       if (x != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
           nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
-	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+          nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     }
     if (lmp->atom->ellipsoid != NULL) {
@@ -186,7 +186,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
     if (tag != 0 && special != 0 && nspecial !=0) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(tag:alloc_if(0) free_if(1)) \
-	nocopy(special,nspecial:alloc_if(0) free_if(1))
+        nocopy(special,nspecial:alloc_if(0) free_if(1))
     }
     _off_map_nmax = 0;
     _host_nmax = 0;
@@ -261,7 +261,7 @@ void IntelBuffers<flt_t, acc_t>::free_list_local()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
-	                                          const int offload_end)
+                                                  const int offload_end)
 {
   free_list_local();
   int size = list->get_maxlocal();
@@ -276,7 +276,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
     if (cnumneigh != 0) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
-	nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
+        nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
         nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
     }
     _off_map_ilist = ilist;
@@ -309,14 +309,14 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
                                                  const int nlocal,
-	                                         const int nthreads,
-	                                         const int offload_end,
-	                                         const int pack_width)
+                                                 const int nthreads,
+                                                 const int offload_end,
+                                                 const int pack_width)
 {
   free_nbor_list();
   _list_alloc_atoms = 1.10 * nlocal;
   int nt = MAX(nthreads, _off_threads);
-  int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * 
+  int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) *
     get_max_nbors();
   lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
   #ifdef _LMP_INTEL_OFFLOAD
@@ -380,8 +380,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
-	const int nthreads,
-	const int width)
+        const int nthreads,
+        const int width)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_ccachex && off_flag && _off_ccache == 0)
@@ -418,7 +418,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
     int *ccachej = _ccachej;
 
     if (ccachex != NULL && ccachey !=NULL && ccachez != NULL &&
-	ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
+        ccachew != NULL && ccachei != NULL && ccachej !=NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
@@ -471,7 +471,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
-					     const int nthreads)
+                                             const int nthreads)
 {
   const int nsize = get_max_nbors() * 3;
   int esize = MIN(sizeof(int), sizeof(flt_t));
@@ -507,7 +507,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
     int *ncachejtype = _ncachejtype;
 
     if (ncachex != NULL && ncachey !=NULL && ncachez != NULL &&
-	ncachej != NULL && ncachejtype != NULL) {
+        ncachej != NULL && ncachejtype != NULL) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
@@ -522,9 +522,9 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
 
 #ifndef _LMP_INTEL_OFFLOAD
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, 
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
     const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
-    acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) 
+    acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
 {
   IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
                             ov1, ov2, ov3, ov4, ov5);
@@ -535,13 +535,13 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt,
 
 #ifndef _LMP_INTEL_OFFLOAD
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, 
-    const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, 
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
+    const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
     acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
 {
   int iifrom, iito, tid;
   IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
-			 ov0, ov1, ov2, ov3, ov4, ov5);
+                         ov0, ov1, ov2, ov3, ov4, ov5);
 }
 #endif
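
fdotr_reduce() and fdotr_reduce_l5() above fold the per-thread force copies
(spaced f_stride apart) into the first copy and tally the six virial terms as
the dot product of force and position on the fly. A simplified serial sketch
of that pattern, assuming this layout rather than quoting the actual
IP_PRE_fdotr_acc_force macro:

#include <cstdio>
#include <vector>

struct dbl3_t { double x, y, z; };

// Reference reduction: nthreads force arrays stored back to back with
// stride f_stride; copies 1..nthreads-1 are summed into copy 0 while the
// virial components are accumulated from f and x.
static void fdotr_reduce_ref(int nall, int nthreads, int f_stride,
                             dbl3_t *f, const dbl3_t *x, double v[6]) {
  for (int i = 0; i < nall; i++) {
    for (int t = 1; t < nthreads; t++) {
      f[i].x += f[t * f_stride + i].x;
      f[i].y += f[t * f_stride + i].y;
      f[i].z += f[t * f_stride + i].z;
    }
    v[0] += f[i].x * x[i].x;  // xx
    v[1] += f[i].y * x[i].y;  // yy
    v[2] += f[i].z * x[i].z;  // zz
    v[3] += f[i].y * x[i].x;  // xy
    v[4] += f[i].z * x[i].x;  // xz
    v[5] += f[i].z * x[i].y;  // yz
  }
}

int main() {
  std::vector<dbl3_t> f = {{1,0,0},{0,1,0},{2,0,0},{0,2,0}};  // 2 threads x 2 atoms
  std::vector<dbl3_t> x = {{1,2,3},{4,5,6}};
  double v[6] = {0,0,0,0,0,0};
  fdotr_reduce_ref(2, 2, 2, f.data(), x.data(), v);
  std::printf("fx[0]=%g  v[0]=%g\n", f[0].x, v[0]);
  return 0;
}
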
 
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 9b73a65f60..135309fe44 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -62,7 +62,7 @@ class IntelBuffers {
 
   void free_buffers();
   void free_nmax();
-  inline void set_bininfo(int *atombin, int *binpacked) 
+  inline void set_bininfo(int *atombin, int *binpacked)
     { _atombin = atombin; _binpacked = binpacked; }
   inline void grow(const int nall, const int nlocal, const int nthreads,
                    const int offload_end) {
@@ -126,7 +126,7 @@ class IntelBuffers {
 
   inline void grow_nbor_list(NeighList *list, const int nlocal,
                              const int nthreads, const int offload_end,
-			     const int pack_width) {
+                             const int pack_width) {
     if (nlocal > _list_alloc_atoms)
       _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
@@ -165,7 +165,7 @@ class IntelBuffers {
   inline int get_off_threads() { return _off_threads; }
   #ifdef _LMP_INTEL_OFFLOAD
   inline void set_off_params(const int n, const int cop,
-			     const int separate_buffers)
+                             const int separate_buffers)
     { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; }
   inline vec3_acc_t * get_off_f() { return _off_f; }
   #endif
@@ -191,17 +191,17 @@ class IntelBuffers {
   }
 
   #ifndef _LMP_INTEL_OFFLOAD
-  void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, 
-		       const int f_stride, acc_t &ov0, acc_t &ov1,
-		       acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
-  void fdotr_reduce(const int nall, const int nthreads, const int f_stride, 
-		    acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, 
-		    acc_t &ov4, acc_t &ov5);
+  void fdotr_reduce_l5(const int lf, const int lt, const int nthreads,
+                       const int f_stride, acc_t &ov0, acc_t &ov1,
+                       acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
+  void fdotr_reduce(const int nall, const int nthreads, const int f_stride,
+                    acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3,
+                    acc_t &ov4, acc_t &ov5);
   #endif
 
   #ifdef _LMP_INTEL_OFFLOAD
   inline void thr_pack_cop(const int ifrom, const int ito,
-			   const int offset, const bool dotype = false) {
+                           const int offset, const bool dotype = false) {
     double ** x = lmp->atom->x + offset;
     if (dotype == false) {
       #pragma vector nontemporal
@@ -214,16 +214,16 @@ class IntelBuffers {
       int *type = lmp->atom->type + offset;
       #pragma vector nontemporal
       for (int i = ifrom; i < ito; i++) {
-	_x[i].x = x[i][0];
-	_x[i].y = x[i][1];
-	_x[i].z = x[i][2];
-	_x[i].w = type[i];
+        _x[i].x = x[i][0];
+        _x[i].y = x[i][1];
+        _x[i].z = x[i][2];
+        _x[i].w = type[i];
       }
     }
   }
 
   inline void thr_pack_host(const int ifrom, const int ito,
-			    const int offset) {
+                            const int offset) {
     double ** x = lmp->atom->x + offset;
     for (int i = ifrom; i < ito; i++) {
       _host_x[i].x = x[i][0];
@@ -233,13 +233,13 @@ class IntelBuffers {
   }
 
   inline void pack_sep_from_single(const int host_min_local,
-				   const int used_local,
-				   const int host_min_ghost,
-				   const int used_ghost) {
+                                   const int used_local,
+                                   const int host_min_ghost,
+                                   const int used_ghost) {
     memcpy(_host_x + host_min_local, _x + host_min_local,
-	   used_local * sizeof(atom_t));
+           used_local * sizeof(atom_t));
     memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
-	   used_ghost * sizeof(atom_t));
+           used_ghost * sizeof(atom_t));
     int nall = used_local + used_ghost + host_min_local;
     _host_x[nall].x = INTEL_BIGP;
     _host_x[nall].y = INTEL_BIGP;
@@ -247,9 +247,9 @@ class IntelBuffers {
     _host_x[nall].w = 1;
     if (lmp->atom->q != NULL) {
       memcpy(_host_q + host_min_local, _q + host_min_local,
-	     used_local * sizeof(flt_t));
+             used_local * sizeof(flt_t));
       memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
-	     used_ghost * sizeof(flt_t));
+             used_ghost * sizeof(flt_t));
     }
   }
 
@@ -310,7 +310,7 @@ class IntelBuffers {
   _alignvar(acc_t _ev_global_host[8],64);
 
   void _grow(const int nall, const int nlocal, const int nthreads,
-	     const int offload_end);
+             const int offload_end);
   void _grow_nmax(const int offload_end);
   void _grow_list_local(NeighList *list, const int offload_end);
   void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
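
The packing helpers above (thr_pack_cop, pack_sep_from_single) copy LAMMPS
double-precision coordinates into a padded {x, y, z, w = type} array and
append one atom at INTEL_BIGP, so padded vector lanes read valid data that can
never pass a distance-cutoff test. A simplified sketch of that idea, with
assumed names and a stand-in constant:

#include <cstdio>
#include <vector>

struct atom_t { float x, y, z; int w; };
static const float BIGP = 1.0e15f;  // stand-in for INTEL_BIGP

// Pack nall atoms plus one far-away sentinel used for vector padding.
static std::vector<atom_t> pack(const double (*x)[3], const int *type,
                                int nall) {
  std::vector<atom_t> out(nall + 1);
  for (int i = 0; i < nall; i++)
    out[i] = { (float)x[i][0], (float)x[i][1], (float)x[i][2], type[i] };
  out[nall] = { BIGP, BIGP, BIGP, 1 };  // sentinel: fails any cutoff test
  return out;
}

int main() {
  double x[2][3] = {{0, 0, 0}, {1, 1, 1}};
  int type[2] = {1, 2};
  std::vector<atom_t> p = pack(x, type, 2);
  std::printf("sentinel x = %g\n", (double)p[2].x);
  return 0;
}
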
diff --git a/src/USER-INTEL/intel_intrinsics.h b/src/USER-INTEL/intel_intrinsics.h
index 56b488aa20..069eb5bed5 100644
--- a/src/USER-INTEL/intel_intrinsics.h
+++ b/src/USER-INTEL/intel_intrinsics.h
@@ -46,23 +46,23 @@ struct lmp_intel_an_fvec {
     lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; }
     lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; }
     const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const {
-        lmp_intel_an_fvec ret = *this; 
-        ret.data[:] += b.data[:]; 
+        lmp_intel_an_fvec ret = *this;
+        ret.data[:] += b.data[:];
         return ret;
     }
     const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const {
-        lmp_intel_an_fvec ret = *this; 
-        ret.data[:] -= b.data[:]; 
+        lmp_intel_an_fvec ret = *this;
+        ret.data[:] -= b.data[:];
         return ret;
     }
     const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const {
-        lmp_intel_an_fvec ret = *this; 
-        ret.data[:] *= b.data[:]; 
+        lmp_intel_an_fvec ret = *this;
+        ret.data[:] *= b.data[:];
         return ret;
     }
     const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const {
-        lmp_intel_an_fvec ret = *this; 
-        ret.data[:] /= b.data[:]; 
+        lmp_intel_an_fvec ret = *this;
+        ret.data[:] /= b.data[:];
         return ret;
     }
     lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) {
@@ -103,18 +103,18 @@ struct lmp_intel_an_ivec {
     explicit lmp_intel_an_ivec(int i) { data[:] = i; }
     explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; }
     const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) {
-        lmp_intel_an_ivec ret = *this; 
-        ret.data[:] &= b.data[:]; 
+        lmp_intel_an_ivec ret = *this;
+        ret.data[:] &= b.data[:];
         return ret;
     }
     const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) {
-        lmp_intel_an_ivec ret = *this; 
-        ret.data[:] |= b.data[:]; 
+        lmp_intel_an_ivec ret = *this;
+        ret.data[:] |= b.data[:];
         return ret;
     }
     const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) {
-        lmp_intel_an_ivec ret = *this; 
-        ret.data[:] += b.data[:]; 
+        lmp_intel_an_ivec ret = *this;
+        ret.data[:] += b.data[:];
         return ret;
     }
 };
@@ -171,13 +171,13 @@ enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN };
 
 // This is used in the selection logic
 template<CalculationMode mode>
-struct vector_traits { 
-    static const bool support_integer_and_gather_ops = true; 
+struct vector_traits {
+    static const bool support_integer_and_gather_ops = true;
 };
 
 template<>
-struct vector_traits<AVX> { 
-    static const bool support_integer_and_gather_ops = false; 
+struct vector_traits<AVX> {
+    static const bool support_integer_and_gather_ops = false;
 };
 
 // This is the base template for all the different architectures
@@ -198,10 +198,10 @@ struct ivec32x16 {
   }
   explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); }
   operator __m512i() const { return vec; }
-  friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { 
+  friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) {
     return _mm512_and_epi32(a, b);
   }
-  friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { 
+  friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) {
     return _mm512_or_epi32(a, b);
   }
   friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) {
@@ -326,7 +326,7 @@ struct vector_ops<double, KNC> {
       *z = gather<1>(*z, mask, idxs, &base->z);
       *w = int_gather<1>(*w, mask, idxs, &base->w);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  8);
@@ -337,7 +337,7 @@ struct vector_ops<double, KNC> {
       *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48);
       *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  8);
@@ -464,7 +464,7 @@ struct vector_ops<float, KNC> {
       *z = gather<1>(*z, mask, idxs, &base->z);
       *w = int_gather<1>(*w, mask, idxs, &base->w);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  4);
@@ -475,7 +475,7 @@ struct vector_ops<float, KNC> {
       *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
       *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  4);
@@ -519,10 +519,10 @@ struct ivec32x8 {
   }
   explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); }
   operator __m256i() const { return vec; }
-  friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { 
+  friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) {
     return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
   }
-  friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { 
+  friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) {
     return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
   }
   friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) {
@@ -545,10 +545,10 @@ struct avx_bvec {
   operator F64vec4() const { return _mm256_castsi256_pd(vec); }
   operator F32vec8() const { return _mm256_castsi256_ps(vec); }
   operator ivec32x8() const { return vec; }
-  friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { 
+  friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) {
     return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
   }
-  friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { 
+  friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) {
     return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)));
   }
   friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); }
@@ -582,8 +582,8 @@ struct vector_ops<double, AVX> {
       _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
       _mm256_store_pd(reinterpret_cast<double*>(src), from);
       for (int i = 0; i < VL; i++) {
-        result[i] = mask_test_at(mask, i) 
-            ? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]) 
+        result[i] = mask_test_at(mask, i)
+            ? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i])
             : src[i];
       }
       return _mm256_load_pd(reinterpret_cast<double*>(result));
@@ -605,18 +605,18 @@ struct vector_ops<double, AVX> {
       __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
       __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
       __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
-      *x = blend(mask, *x, c0); 
-      *y = blend(mask, *y, c1); 
-      *z = blend(mask, *z, c2); 
+      *x = blend(mask, *x, c0);
+      *y = blend(mask, *y, c1);
+      *z = blend(mask, *z, c2);
       *w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0)));
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       iarr i, m;
       _mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs);
@@ -642,10 +642,10 @@ struct vector_ops<double, AVX> {
       __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20);
       __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31);
       __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31);
-      *r0 = blend(mask, *r0, c0); 
-      *r1 = blend(mask, *r1, c1); 
-      *r2 = blend(mask, *r2, c2); 
-      *r3 = blend(mask, *r3, c3); 
+      *r0 = blend(mask, *r0, c0);
+      *r1 = blend(mask, *r1, c1);
+      *r2 = blend(mask, *r2, c2);
+      *r3 = blend(mask, *r3, c3);
     }
     static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
       return (b & mask) | (a & ~ mask);
@@ -809,8 +809,8 @@ struct vector_ops<float, AVX> {
       _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
       _mm256_store_ps(reinterpret_cast<float*>(src), from);
       for (int i = 0; i < VL; i++) {
-        result[i] = mask_test_at(mask, i) 
-            ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) 
+        result[i] = mask_test_at(mask, i)
+            ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
             : src[i];
       }
       return _mm256_load_ps(reinterpret_cast<float*>(result));
@@ -842,18 +842,18 @@ struct vector_ops<float, AVX> {
       __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
       __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
       __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
-      *x = blend(mask, *x, c0); 
-      *y = blend(mask, *y, c1); 
-      *z = blend(mask, *z, c2); 
+      *x = blend(mask, *x, c0);
+      *y = blend(mask, *y, c1);
+      *z = blend(mask, *z, c2);
       *w = int_blend(mask, *w, _mm256_castps_si256(c3));
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       iarr i, m;
       int_store(i, idxs);
@@ -880,10 +880,10 @@ struct vector_ops<float, AVX> {
       __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE);
       __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44);
       __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE);
-      *r0 = blend(mask, *r0, c0); 
-      *r1 = blend(mask, *r1, c1); 
-      *r2 = blend(mask, *r2, c2); 
-      *r3 = blend(mask, *r3, c3); 
+      *r0 = blend(mask, *r0, c0);
+      *r1 = blend(mask, *r1, c1);
+      *r2 = blend(mask, *r2, c2);
+      *r3 = blend(mask, *r3, c3);
     }
     static fvec blend(const bvec &mask, const fvec &a, const fvec &b) {
       return (b & mask) | (a & ~ mask);
@@ -961,8 +961,8 @@ struct vector_ops<float, AVX> {
       _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx);
       _mm256_store_si256(reinterpret_cast<__m256i*>(src), from);
       for (int i = 0; i < VL; i++) {
-        result[i] = mask_test_at(mask, i) 
-            ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) 
+        result[i] = mask_test_at(mask, i)
+            ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
             : src[i];
       }
       return _mm256_load_si256(reinterpret_cast<__m256i*>(result));
@@ -1038,10 +1038,10 @@ struct avx2_ivec32 {
   }
   explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); }
   operator __m256i() const { return vec; }
-  friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) { 
+  friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) {
     return _mm256_and_si256(a, b);
   }
-  friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) { 
+  friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) {
     return _mm256_or_si256(a, b);
   }
   friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) {
@@ -1060,14 +1060,14 @@ struct avx2_bvec {
   operator F64vec4() const { return _mm256_castsi256_pd(vec); }
   operator F32vec8() const { return _mm256_castsi256_ps(vec); }
   operator avx2_ivec32() const { return vec; }
-  friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) { 
+  friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) {
     return _mm256_and_si256(a, b);
   }
-  friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) { 
+  friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) {
     return _mm256_or_si256(a, b);
   }
   friend avx2_bvec operator ~(const avx2_bvec &a) {
-    return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF)); 
+    return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF));
   }
   avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); }
 };
@@ -1106,13 +1106,13 @@ struct vector_ops<double, AVX2> {
       *z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1);
       *w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idx, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idx, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120
       ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8);
@@ -1253,7 +1253,7 @@ struct vector_ops<float, AVX2> {
       *z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1);
       *w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  4);
@@ -1264,7 +1264,7 @@ struct vector_ops<float, AVX2> {
       *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24);
       *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) +  4);
@@ -1401,10 +1401,10 @@ struct ivec32x4 {
   }
   explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); }
   operator __m128i() const { return vec; }
-  friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) { 
+  friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) {
     return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
   }
-  friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) { 
+  friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) {
     return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
   }
   friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) {
@@ -1420,10 +1420,10 @@ struct sse_bvecx4 {
   operator __m128i() const { return vec; }
   operator F64vec2() const { return _mm_castsi128_pd(vec); }
   operator ivec32x4() const { return vec; }
-  friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) { 
+  friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) {
     return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
   }
-  friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) { 
+  friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) {
     return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b)));
   }
   friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); }
@@ -1477,18 +1477,18 @@ struct vector_ops<double, SSE> {
       __m128d c1 = _mm_unpackhi_pd(a0lo, a1lo);
       __m128d c2 = _mm_unpacklo_pd(a0hi, a1hi);
       __m128d c3 = _mm_unpackhi_pd(a0hi, a1hi);
-      *x = blend(mask, *x, c0); 
-      *y = blend(mask, *y, c1); 
-      *z = blend(mask, *z, c2); 
+      *x = blend(mask, *x, c0);
+      *y = blend(mask, *y, c1);
+      *z = blend(mask, *z, c2);
       *w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0));
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) +  8);
@@ -1634,8 +1634,8 @@ struct vector_ops<float, SSE> {
       _mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx);
       _mm_store_ps(reinterpret_cast<float*>(src), from);
       for (int i = 0; i < VL; i++) {
-        result[i] = m[i] 
-            ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) 
+        result[i] = m[i]
+            ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i])
             : src[i];
       }
       return _mm_load_ps(reinterpret_cast<float*>(result));
@@ -1647,13 +1647,13 @@ struct vector_ops<float, SSE> {
       *z = gather<1>(*z, mask, idxs, &base->z);
       *w = int_gather<1>(*w, mask, idxs, &base->w);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) +  0);
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) +  4);
@@ -1816,13 +1816,13 @@ struct vector_ops<flt_t, NONE> {
       *z = gather<1>(*z, mask, idxs, &base->z);
       *w = int_gather<1>(*w, mask, idxs, &base->w);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) +  0 * sizeof(fscal));
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) +  1 * sizeof(fscal));
@@ -1946,13 +1946,13 @@ struct vector_ops<flt_t, AN> {
       *z = gather<1>(*z, mask, idxs, &base->z);
       *w = int_gather<1>(*w, mask, idxs, &base->w);
     }
-    static void gather_8(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_8(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) {
       fvec a = zero(), b = zero(), c = zero(), d = zero();
       gather_4(idxs, mask, base, r0, r1, r2, r3);
       gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7);
     }
-    static void gather_4(const ivec &idxs, const bvec &mask, const void *base, 
+    static void gather_4(const ivec &idxs, const bvec &mask, const void *base,
         fvec *r0, fvec *r1, fvec *r2, fvec *r3) {
       *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) +  0 * sizeof(fscal));
       *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) +  1 * sizeof(fscal));
@@ -2113,7 +2113,7 @@ struct AccumulatorTwiceMixin {
 
   typedef avec_t avec;
   typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN)));
-  
+
   static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
     typename HIGH::fvec blo = BASE::cvtup_lo(b);
     typename HIGH::fvec bhi = BASE::cvtup_hi(b);
@@ -2121,7 +2121,7 @@ struct AccumulatorTwiceMixin {
     BASE::mask_cvtup(m, &mlo, &mhi);
     return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi));
   }
-  
+
   static typename HIGH::fscal acc_reduce_add(const avec &a) {
     return HIGH::reduce_add(a.lo + a.hi);
   }
@@ -2143,13 +2143,13 @@ template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic>
 struct AccumulatorTwiceMixinNone {
   typedef vector_ops<BASE_flt_t, mic> BASE;
   typedef vector_ops<HIGH_flt_t, mic> HIGH;
- 
+
   typedef typename HIGH::fvec avec;
   typedef typename HIGH::fscal aarr[BASE::VL];
-  
+
   static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) {
      return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b));
-  }  
+  }
   static typename HIGH::fscal acc_reduce_add(const avec &a) {
     return HIGH::reduce_add(a);
   }
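
Several of the AVX paths above implement masked selection with bitwise ops:
blend(mask, a, b) = (b & mask) | (a & ~mask), relying on each mask lane being
all ones (take b) or all zeros (keep a). The same bit trick on a 32-bit
scalar:

#include <cstdint>
#include <cstdio>

// mask must be 0x00000000 (keep a) or 0xFFFFFFFF (take b).
static uint32_t blend(uint32_t mask, uint32_t a, uint32_t b) {
  return (b & mask) | (a & ~mask);
}

int main() {
  std::printf("%u %u\n", blend(0u, 7u, 9u), blend(~0u, 7u, 9u));  // 7 9
  return 0;
}
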
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 93787cd6c8..d5cf6f5be2 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -134,374 +134,374 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_HTHREADS 2
 #endif
 
-#define IP_PRE_get_stride(stride, n, datasize, torque)	\
-  {								\
-    int blength = n;						\
-    if (torque) blength *= 2;					\
-    const int bytes = blength * datasize;			\
+#define IP_PRE_get_stride(stride, n, datasize, torque)          \
+  {                                                             \
+    int blength = n;                                            \
+    if (torque) blength *= 2;                                   \
+    const int bytes = blength * datasize;                       \
     stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN);     \
-    stride = blength + stride / datasize;			\
+    stride = blength + stride / datasize;                       \
   }
 
 #if defined(_OPENMP)
 
-#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) 	\
-  {								\
-    int idelta = inum/nthreads;					\
-    const int imod = inum % nthreads;				\
-    ifrom = tid * idelta;					\
-    ito = ifrom + idelta;					\
-    if (tid < imod) {						\
-      ito+=tid+1;						\
-      ifrom+=tid;						\
-    } else {							\
-      ito+=imod;						\
-      ifrom+=imod;						\
-    }								\
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
+  {                                                             \
+    int idelta = inum/nthreads;                                 \
+    const int imod = inum % nthreads;                           \
+    ifrom = tid * idelta;                                       \
+    ito = ifrom + idelta;                                       \
+    if (tid < imod) {                                           \
+      ito+=tid+1;                                               \
+      ifrom+=tid;                                               \
+    } else {                                                    \
+      ito+=imod;                                                \
+      ifrom+=imod;                                              \
+    }                                                           \
   }
 
-#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
-  {								\
-    tid = omp_get_thread_num();         			\
-    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);		\
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
+  {                                                             \
+    tid = omp_get_thread_num();                                 \
+    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);          \
   }
 
-#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)	\
-  {								\
-    if (nthr <= INTEL_HTHREADS) {				\
-      ifrom = tid;						\
-      ito = inum;					      	\
-      ip = nthr;						\
-    } else if (nthr % INTEL_HTHREADS == 0) {			\
-      int nd = nthr / INTEL_HTHREADS;				\
-      int td = tid / INTEL_HTHREADS;				\
-      int tm = tid % INTEL_HTHREADS;				\
-      IP_PRE_omp_range(ifrom, ito, td, inum, nd);		\
-      ifrom += tm;						\
-      ip = INTEL_HTHREADS;					\
-    } else {							\
-      IP_PRE_omp_range(ifrom, ito, tid, inum, nthr);		\
-      ip = 1;							\
-    }								\
+#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)      \
+  {                                                             \
+    if (nthr <= INTEL_HTHREADS) {                               \
+      ifrom = tid;                                              \
+      ito = inum;                                               \
+      ip = nthr;                                                \
+    } else if (nthr % INTEL_HTHREADS == 0) {                    \
+      int nd = nthr / INTEL_HTHREADS;                           \
+      int td = tid / INTEL_HTHREADS;                            \
+      int tm = tid % INTEL_HTHREADS;                            \
+      IP_PRE_omp_range(ifrom, ito, td, inum, nd);               \
+      ifrom += tm;                                              \
+      ip = INTEL_HTHREADS;                                      \
+    } else {                                                    \
+      IP_PRE_omp_range(ifrom, ito, tid, inum, nthr);            \
+      ip = 1;                                                   \
+    }                                                           \
   }
 
-#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)	\
-  {								\
-    tid = omp_get_thread_num();         			\
-    IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr);		\
+#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
+  {                                                             \
+    tid = omp_get_thread_num();                                 \
+    IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr);         \
   }
 
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
   int chunk_size = INTEL_DATA_ALIGN / datasize;                 \
-  int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
-				     /chunk_size/nthreads));	\
-  idelta *= chunk_size;						\
+  int idelta = static_cast<int>(ceil(static_cast<float>(inum)   \
+                                     /chunk_size/nthreads));    \
+  idelta *= chunk_size;                                         \
   ifrom = tid*idelta;                                           \
   ito = ifrom + idelta;                                         \
   if (ito > inum) ito = inum;                                   \
 }
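
IP_PRE_omp_range_align is the same block split, except that the
per-thread chunk is rounded up to a whole number of alignment
granules (INTEL_DATA_ALIGN bytes / datasize elements) so every
thread's slice starts on an aligned element; because ito is clamped
to inum, trailing threads may receive an empty range. The _vec
variants below do the same with vecsize as the granule. A sketch,
assuming an alignment of 64 bytes in place of INTEL_DATA_ALIGN:

#include <cmath>

static void omp_range_align(int tid, int inum, int nthreads,
                            int datasize, int &ifrom, int &ito) {
  const int ALIGN = 64;                // stand-in for INTEL_DATA_ALIGN
  const int chunk = ALIGN / datasize;  // elements per aligned granule
  int idelta = static_cast<int>(
      std::ceil(static_cast<float>(inum) / chunk / nthreads));
  idelta *= chunk;                     // round chunk up to the granule
  ifrom = tid * idelta;
  ito = ifrom + idelta;
  if (ito > inum) ito = inum;          // late threads may get nothing
}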
 
 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
-				nthreads, datasize)		\
-  {								\
-    tid = omp_get_thread_num();         			\
+                                nthreads, datasize)             \
+  {                                                             \
+    tid = omp_get_thread_num();                                 \
     IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads,     \
-			   datasize);				\
+                           datasize);                           \
   }
 
 #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
-				nthreads, vecsize)		\
-  {								\
-    tid = omp_get_thread_num();         			\
-    int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
-				       /vecsize/nthreads));	\
-    idelta *= vecsize;						\
-    ifrom = tid*idelta;						\
-    ito = ifrom + idelta;					\
-    if (ito > inum) ito = inum;					\
+                                nthreads, vecsize)              \
+  {                                                             \
+    tid = omp_get_thread_num();                                 \
+    int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
+                                       /vecsize/nthreads));     \
+    idelta *= vecsize;                                          \
+    ifrom = tid*idelta;                                         \
+    ito = ifrom + idelta;                                       \
+    if (ito > inum) ito = inum;                                 \
   }
 
-#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \  
-                                 nthr, vecsize)			\
-  {								\
-    tid = omp_get_thread_num();					\
-    if (nthr <= INTEL_HTHREADS) {				\
-      ifrom = tid*vecsize;					\
-      ito = inum;					      	\
-      ip = nthr*vecsize;					\
-    } else if (nthr % INTEL_HTHREADS == 0) {			\
-      int nd = nthr / INTEL_HTHREADS;				\
-      int td = tid / INTEL_HTHREADS;				\
-      int tm = tid % INTEL_HTHREADS;				\
+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)                 \
+  {                                                             \
+    tid = omp_get_thread_num();                                 \
+    if (nthr <= INTEL_HTHREADS) {                               \
+      ifrom = tid*vecsize;                                      \
+      ito = inum;                                               \
+      ip = nthr*vecsize;                                        \
+    } else if (nthr % INTEL_HTHREADS == 0) {                    \
+      int nd = nthr / INTEL_HTHREADS;                           \
+      int td = tid / INTEL_HTHREADS;                            \
+      int tm = tid % INTEL_HTHREADS;                            \
       IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd,         \
-	vecsize);						\
-      ifrom += tm * vecsize;					\
-      ip = INTEL_HTHREADS * vecsize;				\
-    } else {							\
+        vecsize);                                               \
+      ifrom += tm * vecsize;                                    \
+      ip = INTEL_HTHREADS * vecsize;                            \
+    } else {                                                    \
       IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr,      \
-			      vecsize);				\
-      ip = vecsize;						\
-    }								\
+                              vecsize);                         \
+      ip = vecsize;                                             \
+    }                                                           \
   }
 
 #else
 
-#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
-  {								\
-    ifrom = 0;							\
-    ito = inum;						        \
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
+  {                                                             \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
   }
 
-#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
-  {								\
-    tid = 0;							\
-    ifrom = 0;							\
-    ito = inum;							\
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
+  {                                                             \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
   }
 
-#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads)	\
-  {								\
-    ifrom = 0;							\
-    ito = inum;						        \
-    ip = 1;							\
+#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)      \
+  {                                                             \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+    ip = 1;                                                     \
   }
 
-#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)	\
-  {								\
-    tid = 0;							\
-    ifrom = 0;							\
-    ito = inum;							\
-    ip = 1;							\
+#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
+  {                                                             \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+    ip = 1;                                                     \
   }
 
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
-    ifrom = 0;							\
-    ito = inum;						        \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
 }
 
 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
-				nthreads, datasize)		\
-{								\
-  tid = 0;							\
-  ifrom = 0;							\
-  ito = inum;							\
+                                nthreads, datasize)             \
+{                                                               \
+  tid = 0;                                                      \
+  ifrom = 0;                                                    \
+  ito = inum;                                                   \
 }
 
 #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
-				nthreads, vecsize)		\
-  {								\
-    tid = 0;                            			\
-    ifrom = 0;							\
-    ito = inum;							\
+                                nthreads, vecsize)              \
+  {                                                             \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
   }
 
-#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum,	\
-				nthreads, vecsize)		\
-  {								\
-    tid = 0;                            			\
-    ifrom = 0;							\
-    ito = inum;							\
-    ip = vecsize;						\
+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)                 \
+  {                                                             \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ito = inum;                                                 \
+    ip = vecsize;                                               \
   }
 
 #endif
 
-#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,	\
-				  f_stride, pos, ov0, ov1, ov2,		\
-				  ov3, ov4, ov5)			\
-{									\
-  acc_t *f_scalar = &f_start[0].x;					\
-  flt_t *x_scalar = &pos[minlocal].x;					\
-  int f_stride4 = f_stride * 4;						\
-  _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);				\
-  int vwidth;								\
-  if (sizeof(acc_t) == sizeof(double))					\
-    vwidth = INTEL_COMPILE_WIDTH/2;					\
-  else									\
-    vwidth = INTEL_COMPILE_WIDTH;					\
-  if (vwidth < 4) vwidth = 4;						\
-  _use_simd_pragma("vector aligned")          				\ 
-  _use_simd_pragma("simd")					        \
-  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;			\
-  int remainder = lt % vwidth;						\
-  if (lf > lt) remainder = 0;						\
-  const int v_range = lt - remainder;					\
-  if (nthreads == 2) {							\
-    acc_t *f_scalar2 = f_scalar + f_stride4;				\
-    for (int n = lf; n < v_range; n += vwidth) {			\
-      _use_simd_pragma("vector aligned")				\ 
-      _use_simd_pragma("simd")					        \
-      for (int v = 0; v < vwidth; v++) {				\
-	f_scalar[n+v] += f_scalar2[n+v];				\
-	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
-      }									\
-      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
-      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
-      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
-      if (vwidth > 4) {							\
-	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
-	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
-	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
-      }									\
-      if (vwidth > 8) {							\
-        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
-        ov3 += f_scalar[n+13] * x_scalar[n+12];				\
-	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
-	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
-	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
-	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
-      }									\
-    }									\
-    _use_simd_pragma("vector aligned")				        \ 
-    _use_simd_pragma("ivdep")						\
-    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
-    for (int n = v_range; n < lt; n++)					\
-      f_scalar[n] += f_scalar2[n];					\
-  } else if (nthreads==4) {						\
-    acc_t *f_scalar2 = f_scalar + f_stride4;				\
-    acc_t *f_scalar3 = f_scalar2 + f_stride4;				\
-    acc_t *f_scalar4 = f_scalar3 + f_stride4;				\
-    for (int n = lf; n < v_range; n += vwidth) {			\
-      _use_simd_pragma("vector aligned")				\ 
-      _use_simd_pragma("simd")						\
-      for (int v = 0; v < vwidth; v++) {				\
-	f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +		\
-	  f_scalar4[n+v];						\
-	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
-      }									\
-      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
-      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
-      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
-      if (vwidth > 4) {							\
-	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
-	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
-	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
-      }									\
-      if (vwidth > 8) {							\
-        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
-	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
-	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
-	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
-	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
-	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
-      }									\
-    }									\
-    _use_simd_pragma("vector aligned")				        \ 
-    _use_simd_pragma("ivdep")						\
-    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
-    for (int n = v_range; n < lt; n++)				        \
-      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];	\
-  } else if (nthreads==1) {						\
-    for (int n = lf; n < v_range; n += vwidth) {			\
-      _use_simd_pragma("vector aligned")				\
-      _use_simd_pragma("simd")						\
-      for (int v = 0; v < vwidth; v++) 				        \
-	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
-      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
-      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
-      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
-      if (vwidth > 4) {							\
-        ov3 += f_scalar[n+5] * x_scalar[n+4];				\
-	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
-	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
-      }									\
-      if (vwidth > 8) {							\
-	ov3 += f_scalar[n+9] * x_scalar[n+8];				\
-	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
-	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
-	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
-	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
-	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
-      }									\
-    }									\
-  } else if (nthreads==3) {						\
-    acc_t *f_scalar2 = f_scalar + f_stride4;				\
-    acc_t *f_scalar3 = f_scalar2 + f_stride4;				\
-    for (int n = lf; n < v_range; n += vwidth) {			\
-      _use_simd_pragma("vector aligned")				\ 
-      _use_simd_pragma("simd")						\
-      for (int v = 0; v < vwidth; v++) {				\
-	f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];		\
-	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
-      }									\
-      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
-      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
-      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
-      if (vwidth > 4) {							\
-	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
-	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
-	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
-      }									\
-      if (vwidth > 8) {							\
-        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
-	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
-	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
-	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
-	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
-	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
-      }									\
-    }									\
-    _use_simd_pragma("vector aligned")				        \ 
-    _use_simd_pragma("ivdep")						\
-    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
-    for (int n = v_range; n < lt; n++)				        \
-      f_scalar[n] += f_scalar2[n] + f_scalar3[n];			\
-  }									\
-  for (int n = v_range; n < lt; n += 4) {				\
-    _use_simd_pragma("vector aligned")				        \ 
-    _use_simd_pragma("ivdep")						\
-    for (int v = 0; v < 4; v++) 				        \
-      ovv[v] += f_scalar[n+v] * x_scalar[n+v];				\
-    ov3 += f_scalar[n+1] * x_scalar[n+0];				\
-    ov4 += f_scalar[n+2] * x_scalar[n+0];				\
-    ov5 += f_scalar[n+2] * x_scalar[n+1];				\
-  }									\
-  ov0 += ovv[0];							\
-  ov1 += ovv[1];						       	\
-  ov2 += ovv[2];							\
-  if (vwidth > 4) {							\
-    ov0 += ovv[4];							\
-    ov1 += ovv[5];							\
-    ov2 += ovv[6];							\
-  }									\
-  if (vwidth > 8) {							\
-    ov0 += ovv[8] + ovv[12];						\
-    ov1 += ovv[9] + ovv[13];						\
-    ov2 += ovv[10] + ovv[14];						\
-  }									\
+#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
+                                  f_stride, pos, ov0, ov1, ov2,         \
+                                  ov3, ov4, ov5)                        \
+{                                                                       \
+  acc_t *f_scalar = &f_start[0].x;                                      \
+  flt_t *x_scalar = &pos[minlocal].x;                                   \
+  int f_stride4 = f_stride * 4;                                         \
+  _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);                         \
+  int vwidth;                                                           \
+  if (sizeof(acc_t) == sizeof(double))                                  \
+    vwidth = INTEL_COMPILE_WIDTH/2;                                     \
+  else                                                                  \
+    vwidth = INTEL_COMPILE_WIDTH;                                       \
+  if (vwidth < 4) vwidth = 4;                                           \
+  _use_simd_pragma("vector aligned")                                    \
+  _use_simd_pragma("simd")                                              \
+  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;                 \
+  int remainder = lt % vwidth;                                          \
+  if (lf > lt) remainder = 0;                                           \
+  const int v_range = lt - remainder;                                   \
+  if (nthreads == 2) {                                                  \
+    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
+    for (int n = lf; n < v_range; n += vwidth) {                        \
+      _use_simd_pragma("vector aligned")                                \
+      _use_simd_pragma("simd")                                          \
+      for (int v = 0; v < vwidth; v++) {                                \
+        f_scalar[n+v] += f_scalar2[n+v];                                \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
+      }                                                                 \
+      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
+      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
+      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
+      if (vwidth > 4) {                                                 \
+        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
+        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
+        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
+      }                                                                 \
+      if (vwidth > 8) {                                                 \
+        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
+        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
+        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
+        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
+        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
+        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
+      }                                                                 \
+    }                                                                   \
+    _use_simd_pragma("vector aligned")                                  \
+    _use_simd_pragma("ivdep")                                           \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
+    for (int n = v_range; n < lt; n++)                                  \
+      f_scalar[n] += f_scalar2[n];                                      \
+  } else if (nthreads==4) {                                             \
+    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
+    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
+    acc_t *f_scalar4 = f_scalar3 + f_stride4;                           \
+    for (int n = lf; n < v_range; n += vwidth) {                        \
+      _use_simd_pragma("vector aligned")                                \
+      _use_simd_pragma("simd")                                          \
+      for (int v = 0; v < vwidth; v++) {                                \
+        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +              \
+          f_scalar4[n+v];                                               \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
+      }                                                                 \
+      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
+      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
+      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
+      if (vwidth > 4) {                                                 \
+        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
+        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
+        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
+      }                                                                 \
+      if (vwidth > 8) {                                                 \
+        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
+        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
+        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
+        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
+        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
+        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
+      }                                                                 \
+    }                                                                   \
+    _use_simd_pragma("vector aligned")                                  \
+    _use_simd_pragma("ivdep")                                           \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
+    for (int n = v_range; n < lt; n++)                                  \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];        \
+  } else if (nthreads==1) {                                             \
+    for (int n = lf; n < v_range; n += vwidth) {                        \
+      _use_simd_pragma("vector aligned")                                \
+      _use_simd_pragma("simd")                                          \
+      for (int v = 0; v < vwidth; v++)                                  \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
+      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
+      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
+      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
+      if (vwidth > 4) {                                                 \
+        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
+        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
+        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
+      }                                                                 \
+      if (vwidth > 8) {                                                 \
+        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
+        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
+        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
+        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
+        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
+        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
+      }                                                                 \
+    }                                                                   \
+  } else if (nthreads==3) {                                             \
+    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
+    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
+    for (int n = lf; n < v_range; n += vwidth) {                        \
+      _use_simd_pragma("vector aligned")                                \
+      _use_simd_pragma("simd")                                          \
+      for (int v = 0; v < vwidth; v++) {                                \
+        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];               \
+        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
+      }                                                                 \
+      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
+      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
+      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
+      if (vwidth > 4) {                                                 \
+        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
+        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
+        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
+      }                                                                 \
+      if (vwidth > 8) {                                                 \
+        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
+        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
+        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
+        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
+        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
+        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
+      }                                                                 \
+    }                                                                   \
+    _use_simd_pragma("vector aligned")                                  \
+    _use_simd_pragma("ivdep")                                           \
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
+    for (int n = v_range; n < lt; n++)                                  \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n];                       \
+  }                                                                     \
+  for (int n = v_range; n < lt; n += 4) {                               \
+    _use_simd_pragma("vector aligned")                                  \
+    _use_simd_pragma("ivdep")                                           \
+    for (int v = 0; v < 4; v++)                                         \
+      ovv[v] += f_scalar[n+v] * x_scalar[n+v];                          \
+    ov3 += f_scalar[n+1] * x_scalar[n+0];                               \
+    ov4 += f_scalar[n+2] * x_scalar[n+0];                               \
+    ov5 += f_scalar[n+2] * x_scalar[n+1];                               \
+  }                                                                     \
+  ov0 += ovv[0];                                                        \
+  ov1 += ovv[1];                                                        \
+  ov2 += ovv[2];                                                        \
+  if (vwidth > 4) {                                                     \
+    ov0 += ovv[4];                                                      \
+    ov1 += ovv[5];                                                      \
+    ov2 += ovv[6];                                                      \
+  }                                                                     \
+  if (vwidth > 8) {                                                     \
+    ov0 += ovv[8] + ovv[12];                                            \
+    ov1 += ovv[9] + ovv[13];                                            \
+    ov2 += ovv[10] + ovv[14];                                           \
+  }                                                                     \
 }
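
The macro above folds up to four thread-private force blocks into
the first one and, in the same vectorized pass, tallies the six f.r
virial components from the flattened {x,y,z,w} quadruplets: lanes
n+0..n+2 of each quadruplet give the diagonal terms, and the
explicit n+1/n+2 cross products (repeated at offsets +4, +8, +12 for
wider vectors) give xy, xz and yz. Stripped of the vectorization,
the accumulated quantity is simply the following (plain {x,y,z}
triples here; the macro itself works on padded quadruplets):

struct V3 { double x, y, z; };

// Scalar form of the f.r virial tallied by the macro above.
static void fdotr_virial(const V3 *f, const V3 *pos, int n,
                         double ov[6]) {
  for (int k = 0; k < 6; k++) ov[k] = 0.0;
  for (int i = 0; i < n; i++) {
    ov[0] += f[i].x * pos[i].x;   // xx
    ov[1] += f[i].y * pos[i].y;   // yy
    ov[2] += f[i].z * pos[i].z;   // zz
    ov[3] += f[i].y * pos[i].x;   // xy
    ov[4] += f[i].z * pos[i].x;   // xz
    ov[5] += f[i].z * pos[i].y;   // yz
  }
}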
 
-#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,	\
-                               f_stride, pos, offload, vflag, ov0, ov1,	\
-                               ov2, ov3, ov4, ov5)			\
-{									\
-  int o_range = (nall - minlocal) * 4;					\
-  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,	\
-			    sizeof(acc_t));				\
-									\
-  acc_t *f_scalar = &f_start[0].x;					\
-  int f_stride4 = f_stride * 4;						\
-  int t;								\
-  if (vflag == 2) t = 4; else t = 1;					\
-  acc_t *f_scalar2 = f_scalar + f_stride4 * t;				\
-  for ( ; t < nthreads; t++) {						\
-    _use_simd_pragma("vector aligned")					\
-    _use_simd_pragma("simd") 					        \
-    for (int n = iifrom; n < iito; n++)				        \
-      f_scalar[n] += f_scalar2[n];					\
-    f_scalar2 += f_stride4;						\
-  }									\
-									\
-  if (vflag == 2) {							\
-    int nt_min = MIN(4,nthreads);					\
-    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,	\
-			      f_stride, pos, ov0, ov1, ov2, ov3, ov4,	\
-			      ov5);					\
-  }									\
+#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,       \
+                               f_stride, pos, offload, vflag, ov0, ov1, \
+                               ov2, ov3, ov4, ov5)                      \
+{                                                                       \
+  int o_range = (nall - minlocal) * 4;                                  \
+  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,       \
+                            sizeof(acc_t));                             \
+                                                                        \
+  acc_t *f_scalar = &f_start[0].x;                                      \
+  int f_stride4 = f_stride * 4;                                         \
+  int t;                                                                \
+  if (vflag == 2) t = 4; else t = 1;                                    \
+  acc_t *f_scalar2 = f_scalar + f_stride4 * t;                          \
+  for ( ; t < nthreads; t++) {                                          \
+    _use_simd_pragma("vector aligned")                                  \
+    _use_simd_pragma("simd")                                            \
+    for (int n = iifrom; n < iito; n++)                                 \
+      f_scalar[n] += f_scalar2[n];                                      \
+    f_scalar2 += f_stride4;                                             \
+  }                                                                     \
+                                                                        \
+  if (vflag == 2) {                                                     \
+    int nt_min = MIN(4,nthreads);                                       \
+    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,  \
+                              f_stride, pos, ov0, ov1, ov2, ov3, ov4,   \
+                              ov5);                                     \
+  }                                                                     \
 }
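
IP_PRE_fdotr_acc_force reduces the thread-private force copies: each
thread owns a block of f_stride * 4 accumulators, and the extra
blocks are summed into the first. When the virial is required
(vflag == 2) the first MIN(4, nthreads) blocks are instead folded by
the _l5 kernel above so that the virial tally rides along with the
reduction. The plain part of the reduction amounts to:

// Sum thread-private force blocks tfirst..nthreads-1 into block 0
// over this thread's slice [ifrom, ito); tfirst is 4 when the _l5
// kernel folds the first four blocks itself, otherwise 1.
static void acc_force(double *f, int f_stride4, int tfirst,
                      int nthreads, int ifrom, int ito) {
  for (int t = tfirst; t < nthreads; t++) {
    const double *ft = f + (long)t * f_stride4;
    for (int n = ifrom; n < ito; n++)
      f[n] += ft[n];
  }
}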
 
 #ifdef _LMP_INTEL_OFFLOAD
@@ -517,131 +517,131 @@ inline double MIC_Wtime() {
   return time;
 }
 
-#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,	\
-				     nlocal, nall)			\
-{									\
-    if (fix->separate_buffers() && ago != 0) {				\
-    fix->start_watch(TIME_PACK);					\
-    if (offload) {							\
-      int packthreads;							\
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
+                                     nlocal, nall)                      \
+{                                                                       \
+  if (fix->separate_buffers() && ago != 0) {                            \
+    fix->start_watch(TIME_PACK);                                        \
+    if (offload) {                                                      \
+      int packthreads;                                                  \
       if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
-      else packthreads = 1;						\
-      _use_omp_pragma("omp parallel if(packthreads > 1)")		\
-      {									\
-        int ifrom, ito, tid;						\
-	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		\
-				  packthreads, sizeof(flt_t));		\
-	buffers->thr_pack_cop(ifrom, ito, 0);				\
-	int nghost = nall - nlocal;					\
-	if (nghost) {							\
-	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,	\
-				 packthreads, sizeof(flt_t));		\
-	  buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,		\
-				fix->offload_min_ghost() - nlocal,	\
-				ago == 1);				\
-	}								\
-      }									\
-    } else {								\
-      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);		\
-      buffers->thr_pack_host(nlocal, nall,				\
-			     fix->host_min_ghost()-nlocal);		\
-    }									\
-    fix->stop_watch(TIME_PACK);						\
-  }									\
+      else packthreads = 1;                                             \
+      _use_omp_pragma("omp parallel if(packthreads > 1)")               \
+      {                                                                 \
+        int ifrom, ito, tid;                                            \
+        IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,              \
+                                  packthreads, sizeof(flt_t));          \
+        buffers->thr_pack_cop(ifrom, ito, 0);                           \
+        int nghost = nall - nlocal;                                     \
+        if (nghost) {                                                   \
+          IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,        \
+                                 packthreads, sizeof(flt_t));           \
+          buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,           \
+                                fix->offload_min_ghost() - nlocal,      \
+                                ago == 1);                              \
+        }                                                               \
+      }                                                                 \
+    } else {                                                            \
+      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);         \
+      buffers->thr_pack_host(nlocal, nall,                              \
+                             fix->host_min_ghost()-nlocal);             \
+    }                                                                   \
+    fix->stop_watch(TIME_PACK);                                         \
+  }                                                                     \
 }
 
-#define IP_PRE_get_transfern(ago, newton, eflag, vflag,			\
-			     buffers, offload, fix, separate_flag,	\
-			     x_size, q_size, ev_size, f_stride)		\
-{									\
-  separate_flag = 0;							\
-  if (ago == 0) {							\
-    x_size = 0;								\
-    q_size = nall;							\
-    if (offload) {							\
-      if (fix->separate_buffers()) {					\
-	if (lmp->atom->torque)						\
-	  separate_flag = 2;						\
-	else								\
-	  separate_flag = 1;						\
-      } else								\
-	separate_flag = 3;						\
-    }									\
-  } else {								\
-    x_size = nall;							\
-    q_size = 0;								\
-  }									\
-  ev_size = 0;								\
-  if (eflag) ev_size = 2;						\
-  if (vflag) ev_size = 8;						\
-  if (newton)								\
-    f_stride = buffers->get_stride(nall);				\
-  else									\
-    f_stride = buffers->get_stride(inum);				\
+#define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
+                             buffers, offload, fix, separate_flag,      \
+                             x_size, q_size, ev_size, f_stride)         \
+{                                                                       \
+  separate_flag = 0;                                                    \
+  if (ago == 0) {                                                       \
+    x_size = 0;                                                         \
+    q_size = nall;                                                      \
+    if (offload) {                                                      \
+      if (fix->separate_buffers()) {                                    \
+        if (lmp->atom->torque)                                          \
+          separate_flag = 2;                                            \
+        else                                                            \
+          separate_flag = 1;                                            \
+      } else                                                            \
+        separate_flag = 3;                                              \
+    }                                                                   \
+  } else {                                                              \
+    x_size = nall;                                                      \
+    q_size = 0;                                                         \
+  }                                                                     \
+  ev_size = 0;                                                          \
+  if (eflag) ev_size = 2;                                               \
+  if (vflag) ev_size = 8;                                               \
+  if (newton)                                                           \
+    f_stride = buffers->get_stride(nall);                               \
+  else                                                                  \
+    f_stride = buffers->get_stride(inum);                               \
 }
 
-#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
-			   ev_global)					\
-{									\
-  if (offload) {							\
-    tc = buffers->get_off_threads();					\
-    f_start = buffers->get_off_f();					\
-    ev_global = buffers->get_ev_global();				\
-  } else {								\
-    tc = comm->nthreads;						\
-    f_start = buffers->get_f();						\
-    fix->start_watch(TIME_HOST_PAIR);					\
-    ev_global = buffers->get_ev_global_host();				\
-  }									\
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
+                           ev_global)                                   \
+{                                                                       \
+  if (offload) {                                                        \
+    tc = buffers->get_off_threads();                                    \
+    f_start = buffers->get_off_f();                                     \
+    ev_global = buffers->get_ev_global();                               \
+  } else {                                                              \
+    tc = comm->nthreads;                                                \
+    f_start = buffers->get_f();                                         \
+    fix->start_watch(TIME_HOST_PAIR);                                   \
+    ev_global = buffers->get_ev_global_host();                          \
+  }                                                                     \
 }
 
-#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
-				  f_stride, x, q)			\
-{									\
-  if (separate_flag) {							\
-    if (separate_flag < 3) {						\
-      int all_local = nlocal;						\
-      int ghost_min = overflow[LMP_GHOST_MIN];				\
-      nlocal = overflow[LMP_LOCAL_MAX] + 1;				\
-      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;		\
-      if (nghost < 0) nghost = 0;					\
-      nall = nlocal + nghost;						\
-      separate_flag--;							\
-      int flength;							\
-      if (newton) flength = nall;					\
-      else flength = nlocal;						\
-      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		\
-			   separate_flag);				\
-      if (nghost) {							\
-	if (nlocal < all_local || ghost_min > all_local) {		\
-	  memmove(x + nlocal, x + ghost_min,				\
-		  (nall - nlocal) * sizeof(ATOM_T));			\
-	  if (q != 0)							\
-	    memmove((void *)(q + nlocal), (void *)(q + ghost_min),	\
-		    (nall - nlocal) * sizeof(flt_t));			\
-	}								\
-      }									\
-    }									\
-    x[nall].x = INTEL_BIGP;						\
-    x[nall].y = INTEL_BIGP;						\
-    x[nall].z = INTEL_BIGP;						\
-  }									\
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
+                                  f_stride, x, q)                       \
+{                                                                       \
+  if (separate_flag) {                                                  \
+    if (separate_flag < 3) {                                            \
+      int all_local = nlocal;                                           \
+      int ghost_min = overflow[LMP_GHOST_MIN];                          \
+      nlocal = overflow[LMP_LOCAL_MAX] + 1;                             \
+      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;             \
+      if (nghost < 0) nghost = 0;                                       \
+      nall = nlocal + nghost;                                           \
+      separate_flag--;                                                  \
+      int flength;                                                      \
+      if (newton) flength = nall;                                       \
+      else flength = nlocal;                                            \
+      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),             \
+                           separate_flag);                              \
+      if (nghost) {                                                     \
+        if (nlocal < all_local || ghost_min > all_local) {              \
+          memmove(x + nlocal, x + ghost_min,                            \
+                  (nall - nlocal) * sizeof(ATOM_T));                    \
+          if (q != 0)                                                   \
+            memmove((void *)(q + nlocal), (void *)(q + ghost_min),      \
+                    (nall - nlocal) * sizeof(flt_t));                   \
+        }                                                               \
+      }                                                                 \
+    }                                                                   \
+    x[nall].x = INTEL_BIGP;                                             \
+    x[nall].y = INTEL_BIGP;                                             \
+    x[nall].z = INTEL_BIGP;                                             \
+  }                                                                     \
 }
 
-#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,	\
-				f_start, f_stride, x, offload, vflag,	\
-				ov0, ov1, ov2, ov3, ov4, ov5)		\
-{								        \
-  if (newton) {								\
-    _use_omp_pragma("omp barrier");					\
-    IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,		\
-			   f_stride, x, offload, vflag, ov0, ov1, ov2,	\
-			   ov3, ov4, ov5);				\
-  }									\
+#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
+                                f_start, f_stride, x, offload, vflag,   \
+                                ov0, ov1, ov2, ov3, ov4, ov5)           \
+{                                                                       \
+  if (newton) {                                                         \
+    _use_omp_pragma("omp barrier");                                     \
+    IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,           \
+                           f_stride, x, offload, vflag, ov0, ov1, ov2,  \
+                           ov3, ov4, ov5);                              \
+  }                                                                     \
 }
 
-#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,	\
-			    ov0, ov1, ov2, ov3, ov4, ov5)		
+#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
+                            ov0, ov1, ov2, ov3, ov4, ov5)
 
 #else
 
@@ -649,164 +649,164 @@ inline double MIC_Wtime() {
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)
 
-#define IP_PRE_get_transfern(ago, newton, eflag, vflag,			\
-			     buffers, offload, fix, separate_flag,	\
-			     x_size, q_size, ev_size, f_stride)		\
+#define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
+                             buffers, offload, fix, separate_flag,      \
+                             x_size, q_size, ev_size, f_stride)         \
 {                                                                       \
-  separate_flag = 0;							\
+  separate_flag = 0;                                                    \
   int f_length;                                                         \
   if (newton)                                                           \
     f_length = nall;                                                    \
   else                                                                  \
     f_length = nlocal;                                                  \
-  f_stride = buffers->get_stride(f_length);				\
+  f_stride = buffers->get_stride(f_length);                             \
 }
 
-#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
-			   ev_global)					\
-{									\
-  tc = comm->nthreads;							\
-  f_start = buffers->get_f();						\
-  fix->start_watch(TIME_HOST_PAIR);					\
-  ev_global = buffers->get_ev_global_host();				\
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
+                           ev_global)                                   \
+{                                                                       \
+  tc = comm->nthreads;                                                  \
+  f_start = buffers->get_f();                                           \
+  fix->start_watch(TIME_HOST_PAIR);                                     \
+  ev_global = buffers->get_ev_global_host();                            \
 }
 
-#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
-				  f_stride, x, q)
-
-#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,	\
-				f_start, f_stride, x, offload, vflag,	\
-				ov0, ov1, ov2, ov3, ov4, ov5)		\
-{								        \
-  if (newton) {								\
-    if (vflag == 2 && nthreads > INTEL_HTHREADS) {			\
-      _use_omp_pragma("omp barrier");					\
-      buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2,	\
-			    ov3, ov4, ov5);				\
-    }									\
-  }									\
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
+                                  f_stride, x, q)
+
+#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
+                                f_start, f_stride, x, offload, vflag,   \
+                                ov0, ov1, ov2, ov3, ov4, ov5)           \
+{                                                                       \
+  if (newton) {                                                         \
+    if (vflag == 2 && nthreads > INTEL_HTHREADS) {                      \
+      _use_omp_pragma("omp barrier");                                   \
+      buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2,    \
+                            ov3, ov4, ov5);                             \
+    }                                                                   \
+  }                                                                     \
 }
 
-#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,	\
-			    ov0, ov1, ov2, ov3, ov4, ov5)		\
-{								        \
-  if (newton) {								\
-    if (vflag == 2 && nthreads <= INTEL_HTHREADS) {			\
-      int lt = nall * 4;						\
-      buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1,	\
-			       ov2, ov3, ov4, ov5);			\
-    }									\
-  }									\
+#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
+                            ov0, ov1, ov2, ov3, ov4, ov5)               \
+{                                                                       \
+  if (newton) {                                                         \
+    if (vflag == 2 && nthreads <= INTEL_HTHREADS) {                     \
+      int lt = nall * 4;                                                \
+      buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1,     \
+                               ov2, ov3, ov4, ov5);                     \
+    }                                                                   \
+  }                                                                     \
 }
 
 #endif
 
-#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz)		\
+#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz)            \
 {                                                                       \
   if (vflag == 1) {                                                     \
-    sv0 += delx * delx * fpair;						\
-    sv1 += dely * dely * fpair;						\
-    sv2 += delz * delz * fpair;						\
-    sv3 += delx * dely * fpair;						\
-    sv4 += delx * delz * fpair;						\
-    sv5 += dely * delz * fpair;						\
+    sv0 += delx * delx * fpair;                                         \
+    sv1 += dely * dely * fpair;                                         \
+    sv2 += delz * delz * fpair;                                         \
+    sv3 += delx * dely * fpair;                                         \
+    sv4 += delx * delz * fpair;                                         \
+    sv5 += dely * delz * fpair;                                         \
   }                                                                     \
 }
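
IP_PRE_ev_tally_nbor and the variants below accumulate the
per-interaction virial when vflag == 1. For a pair with separation
del = (delx, dely, delz) and force F = fpair * del, the contribution
is the outer product of del with F, stored as its six unique
components. In scalar form:

// Per-pair virial tally as in IP_PRE_ev_tally_nbor (vflag == 1):
// sv holds the running xx, yy, zz, xy, xz, yz components.
static void tally_nbor(double sv[6], double fpair,
                       double delx, double dely, double delz) {
  sv[0] += delx * delx * fpair;  // xx
  sv[1] += dely * dely * fpair;  // yy
  sv[2] += delz * delz * fpair;  // zz
  sv[3] += delx * dely * fpair;  // xy
  sv[4] += delx * delz * fpair;  // xz
  sv[5] += dely * delz * fpair;  // yz
}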
 
-#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz)		\
+#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz)         \
 {                                                                       \
   if (vflag == 1) {                                                     \
-    sv0 += dx * fpx;							\
-    sv1 += dy * fpy;							\
-    sv2 += dz * fpz;							\
-    sv3 += dx * fpy;							\
-    sv4 += dx * fpz;							\
-    sv5 += dy * fpz;							\
+    sv0 += dx * fpx;                                                    \
+    sv1 += dy * fpy;                                                    \
+    sv2 += dz * fpz;                                                    \
+    sv3 += dx * fpy;                                                    \
+    sv4 += dx * fpz;                                                    \
+    sv5 += dy * fpz;                                                    \
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2)	\
+#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2)   \
 {                                                                       \
   if (vflag == 1) {                                                     \
     sv0 += delx * fj[0] + delr2[0] * fk[0];                             \
-    sv1 += dely * fj[1] + delr2[1] * fk[1];				\
-    sv2 += delz * fj[2] + delr2[2] * fk[2];				\
-    sv3 += delx * fj[1] + delr2[0] * fk[1];				\
-    sv4 += delx * fj[2] + delr2[0] * fk[2];				\
-    sv5 += dely * fj[2] + delr2[1] * fk[2];				\
+    sv1 += dely * fj[1] + delr2[1] * fk[1];                             \
+    sv2 += delz * fj[2] + delr2[2] * fk[2];                             \
+    sv3 += delx * fj[1] + delr2[0] * fk[1];                             \
+    sv4 += delx * fj[2] + delr2[0] * fk[2];                             \
+    sv5 += dely * fj[2] + delr2[1] * fk[2];                             \
   }                                                                     \
 }
 
 #define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz)  \
 {                                                                       \
   if (vflag == 1) {                                                     \
-    sv0 += delx * fj0;							\
-    sv1 += dely * fj1;							\
-    sv2 += delz * fj2;							\
-    sv3 += delx * fj1;							\
-    sv4 += delx * fj2;							\
-    sv5 += dely * fj2;							\
+    sv0 += delx * fj0;                                                  \
+    sv1 += dely * fj1;                                                  \
+    sv2 += delz * fj2;                                                  \
+    sv3 += delx * fj1;                                                  \
+    sv4 += delx * fj2;                                                  \
+    sv5 += dely * fj2;                                                  \
   }                                                                     \
 }
 
 #define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
-			     fbond, delx, dely, delz, obond, force,	\
-			     newton, nlocal, ov0, ov1, ov2, ov3, ov4,	\
-			     ov5)					\
+                             fbond, delx, dely, delz, obond, force,     \
+                             newton, nlocal, ov0, ov1, ov2, ov3, ov4,   \
+                             ov5)                                       \
 {                                                                       \
-  flt_t ev_pre;								\
-  if (newton) ev_pre = (flt_t)1.0;					\
-  else {								\
-    ev_pre = (flt_t)0.0;						\
-    if (i1 < nlocal) ev_pre += (flt_t)0.5;				\
-    if (i2 < nlocal) ev_pre += (flt_t)0.5;				\
-  }									\
-									\
-  if (eflag) {								\
-    obond += ev_pre * ebond;						\
-    if (eatom) {							\
-      flt_t halfeng = ebond * (flt_t)0.5;				\
-      if (newton || i1 < nlocal) f[i1].w += halfeng;			\
-      if (newton || i2 < nlocal) f[i2].w += halfeng;			\
-    }									\
-  }									\
-									\
-  if (VFLAG && vflag) {							\
-    ov0 += ev_pre * (delx * delx * fbond);				\
-    ov1 += ev_pre * (dely * dely * fbond);				\
-    ov2 += ev_pre * (delz * delz * fbond);				\
-    ov3 += ev_pre * (delx * dely * fbond);				\
-    ov4 += ev_pre * (delx * delz * fbond);				\
-    ov5 += ev_pre * (dely * delz * fbond);				\
+  flt_t ev_pre;                                                         \
+  if (newton) ev_pre = (flt_t)1.0;                                      \
+  else {                                                                \
+    ev_pre = (flt_t)0.0;                                                \
+    if (i1 < nlocal) ev_pre += (flt_t)0.5;                              \
+    if (i2 < nlocal) ev_pre += (flt_t)0.5;                              \
+  }                                                                     \
+                                                                        \
+  if (eflag) {                                                          \
+    obond += ev_pre * ebond;                                            \
+    if (eatom) {                                                        \
+      flt_t halfeng = ebond * (flt_t)0.5;                               \
+      if (newton || i1 < nlocal) f[i1].w += halfeng;                    \
+      if (newton || i2 < nlocal) f[i2].w += halfeng;                    \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  if (VFLAG && vflag) {                                                 \
+    ov0 += ev_pre * (delx * delx * fbond);                              \
+    ov1 += ev_pre * (dely * dely * fbond);                              \
+    ov2 += ev_pre * (delz * delz * fbond);                              \
+    ov3 += ev_pre * (delx * dely * fbond);                              \
+    ov4 += ev_pre * (delx * delz * fbond);                              \
+    ov5 += ev_pre * (dely * delz * fbond);                              \
   }                                                                     \
 }
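
The bonded tally macros weight every term by ev_pre: with
newton_bond on, the rank that computes the term tallies it once in
full; without it, each of the N atoms in the interaction (2 for
bonds, 3 for angles, 4 for dihedrals) contributes 1/N, counted only
when that atom is local, so the sum over all MPI ranks still comes
to exactly one term. A sketch of that weighting:

// ev_pre weighting used by the bonded tally macros: without
// newton_bond, each of the natoms atoms in the term carries
// 1/natoms of the energy/virial, tallied only if the atom is owned
// by this rank (index < nlocal).
static double ev_prefactor(bool newton, int natoms,
                           const int *idx, int nlocal) {
  if (newton) return 1.0;
  double pre = 0.0;
  for (int k = 0; k < natoms; k++)
    if (idx[k] < nlocal) pre += 1.0 / natoms;
  return pre;
}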
 
 #define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1,   \
-			      i2, i3, f1x, f1y, f1z, f3x, f3y, f3z,	\
-			      delx1, dely1, delz1, delx2, dely2, delz2,	\
-			      oeangle, force, newton, nlocal, ov0, ov1, \
-			      ov2, ov3, ov4, ov5)			\
+                              i2, i3, f1x, f1y, f1z, f3x, f3y, f3z,     \
+                              delx1, dely1, delz1, delx2, dely2, delz2, \
+                              oeangle, force, newton, nlocal, ov0, ov1, \
+                              ov2, ov3, ov4, ov5)                       \
 {                                                                       \
-  flt_t ev_pre;								\
-  if (newton) ev_pre = (flt_t)1.0;					\
-  else {								\
-    ev_pre = (flt_t)0.0;						\
-    if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
-    if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
-    if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
-  }									\
-									\
-  if (eflag) {								\
-    oeangle += ev_pre * eangle;						\
-    if (eatom) {							\
-      flt_t thirdeng = eangle * (flt_t)0.3333333333333333;		\
-      if (newton || i1 < nlocal) f[i1].w += thirdeng;			\
-      if (newton || i2 < nlocal) f[i2].w += thirdeng;			\
-      if (newton || i3 < nlocal) f[i3].w += thirdeng;			\
-    }									\
-  }									\
-									\
-  if (VFLAG && vflag) {							\
+  flt_t ev_pre;                                                         \
+  if (newton) ev_pre = (flt_t)1.0;                                      \
+  else {                                                                \
+    ev_pre = (flt_t)0.0;                                                \
+    if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
+    if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
+    if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
+  }                                                                     \
+                                                                        \
+  if (eflag) {                                                          \
+    oeangle += ev_pre * eangle;                                         \
+    if (eatom) {                                                        \
+      flt_t thirdeng = eangle * (flt_t)0.3333333333333333;              \
+      if (newton || i1 < nlocal) f[i1].w += thirdeng;                   \
+      if (newton || i2 < nlocal) f[i2].w += thirdeng;                   \
+      if (newton || i3 < nlocal) f[i3].w += thirdeng;                   \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  if (VFLAG && vflag) {                                                 \
     ov0 += ev_pre * (delx1 * f1x + delx2 * f3x);                        \
     ov1 += ev_pre * (dely1 * f1y + dely2 * f3y);                        \
     ov2 += ev_pre * (delz1 * f1z + delz2 * f3z);                        \
@@ -817,74 +817,74 @@ inline double MIC_Wtime() {
 }
 
 #define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
-			      i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
-			      f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y,	\
-			      vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
-			      newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
-			      ov5)					\
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
+                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y,   \
+                              vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
+                              newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
+                              ov5)                                      \
 {                                                                       \
-  flt_t ev_pre;								\
-  if (newton) ev_pre = (flt_t)1.0;					\
-  else {								\
-    ev_pre = (flt_t)0.0;						\
-    if (i1 < nlocal) ev_pre += (flt_t)0.25;				\
-    if (i2 < nlocal) ev_pre += (flt_t)0.25;				\
-    if (i3 < nlocal) ev_pre += (flt_t)0.25;				\
-    if (i4 < nlocal) ev_pre += (flt_t)0.25;				\
-  }									\
-									\
-  if (eflag) {								\
-    oedihedral += ev_pre * deng;					\
-    if (eatom) {							\
-      flt_t qdeng = deng * (flt_t)0.25;					\
-      if (newton || i1 < nlocal) f[i1].w += qdeng;			\
-      if (newton || i2 < nlocal) f[i2].w += qdeng;			\
-      if (newton || i3 < nlocal) f[i3].w += qdeng;			\
-      if (newton || i4 < nlocal) f[i4].w += qdeng;			\
-    }									\
-  }									\
-									\
-  if (VFLAG && vflag) {							\
-    ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
-    ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
-    ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
-    ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);		\
-    ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);		\
-    ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);		\
+  flt_t ev_pre;                                                         \
+  if (newton) ev_pre = (flt_t)1.0;                                      \
+  else {                                                                \
+    ev_pre = (flt_t)0.0;                                                \
+    if (i1 < nlocal) ev_pre += (flt_t)0.25;                             \
+    if (i2 < nlocal) ev_pre += (flt_t)0.25;                             \
+    if (i3 < nlocal) ev_pre += (flt_t)0.25;                             \
+    if (i4 < nlocal) ev_pre += (flt_t)0.25;                             \
+  }                                                                     \
+                                                                        \
+  if (eflag) {                                                          \
+    oedihedral += ev_pre * deng;                                        \
+    if (eatom) {                                                        \
+      flt_t qdeng = deng * (flt_t)0.25;                                 \
+      if (newton || i1 < nlocal) f[i1].w += qdeng;                      \
+      if (newton || i2 < nlocal) f[i2].w += qdeng;                      \
+      if (newton || i3 < nlocal) f[i3].w += qdeng;                      \
+      if (newton || i4 < nlocal) f[i4].w += qdeng;                      \
+    }                                                                   \
+  }                                                                     \
+                                                                        \
+  if (VFLAG && vflag) {                                                 \
+    ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);            \
+    ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);            \
+    ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);            \
+    ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);            \
+    ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);            \
+    ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);            \
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp)    	\
-{									\
-  if (eflag) {								\
-    f[i].w += fwtmp;							\
-    oevdwl += sevdwl;							\
-  }									\
-  if (newton == 0 && vflag == 1) {					\
-    ov0 += sv0;								\
-    ov1 += sv1;								\
-    ov2 += sv2;								\
-    ov3 += sv3;								\
-    ov4 += sv4;								\
-    ov5 += sv5;								\
-  }									\
+#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp)            \
+{                                                                       \
+  if (eflag) {                                                          \
+    f[i].w += fwtmp;                                                    \
+    oevdwl += sevdwl;                                                   \
+  }                                                                     \
+  if (newton == 0 && vflag == 1) {                                      \
+    ov0 += sv0;                                                         \
+    ov1 += sv1;                                                         \
+    ov2 += sv2;                                                         \
+    ov3 += sv3;                                                         \
+    ov4 += sv4;                                                         \
+    ov5 += sv5;                                                         \
+  }                                                                     \
 }
 
-#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp)    	\
-{									\
-  if (eflag) {								\
-    f[i].w += fwtmp;							\
-    oevdwl += sevdwl;							\
-    oecoul += secoul;							\
-  }									\
-  if (newton == 0 && vflag == 1) {					\
-    ov0 += sv0;								\
-    ov1 += sv1;								\
-    ov2 += sv2;								\
-    ov3 += sv3;								\
-    ov4 += sv4;								\
-    ov5 += sv5;								\
-  }									\
+#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp)           \
+{                                                                       \
+  if (eflag) {                                                          \
+    f[i].w += fwtmp;                                                    \
+    oevdwl += sevdwl;                                                   \
+    oecoul += secoul;                                                   \
+  }                                                                     \
+  if (newton == 0 && vflag == 1) {                                      \
+    ov0 += sv0;                                                         \
+    ov1 += sv1;                                                         \
+    ov2 += sv2;                                                         \
+    ov3 += sv3;                                                         \
+    ov4 += sv4;                                                         \
+    ov5 += sv5;                                                         \
+  }                                                                     \
 }
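
The two per-atom tally macros above only fold thread-private partial sums (`sevdwl`, `secoul`, `sv0`..`sv5`) into the owner accumulators; note that the virial is tallied here only for `newton == 0 && vflag == 1`. A compact sketch of the same accumulation, with hypothetical names:

    // Sketch of the per-atom accumulation performed by the macros above.
    struct EVAccum { double evdwl, ecoul, v[6]; };
    inline void ev_tally_atomq_sketch(EVAccum &o, const EVAccum &s,
                                      int newton, int eflag, int vflag) {
      if (eflag) { o.evdwl += s.evdwl; o.ecoul += s.ecoul; }
      if (newton == 0 && vflag == 1)     // per-pair virial only in this case
        for (int k = 0; k < 6; k++) o.v[k] += s.v[k];
    }
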
 
 }
diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h
index aa03a6f136..4616f628e7 100644
--- a/src/USER-INTEL/intel_simd.h
+++ b/src/USER-INTEL/intel_simd.h
@@ -42,25 +42,25 @@ namespace ip_simd {
   struct SIMD_int {
     __m512i v;
     SIMD_int() {}
-    SIMD_int(const __m512i in) : v(in) {} 
+    SIMD_int(const __m512i in) : v(in) {}
     operator __m512i() const { return v;}
   };
 
   struct SIMD_float {
     __m512 v;
     SIMD_float() {}
-    SIMD_float(const __m512 in) : v(in) {} 
+    SIMD_float(const __m512 in) : v(in) {}
     operator __m512() const { return v;}
   };
 
   struct SIMD_double {
     __m512d v;
     SIMD_double() {}
-    SIMD_double(const __m512d in) : v(in) {} 
+    SIMD_double(const __m512d in) : v(in) {}
     operator __m512d() const { return v;}
   };
 
-  template<class flt_t> 
+  template<class flt_t>
   class SIMD_type {
   };
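
`SIMD_int`/`SIMD_float`/`SIMD_double` are zero-cost wrappers: the `__m512` family cannot take member operator overloads, so the header wraps each in a one-member struct with implicit conversions in both directions, and the free operators later in the file do the rest. A minimal standalone sketch of the pattern (assumes an AVX-512F compiler; `vec_f` is illustrative):

    #include <immintrin.h>
    struct vec_f {                         // mirrors SIMD_float
      __m512 v;
      vec_f() {}
      vec_f(const __m512 in) : v(in) {}
      operator __m512() const { return v; }
    };
    inline vec_f operator+(const vec_f &a, const vec_f &b) {
      return _mm512_add_ps(a, b);          // conversions keep call sites terse
    }
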
 
@@ -92,20 +92,20 @@ namespace ip_simd {
 
   // ------- Set Operations
 
-  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, 
-			   const int l3, const int l4, const int l5,
-			   const int l6, const int l7, const int l8,
-			   const int l9, const int l10, const int l11,
-			   const int l12, const int l13, const int l14,
-			   const int l15) {
+  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
+                           const int l3, const int l4, const int l5,
+                           const int l6, const int l7, const int l8,
+                           const int l9, const int l10, const int l11,
+                           const int l12, const int l13, const int l14,
+                           const int l15) {
     return _mm512_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7,
-			     l8,l9,l10,l11,l12,l13,l14,l15);
+                             l8,l9,l10,l11,l12,l13,l14,l15);
   }
 
   inline SIMD_int SIMD_set(const int l) {
     return _mm512_set1_epi32(l);
   }
-  
+
   inline SIMD_float SIMD_set(const float l) {
     return _mm512_set1_ps(l);
   }
@@ -113,28 +113,28 @@ namespace ip_simd {
   inline SIMD_double SIMD_set(const double l) {
     return _mm512_set1_pd(l);
   }
-  
+
   inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
     return _mm512_maskz_mov_epi32(m, one);
   }
 
-  inline SIMD_float SIMD_zero_masked(const SIMD_mask &m, 
-				     const SIMD_float &one) {
+  inline SIMD_float SIMD_zero_masked(const SIMD_mask &m,
+                                     const SIMD_float &one) {
     return _mm512_maskz_mov_ps(m, one);
   }
 
-  inline SIMD_double SIMD_zero_masked(const SIMD_mask &m, 
-				     const SIMD_double &one) {
+  inline SIMD_double SIMD_zero_masked(const SIMD_mask &m,
+                                      const SIMD_double &one) {
     return _mm512_maskz_mov_pd(m, one);
   }
 
-  inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, 
-			     const SIMD_float &one) {
+  inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m,
+                             const SIMD_float &one) {
     return _mm512_mask_mov_ps(src,m,one);
   }
 
-  inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, 
-			      const SIMD_double &one) {
+  inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m,
+                              const SIMD_double &one) {
     return _mm512_mask_mov_pd(src,m,one);
   }
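
The set/zero helpers above are thin names over the two masked-move flavors: per lane, `maskz_mov` substitutes zero where the mask bit is clear, while `mask_mov` falls back to `src`. A scalar lane-by-lane model (illustrative only):

    // dst[k] = m[k] ? one[k] : src[k]   (mask_mov; maskz_mov uses src = 0)
    void mask_mov_model(const bool *m, const float *src, const float *one,
                        float *dst, int lanes) {
      for (int k = 0; k < lanes; k++)
        dst[k] = m[k] ? one[k] : src[k];
    }
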
 
@@ -147,11 +147,11 @@ namespace ip_simd {
   inline SIMD_float SIMD_load(const float *p) {
     return _mm512_load_ps(p);
   }
-  
+
   inline SIMD_double SIMD_load(const double *p) {
     return _mm512_load_pd(p);
   }
-  
+
   inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
     return _mm512_maskz_load_epi32(m, p);
   }
@@ -159,7 +159,7 @@ namespace ip_simd {
   inline SIMD_float SIMD_loadz(const SIMD_mask &m, const float *p) {
     return _mm512_maskz_load_ps(m, p);
   }
-  
+
   inline SIMD_double SIMD_loadz(const SIMD_mask &m, const double *p) {
     return _mm512_maskz_load_pd(m, p);
   }
@@ -168,7 +168,7 @@ namespace ip_simd {
     return _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
   }
 
-  inline SIMD_float SIMD_gather(const float *p,	const SIMD_int &i) {
+  inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) {
     return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
   }
 
@@ -177,56 +177,56 @@ namespace ip_simd {
   }
 
   inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
-			      const SIMD_int &i) {
+                              const SIMD_int &i) {
     return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, p,
-				       _MM_SCALE_4);
+                                       _MM_SCALE_4);
   }
 
   inline SIMD_float SIMD_gather(const SIMD_mask &m, const float *p,
-				const SIMD_int &i) {
+                                const SIMD_int &i) {
     return _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, p,
-				    _MM_SCALE_4);
+                                    _MM_SCALE_4);
   }
 
   inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
-				 const SIMD_int &i) {
+                                 const SIMD_int &i) {
     return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p,
-				      _MM_SCALE_8);
+                                      _MM_SCALE_8);
   }
 
   template <typename T>
   inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
-				      const SIMD_int &i) {
+                                      const SIMD_int &i) {
   }
 
   template <>
   inline SIMD_int SIMD_gatherz_offset<float>(const SIMD_mask &m, const int *p,
-					     const SIMD_int &i) {
+                                             const SIMD_int &i) {
     return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
-				       _MM_SCALE_4);
+                                       _MM_SCALE_4);
   }
 
   template <>
   inline SIMD_int SIMD_gatherz_offset<double>(const SIMD_mask &m, const int *p,
-					      const SIMD_int &i) {
+                                              const SIMD_int &i) {
     return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
-				       _MM_SCALE_8);
+                                       _MM_SCALE_8);
   }
 
   inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
-				 const SIMD_int &i) {
+                                 const SIMD_int &i) {
     return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p,
-				    _MM_SCALE_4);
+                                    _MM_SCALE_4);
   }
 
   inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
-				  const SIMD_int &i) {
+                                  const SIMD_int &i) {
     return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p,
-				      _MM_SCALE_8);
+                                      _MM_SCALE_8);
   }
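
`SIMD_gatherz` forces inactive lanes to 0.0 rather than leaving them undefined, which lets later arithmetic run unmasked. For doubles the indices stay 32-bit (`i32logather`, only the low eight indices used) with an 8-byte scale. A scalar model of the double flavor (illustrative):

    // out[k] = m[k] ? p[i[k]] : 0.0   for the low eight 32-bit indices
    void gatherz_pd_model(const bool *m, const double *p, const int *i,
                          double *out) {
      for (int k = 0; k < 8; k++)
        out[k] = m[k] ? p[i[k]] : 0.0;
    }
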
 
   // ------- Store Operations
-  
+
   inline void SIMD_store(int *p, const SIMD_int &one) {
     return _mm512_store_epi32(p,one);
   }
@@ -240,17 +240,17 @@ namespace ip_simd {
   }
 
   inline void SIMD_scatter(const SIMD_mask &m, int *p,
-			   const SIMD_int &i, const SIMD_int &vec) {
+                           const SIMD_int &i, const SIMD_int &vec) {
     _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
   }
 
   inline void SIMD_scatter(const SIMD_mask &m, float *p,
-			   const SIMD_int &i, const SIMD_float &vec) {
+                           const SIMD_int &i, const SIMD_float &vec) {
     _mm512_mask_i32scatter_ps(p, m, i, vec, _MM_SCALE_4);
   }
 
   inline void SIMD_scatter(const SIMD_mask &m, double *p,
-			   const SIMD_int &i, const SIMD_double &vec) {
+                           const SIMD_int &i, const SIMD_double &vec) {
     _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8);
   }
 
@@ -263,76 +263,76 @@ namespace ip_simd {
   inline SIMD_float operator+(const SIMD_float &one, const SIMD_float &two) {
     return _mm512_add_ps(one,two);
   }
-  
+
   inline SIMD_double operator+(const SIMD_double &one, const SIMD_double &two) {
     return _mm512_add_pd(one,two);
   }
-  
+
   inline SIMD_int operator+(const SIMD_int &one, const int two) {
     return _mm512_add_epi32(one,SIMD_set(two));
   }
-  
+
   inline SIMD_float operator+(const SIMD_float &one, const float two) {
     return _mm512_add_ps(one,SIMD_set(two));
   }
-  
+
   inline SIMD_double operator+(const SIMD_double &one, const double two) {
     return _mm512_add_pd(one,SIMD_set(two));
   }
 
   inline SIMD_int SIMD_add(const SIMD_mask &m,
-			   const SIMD_int &one, const int two) {
+                           const SIMD_int &one, const int two) {
     return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
   }
 
   inline SIMD_float SIMD_add(const SIMD_mask &m,
-			     const SIMD_float &one, const float two) {
+                             const SIMD_float &one, const float two) {
     return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
   }
 
   inline SIMD_double SIMD_add(const SIMD_mask &m,
-			      const SIMD_double &one, const double two) {
+                              const SIMD_double &one, const double two) {
     return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
   }
 
   inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
-			   const SIMD_int &one, const SIMD_int &two) {
+                           const SIMD_int &one, const SIMD_int &two) {
     return _mm512_mask_add_epi32(s,m,one,two);
   }
 
   inline SIMD_float SIMD_add(const SIMD_float &s, const SIMD_mask &m,
-			     const SIMD_float &one, const SIMD_float &two) {
+                             const SIMD_float &one, const SIMD_float &two) {
     return _mm512_mask_add_ps(s,m,one,two);
   }
 
   inline SIMD_double SIMD_add(const SIMD_double &s, const SIMD_mask &m,
-			      const SIMD_double &one, const SIMD_double &two) {
+                              const SIMD_double &one, const SIMD_double &two) {
     return _mm512_mask_add_pd(s,m,one,two);
   }
 
   inline SIMD_int SIMD_sub(const SIMD_int &s, const SIMD_mask &m,
-			   const SIMD_int &one, const SIMD_int &two) {
+                           const SIMD_int &one, const SIMD_int &two) {
     return _mm512_mask_sub_epi32(s,m,one,two);
   }
 
   inline SIMD_float SIMD_sub(const SIMD_float &s, const SIMD_mask &m,
-			     const SIMD_float &one, const SIMD_float &two) {
+                             const SIMD_float &one, const SIMD_float &two) {
     return _mm512_mask_sub_ps(s,m,one,two);
   }
 
   inline SIMD_double SIMD_sub(const SIMD_double &s, const SIMD_mask &m,
-			      const SIMD_double &one, const SIMD_double &two) {
+                              const SIMD_double &one, const SIMD_double &two) {
     return _mm512_mask_sub_pd(s,m,one,two);
   }
 
   inline SIMD_int operator-(const SIMD_int &one) {
     return _mm512_sub_epi32(SIMD_set((int)0),one);
   }
-  
+
   inline SIMD_float operator-(const SIMD_float &one) {
     return _mm512_sub_ps(SIMD_set((float)0),one);
   }
-  
+
   inline SIMD_double operator-(const SIMD_double &one) {
     return _mm512_sub_pd(SIMD_set((double)0),one);
   }
@@ -340,80 +340,80 @@ namespace ip_simd {
   inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_sub_epi32(one,two);
   }
-  
+
   inline SIMD_float operator-(const SIMD_float &one, const SIMD_float &two) {
     return _mm512_sub_ps(one,two);
   }
-  
+
   inline SIMD_double operator-(const SIMD_double &one, const SIMD_double &two) {
     return _mm512_sub_pd(one,two);
   }
-  
+
   inline SIMD_int operator-(const SIMD_int &one, const int two) {
     return _mm512_sub_epi32(one,SIMD_set(two));
   }
-  
+
   inline SIMD_float operator-(const SIMD_float &one, const float two) {
     return _mm512_sub_ps(one,SIMD_set(two));
   }
-  
+
   inline SIMD_double operator-(const SIMD_double &one, const double two) {
     return _mm512_sub_pd(one,SIMD_set(two));
   }
-  
+
   inline SIMD_int operator*(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_mullo_epi32(one,two);
   }
-  
+
   inline SIMD_float operator*(const SIMD_float &one, const SIMD_float &two) {
     return _mm512_mul_ps(one,two);
   }
-  
+
   inline SIMD_double operator*(const SIMD_double &one, const SIMD_double &two) {
     return _mm512_mul_pd(one,two);
   }
-  
+
   inline SIMD_int operator*(const SIMD_int &one, const int two) {
     return _mm512_mullo_epi32(one,SIMD_set(two));
   }
-  
+
   inline SIMD_float operator*(const SIMD_float &one, const float two) {
     return _mm512_mul_ps(one,SIMD_set(two));
   }
-  
+
   inline SIMD_double operator*(const SIMD_double &one, const double two) {
     return _mm512_mul_pd(one,SIMD_set(two));
   }
-  
+
   inline SIMD_float operator/(const SIMD_float &one, const SIMD_float &two) {
     return _mm512_div_ps(one,two);
   }
-  
+
   inline SIMD_double operator/(const SIMD_double &one, const SIMD_double &two) {
     return _mm512_div_pd(one,two);
   }
-  
+
   inline SIMD_float SIMD_fma(const SIMD_float &one, const SIMD_float &two,
-			     const SIMD_float &three) {
+                             const SIMD_float &three) {
     return _mm512_fmadd_ps(one,two,three);
   }
 
   inline SIMD_double SIMD_fma(const SIMD_double &one, const SIMD_double &two,
-			      const SIMD_double &three) {
+                              const SIMD_double &three) {
     return _mm512_fmadd_pd(one,two,three);
   }
 
   inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
-			     const SIMD_float &three) {
+                             const SIMD_float &three) {
     return _mm512_fmsub_ps(one,two,three);
   }
 
   inline SIMD_double SIMD_fms(const SIMD_double &one, const SIMD_double &two,
-			      const SIMD_double &three) {
+                              const SIMD_double &three) {
     return _mm512_fmsub_pd(one,two,three);
   }
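
`SIMD_fma`/`SIMD_fms` map directly onto the fused multiply-add instructions, computing `a*b+c` and `a*b-c` with a single rounding step. One-lane scalar equivalents via `std::fma`:

    #include <cmath>
    inline float fma_lane(float a, float b, float c) { return std::fma(a, b,  c); }
    inline float fms_lane(float a, float b, float c) { return std::fma(a, b, -c); }
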
-  
-  // ------- SVML operations  
+
+  // ------- SVML operations
 
   inline SIMD_float SIMD_rcp(const SIMD_float &one) {
     #ifdef __AVX512ER__
@@ -489,33 +489,33 @@ namespace ip_simd {
 
   // ------- Comparison operations
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, 
-			   const SIMD_int &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
+                           const SIMD_int &two) {
     return _mm512_mask_cmplt_epi32_mask(m, one, two);
   }
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one, 
-			   const SIMD_float &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one,
+                           const SIMD_float &two) {
     return _mm512_mask_cmplt_ps_mask(m, one, two);
   }
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one, 
-			   const SIMD_double &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one,
+                           const SIMD_double &two) {
     return _mm512_mask_cmplt_pd_mask(m, one, two);
   }
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const int one, 
-			   const SIMD_int &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const int one,
+                           const SIMD_int &two) {
     return _mm512_mask_cmplt_epi32_mask(m, SIMD_set(one), two);
   }
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const float one, 
-			   const SIMD_float &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const float one,
+                           const SIMD_float &two) {
     return _mm512_mask_cmplt_ps_mask(m, SIMD_set(one), two);
   }
 
-  inline SIMD_mask SIMD_lt(SIMD_mask m, const double one, 
-			   const SIMD_double &two) {
+  inline SIMD_mask SIMD_lt(SIMD_mask m, const double one,
+                           const SIMD_double &two) {
     return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
   }
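
All the `SIMD_lt` overloads thread an incoming mask through the compare, so a result bit is set only where the previous mask bit was set and `one < two` holds; chained cutoff tests therefore stay branch-free. A scalar model of the mask semantics (illustrative):

    // r[k] = m[k] && (one[k] < two[k]), packed into a 16-bit mask
    unsigned lt_mask_model(unsigned m, const float *one, const float *two) {
      unsigned r = 0;
      for (int k = 0; k < 16; k++)
        if (((m >> k) & 1) && one[k] < two[k]) r |= 1u << k;
      return r;
    }
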
 
@@ -629,112 +629,112 @@ namespace ip_simd {
 
   // i indices should be positive
   inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
-				       SIMD_float &v1) {
+                                       SIMD_float &v1) {
     SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
     SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
     SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
     if (todo_mask) {
       SIMD_int lz  = _mm512_lzcnt_epi32(cd);
       SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
-				      _mm512_lzcnt_epi32(cd));
-      
+                                      _mm512_lzcnt_epi32(cd));
+
       while(todo_mask) {
-	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
-	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
-							  todo_bcast);
-	SIMD_float am_perm;
-	am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
-					     now_mask, lid, v1);
-	v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
-	todo_mask = _mm512_kxor(todo_mask, now_mask);
+        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
+                                                          todo_bcast);
+        SIMD_float am_perm;
+        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
+                                             now_mask, lid, v1);
+        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
+        todo_mask = _mm512_kxor(todo_mask, now_mask);
       }
     }
   }
 
   // i indices should be positive
   inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
-				       SIMD_double &v1) {
+                                       SIMD_double &v1) {
     SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
     SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
     SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
     if (todo_mask) {
       SIMD_int lz  = _mm512_lzcnt_epi32(cd);
       SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
-				      _mm512_lzcnt_epi32(cd));
+                                      _mm512_lzcnt_epi32(cd));
       lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));
-      
+
       while(todo_mask) {
-	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
-	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
-							  todo_bcast);
-	SIMD_double am_perm;
-	am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
-					     now_mask, lid, v1);
-	v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
-	todo_mask = _mm512_kxor(todo_mask, now_mask);
+        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
+                                                          todo_bcast);
+        SIMD_double am_perm;
+        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
+                                             now_mask, lid, v1);
+        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
+        todo_mask = _mm512_kxor(todo_mask, now_mask);
       }
     }
   }
 
   // i indices should be positive
   inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
-				       SIMD_float &v1, SIMD_float &v2,
-				       SIMD_float &v3) {
+                                       SIMD_float &v1, SIMD_float &v2,
+                                       SIMD_float &v3) {
     SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
     SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
     SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
     if (todo_mask) {
       SIMD_int lz  = _mm512_lzcnt_epi32(cd);
       SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
-				      _mm512_lzcnt_epi32(cd));
-      
+                                      _mm512_lzcnt_epi32(cd));
+
       while(todo_mask) {
-	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
-	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
-							  todo_bcast);
-	SIMD_float am_perm;
-	am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
-					     now_mask, lid, v1);
-	v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
-	am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
-					     now_mask, lid, v2);
-	v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm);
-	am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
-					     now_mask, lid, v3);
-	v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm);
-	todo_mask = _mm512_kxor(todo_mask, now_mask);
+        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
+                                                          todo_bcast);
+        SIMD_float am_perm;
+        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
+                                             now_mask, lid, v1);
+        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
+        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
+                                             now_mask, lid, v2);
+        v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm);
+        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
+                                             now_mask, lid, v3);
+        v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm);
+        todo_mask = _mm512_kxor(todo_mask, now_mask);
       }
     }
   }
 
   // i indices should be positive
   inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
-				       SIMD_double &v1, SIMD_double &v2,
-				       SIMD_double &v3) {
+                                       SIMD_double &v1, SIMD_double &v2,
+                                       SIMD_double &v3) {
     SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
     SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
     SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
     if (todo_mask) {
       SIMD_int lz  = _mm512_lzcnt_epi32(cd);
       SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
-				      _mm512_lzcnt_epi32(cd));
+                                      _mm512_lzcnt_epi32(cd));
       lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));
-      
+
       while(todo_mask) {
-	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
-	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
-							  todo_bcast);
-	SIMD_double am_perm;
-	am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
-					     now_mask, lid, v1);
-	v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
-	am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
-					     now_mask, lid, v2);
-	v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm);
-	am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
-					     now_mask, lid, v3);
-	v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm);
-	todo_mask = _mm512_kxor(todo_mask, now_mask);
+        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
+                                                          todo_bcast);
+        SIMD_double am_perm;
+        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
+                                             now_mask, lid, v1);
+        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
+        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
+                                             now_mask, lid, v2);
+        v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm);
+        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
+                                             now_mask, lid, v3);
+        v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm);
+        todo_mask = _mm512_kxor(todo_mask, now_mask);
       }
     }
   }
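
The four `SIMD_conflict_pi_reduce*` routines above are what makes masked scatters safe when several lanes target the same atom: `_mm512_conflict_epi32` flags, per lane, all earlier lanes with an equal index, and the `while` loop walks each duplicate chain forward (via `lzcnt`, the nearest earlier duplicate, and a `permutexvar` add) until the last lane of every chain holds the group total, so a subsequent scatter (last write wins) stores correct sums. A scalar reference model of the net effect:

    // After the reduction, lane k holds the sum of all v[j] with j <= k
    // and idx[j] == idx[k]; the last lane of each duplicate group thus
    // carries the full group sum.
    void conflict_reduce_model(const bool *m, const int *idx, double *v,
                               int lanes) {
      double out[16];
      for (int k = 0; k < lanes; k++) {
        out[k] = v[k];
        if (!m[k]) continue;
        for (int j = 0; j < k; j++)
          if (m[j] && idx[j] == idx[k]) out[k] += v[j];
      }
      for (int k = 0; k < lanes; k++) v[k] = out[k];
    }
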
@@ -744,7 +744,7 @@ namespace ip_simd {
   inline SIMD_int operator&(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_and_epi32(one,two);
   }
-  
+
   inline SIMD_int operator>>(const SIMD_int &one, const SIMD_int &two) {
     return _mm512_srlv_epi32(one,two);
   }
@@ -752,21 +752,21 @@ namespace ip_simd {
   inline SIMD_int operator<<(const SIMD_int &one, const unsigned two) {
     return _mm512_slli_epi32(one,two);
   }
-  
+
   // -------- I/O operations
 
   inline void SIMD_print(const __m512i &vec) {
-    for (int i = 0; i < 16; i++) 
+    for (int i = 0; i < 16; i++)
       printf("%d ",(*((int*)&(vec) + (i))));
   }
 
   inline void SIMD_print(const __m512 &vec) {
-    for (int i = 0; i < 16; i++) 
+    for (int i = 0; i < 16; i++)
       printf("%f ",(*((float*)&(vec) + (i))));
   }
 
   inline void SIMD_print(const __m512d &vec) {
-    for (int i = 0; i < 8; i++) 
+    for (int i = 0; i < 8; i++)
       printf("%f ",(*((double*)&(vec) + (i))));
   }
 
@@ -801,280 +801,280 @@ namespace ip_simd {
   // ---------- LAMMPS operations
   #ifndef SW_GATHER_TEST
   inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
-			       const SIMD_int &i, SIMD_float &x, SIMD_float &y,
-			       SIMD_float &z) {
-    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, 
-				 _MM_SCALE_1);
+                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
+                               SIMD_float &z) {
+    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
+                                 _MM_SCALE_1);
     y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
-				 _MM_SCALE_1);
+                                 _MM_SCALE_1);
     z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
-				 _MM_SCALE_1);
+                                 _MM_SCALE_1);
   }
 
   inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
-			       const SIMD_int &i, SIMD_float &x, SIMD_float &y,
-			       SIMD_float &z, SIMD_int &type) {
-    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, 
-				 _MM_SCALE_1);
+                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
+                               SIMD_float &z, SIMD_int &type) {
+    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
+                                 _MM_SCALE_1);
     y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
-				 _MM_SCALE_1);
+                                 _MM_SCALE_1);
     z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
-				 _MM_SCALE_1);
+                                 _MM_SCALE_1);
     type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
-				       _MM_SCALE_1);
+                                       _MM_SCALE_1);
   }
   #endif
 
   inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
-			       const SIMD_int &i, SIMD_double &x, 
-			       SIMD_double &y, SIMD_double &z) {
-    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, 
-				   _MM_SCALE_2);
+                               const SIMD_int &i, SIMD_double &x,
+                               SIMD_double &y, SIMD_double &z) {
+    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
+                                   _MM_SCALE_2);
     y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
-				   _MM_SCALE_2);
+                                   _MM_SCALE_2);
     z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
-				   _MM_SCALE_2);
+                                   _MM_SCALE_2);
   }
 
   inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
-			       const SIMD_int &i, SIMD_double &x, 
-			       SIMD_double &y, SIMD_double &z, SIMD_int &type) {
-    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, 
-				   _MM_SCALE_2);
+                               const SIMD_int &i, SIMD_double &x,
+                               SIMD_double &y, SIMD_double &z, SIMD_int &type) {
+    x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom,
+                                   _MM_SCALE_2);
     y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1,
-				   _MM_SCALE_2);
+                                   _MM_SCALE_2);
     z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2,
-				   _MM_SCALE_2);
+                                   _MM_SCALE_2);
     type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
-				       _MM_SCALE_2);
+                                       _MM_SCALE_2);
   }
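
The `SIMD_atom_gather` overloads pull array-of-structures atom data (`{x, y, z, w/type}`) into separate vectors with one gather per component; the caller passes pre-scaled offsets so the same index vector serves all four fields. A scalar model (the `AtomF` layout is illustrative):

    struct AtomF { float x, y, z; int type; };
    void atom_gather_model(const bool *m, const AtomF *atom, const int *i,
                           float *x, float *y, float *z, int *type) {
      for (int k = 0; k < 16; k++) {
        if (!m[k]) continue;             // inactive lanes left untouched
        x[k] = atom[i[k]].x;  y[k] = atom[i[k]].y;
        z[k] = atom[i[k]].z;  type[k] = atom[i[k]].type;
      }
    }
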
 
-  inline SIMD_float SIMD_ev_add(const SIMD_float &one, 
-				const SIMD_float &two) {
+  inline SIMD_float SIMD_ev_add(const SIMD_float &one,
+                                const SIMD_float &two) {
     return _mm512_add_ps(one,two);
   }
 
-  inline SIMD_double SIMD_ev_add(const SIMD_double &one, 
-				 const SIMD_double &two) {
+  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
+                                 const SIMD_double &two) {
     return _mm512_add_pd(one,two);
   }
 
-  inline SIMD_double SIMD_ev_add(const SIMD_double &one, 
-				 const SIMD_float &two) {
+  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
+                                 const SIMD_float &two) {
     SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two));
     SIMD_double ans = _mm512_add_pd(one,twod);
     twod = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			     _mm512_shuffle_f32x4(two,two,238)));
+                             _mm512_shuffle_f32x4(two,two,238)));
     return _mm512_add_pd(ans,twod);
   }
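
The mixed-precision `SIMD_ev_add` folds sixteen float partial energies into eight double accumulators: `cvtps_pd` widens the low half, and `shuffle_f32x4(two,two,238)` (imm `238 = 0b11101110`, lane pattern 2,3,2,3) brings the upper eight floats down for a second widening add. Its net per-lane effect, as a scalar model:

    // acc[k] += (double)e[k] + (double)e[k+8]   for k = 0..7
    void ev_add_model(double *acc, const float *e) {
      for (int k = 0; k < 8; k++)
        acc[k] += (double)e[k] + (double)e[k + 8];
    }
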
 
-  inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, 
-			       const SIMD_int &joffset, SIMD_float &eng) {
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force,
+                               const SIMD_int &joffset, SIMD_float &eng) {
     SIMD_float jeng;
     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
-    jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, 
-				    force, _MM_SCALE_1);
+    jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset,
+                                    force, _MM_SCALE_1);
     jeng = jeng + eng;
     _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1);
   }
 
-  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, 
-			       const SIMD_int &joffset, SIMD_double &eng) {
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
+                               const SIMD_int &joffset, SIMD_double &eng) {
     SIMD_double jeng;
     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force, _MM_SCALE_2);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force, _MM_SCALE_2);
     jeng = jeng + eng;
     _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
   }
 
-  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, 
-			       const SIMD_int &joffset, SIMD_float &eng) {
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
+                               const SIMD_int &joffset, SIMD_float &eng) {
     SIMD_double engd, jeng;
     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
     SIMD_conflict_pi_reduce1(rmask, joffset, engd);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force, _MM_SCALE_2);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force, _MM_SCALE_2);
     jeng = jeng + engd;
     _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
 
     SIMD_mask rmask2 = rmask >> 8;
     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			     _mm512_shuffle_f32x4(eng,eng,238)));
+                             _mm512_shuffle_f32x4(eng,eng,238)));
     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
     SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
-				      force, _MM_SCALE_2);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+                                      force, _MM_SCALE_2);
     jeng = jeng + engd;
     _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
   }
 
-  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, 
-				  const SIMD_int &joffset1, SIMD_float &eng) {
+  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
+                                  const SIMD_int &joffset1, SIMD_float &eng) {
   }
 
-  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, 
-				  const SIMD_int &joffset1, SIMD_double &eng) {
+  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force,
+                                  const SIMD_int &joffset1, SIMD_double &eng) {
     SIMD_mask rmask = mask >> 8;
     SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238);
 
     SIMD_double jeng;
     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
-    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force, _MM_SCALE_2);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force, _MM_SCALE_2);
     jeng = jeng + eng;
     _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
   }
 
   inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
-			       const SIMD_int &i, SIMD_float &fx,
-			       SIMD_float &fy, SIMD_float &fz) {
+                               const SIMD_int &i, SIMD_float &fx,
+                               SIMD_float &fy, SIMD_float &fz) {
     SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
     SIMD_float jfrc;
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
+                                    _MM_SCALE_1);
     jfrc = jfrc + fx;
     _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
+                                    _MM_SCALE_1);
     jfrc = jfrc + fy;
     _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
-				    _MM_SCALE_1);
+                                    _MM_SCALE_1);
     jfrc = jfrc + fz;
     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
   }
 
   inline void SIMD_safe_jforce(const SIMD_mask &m, double *force,
-			       const SIMD_int &i, SIMD_double &fx,
-			       SIMD_double &fy, SIMD_double &fz) {
+                               const SIMD_int &i, SIMD_double &fx,
+                               SIMD_double &fy, SIMD_double &fz) {
     SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
+                                      _MM_SCALE_2);
     jfrc = jfrc + fx;
     _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
+                                      _MM_SCALE_2);
     jfrc = jfrc + fy;
     _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
     jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
-				      _MM_SCALE_2);
+                                      _MM_SCALE_2);
     jfrc = jfrc + fz;
     _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
   }
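
`SIMD_safe_jforce` is the conflict-safe force update: it first merges duplicate `j` targets with `SIMD_conflict_pi_reduce3`, then does a plain gather / add / scatter per component. Once duplicates are combined, the read-modify-write is race-free within the vector; a scalar model:

    struct Force3 { double x, y, z; };
    // assumes duplicate j's were already combined (see the reduce model)
    void safe_jforce_model(const bool *m, Force3 *f, const int *j,
                           const double *fx, const double *fy,
                           const double *fz, int lanes) {
      for (int k = 0; k < lanes; k++)
        if (m[k]) {
          f[j[k]].x += fx[k];
          f[j[k]].y += fy[k];
          f[j[k]].z += fz[k];
        }
    }
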
 
-  inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, 
-			       const SIMD_int &joffset, SIMD_float &amx,
-			       SIMD_float &amy, SIMD_float &amz) {
+  inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
+                               const SIMD_int &joffset, SIMD_float &amx,
+                               SIMD_float &amy, SIMD_float &amz) {
     SIMD_double amxd, amyd, amzd;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
     SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force, _MM_SCALE_2);
     jfrc = jfrc + amxd;
     _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force + 1, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force + 1, _MM_SCALE_2);
     jfrc = jfrc + amyd;
     _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
-				      force + 2, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset,
+                                      force + 2, _MM_SCALE_2);
     jfrc = jfrc + amzd;
     _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
 
     SIMD_mask rmask2 = rmask >> 8;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(amx,amx,238)));
+                                _mm512_shuffle_f32x4(amx,amx,238)));
     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                 _mm512_shuffle_f32x4(amy,amy,238)));
     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                 _mm512_shuffle_f32x4(amz,amz,238)));
     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
     SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
-				      force, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+                                      force, _MM_SCALE_2);
     jfrc = jfrc + amxd;
     _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
-				      force + 1, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+                                      force + 1, _MM_SCALE_2);
     jfrc = jfrc + amyd;
     _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
-				      force + 2, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2,
+                                      force + 2, _MM_SCALE_2);
     jfrc = jfrc + amzd;
     _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
   }
 
   inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
-				 const SIMD_int &i, const SIMD_float &fx,
-				 const SIMD_float &fy, const SIMD_float &fz) {
+                                 const SIMD_int &i, const SIMD_float &fx,
+                                 const SIMD_float &fy, const SIMD_float &fz) {
     SIMD_float jfrc;
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
+                                    _MM_SCALE_1);
     jfrc = jfrc - fx;
     _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
+                                    _MM_SCALE_1);
     jfrc = jfrc - fy;
     _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
-				    _MM_SCALE_1);
+                                    _MM_SCALE_1);
     jfrc = jfrc - fz;
     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
   }
 
   template <class ft>
   inline void SIMD_scalar_update(const int jj, const int* ejnum, ft *force,
-				 const int* i, const double *fx,
-				 const double *fy, const double *fz,
-				 const double *fx2, const double *fy2,
-				 const double *fz2) {
+                                 const int* i, const double *fx,
+                                 const double *fy, const double *fz,
+                                 const double *fx2, const double *fy2,
+                                 const double *fz2) {
     #pragma novector
     for (int k=0; k<8; k++) {
       if (jj < ejnum[k]) {
-	const int j = i[k];
-	force[j].x -= fx[k];
-	force[j].y -= fy[k];
-	force[j].z -= fz[k];
+        const int j = i[k];
+        force[j].x -= fx[k];
+        force[j].y -= fy[k];
+        force[j].z -= fz[k];
       }
     }
-    
+
     #pragma novector
     for (int k=8; k<16; k++) {
       if (jj < ejnum[k]) {
-	const int j = i[k];
-	force[j].x -= fx2[k-8];
-	force[j].y -= fy2[k-8];
-	force[j].z -= fz2[k-8];
+        const int j = i[k];
+        force[j].x -= fx2[k-8];
+        force[j].y -= fy2[k-8];
+        force[j].z -= fz2[k-8];
       }
     }
   }
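
In `SIMD_scalar_update` the `#pragma novector` is the point, not an accident: the `i[k]` indices may repeat, so the neighbor-force subtraction must stay scalar and execute in order rather than be auto-vectorized back into the racy pattern the conflict machinery exists to avoid. A usage sketch with hypothetical inputs, kept as comments:

    // struct ForceD { double x, y, z, w; };
    // ForceD f[N]; int i[16], ejnum[16];
    // double fx[8], fy[8], fz[8], fx2[8], fy2[8], fz2[8];
    // lanes with jj < ejnum[k] subtract their partial force from f[i[k]]:
    // SIMD_scalar_update(jj, ejnum, f, i, fx, fy, fz, fx2, fy2, fz2);
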
 
   inline void SIMD_jforce_update(const SIMD_mask &m, double *force,
-				 const SIMD_int &i, const SIMD_double &fx,
-				 const SIMD_double &fy, const SIMD_double &fz)   {
+                                 const SIMD_int &i, const SIMD_double &fx,
+                                 const SIMD_double &fy, const SIMD_double &fz) {
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
+                                      _MM_SCALE_2);
     jfrc = jfrc - fx;
     _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
+                                      _MM_SCALE_2);
     jfrc = jfrc - fy;
     _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
     jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
-				      _MM_SCALE_2);
+                                      _MM_SCALE_2);
     jfrc = jfrc - fz;
     _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
   }
 
-  inline void SIMD_jforce_update(const SIMD_mask &rmask, 
+  inline void SIMD_jforce_update(const SIMD_mask &rmask,
          double *force, const SIMD_int &joffset, SIMD_float &amx,
-				 SIMD_float &amy, SIMD_float &amz) {
+                                 SIMD_float &amy, SIMD_float &amz) {
     SIMD_double amxd, amyd, amzd;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
@@ -1084,7 +1084,7 @@ namespace ip_simd {
 
     SIMD_mask rmask2 = rmask >> 8;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(amx,amx,238)));
+                                _mm512_shuffle_f32x4(amx,amx,238)));
     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                 _mm512_shuffle_f32x4(amy,amy,238)));
     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@@ -1095,8 +1095,8 @@ namespace ip_simd {
   }
 
   inline void SIMD_cache3(float *pr, const int offset,
-			  const SIMD_float &fx,
-			  const SIMD_float &fy, const SIMD_float &fz) {
+                          const SIMD_float &fx,
+                          const SIMD_float &fy, const SIMD_float &fz) {
     float *p = pr;
     SIMD_float t;
     t = SIMD_load(p);
@@ -1113,8 +1113,8 @@ namespace ip_simd {
   }
 
   inline void SIMD_cache3(double *pr, const int offset,
-			  const SIMD_double &fx,
-			  const SIMD_double &fy, const SIMD_double &fz) {
+                          const SIMD_double &fx,
+                          const SIMD_double &fy, const SIMD_double &fz) {
     double *p = pr;
     SIMD_double t;
     t = SIMD_load(p);
@@ -1131,8 +1131,8 @@ namespace ip_simd {
   }
 
   inline void SIMD_cache3(double *pr, const int foffset,
-			  const SIMD_float &fx,
-			  const SIMD_float &fy, const SIMD_float &fz) {
+                          const SIMD_float &fx,
+                          const SIMD_float &fy, const SIMD_float &fz) {
     const int offset = foffset >> 1;
     double *p = pr;
     SIMD_double t, fd;
@@ -1142,7 +1142,7 @@ namespace ip_simd {
     t = t + fd;
     SIMD_store(p,t);
     fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fx,fx,238)));
+                                _mm512_shuffle_f32x4(fx,fx,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + fd;
@@ -1154,7 +1154,7 @@ namespace ip_simd {
     t = t + fd;
     SIMD_store(p,t);
     fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fy,fy,238)));
+                                _mm512_shuffle_f32x4(fy,fy,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + fd;
@@ -1166,7 +1166,7 @@ namespace ip_simd {
     t = t + fd;
     SIMD_store(p,t);
     fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fz,fz,238)));
+                                _mm512_shuffle_f32x4(fz,fz,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + fd;
@@ -1174,15 +1174,15 @@ namespace ip_simd {
   }
 
   inline void SIMD_cache3(float *pr, const int offset,
-			  const SIMD_float &fx, const SIMD_float &fy,
-			  const SIMD_float &fz, const SIMD_float &fx2,
-			  const SIMD_float &fy2, const SIMD_float &fz2) {
+                          const SIMD_float &fx, const SIMD_float &fy,
+                          const SIMD_float &fz, const SIMD_float &fx2,
+                          const SIMD_float &fy2, const SIMD_float &fz2) {
   }
 
   inline void SIMD_cache3(double *pr, const int foffset,
-			  const SIMD_double &fx, const SIMD_double &fy,
-			  const SIMD_double &fz, const SIMD_double &fx2,
-			  const SIMD_double &fy2, const SIMD_double &fz2) {
+                          const SIMD_double &fx, const SIMD_double &fy,
+                          const SIMD_double &fz, const SIMD_double &fx2,
+                          const SIMD_double &fy2, const SIMD_double &fz2) {
     const int offset = foffset >> 1;
     double *p = pr;
     SIMD_double t;
@@ -1214,14 +1214,14 @@ namespace ip_simd {
     SIMD_store(p,t);
   }
 
-  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, 
-			       const SIMD_float &fjy, const SIMD_float &fjz,
-			       SIMD_float &fxtmp, SIMD_float &fytmp,
-			       SIMD_float &fztmp, SIMD_float &fjxtmp,
-			       SIMD_float &fjytmp, SIMD_float &fjztmp,
-			       SIMD_float &fxtmp2, SIMD_float &fytmp2,
-			       SIMD_float &fztmp2, SIMD_float &fjxtmp2,
-			       SIMD_float &fjytmp2, SIMD_float &fjztmp2) {
+  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
+                               const SIMD_float &fjy, const SIMD_float &fjz,
+                               SIMD_float &fxtmp, SIMD_float &fytmp,
+                               SIMD_float &fztmp, SIMD_float &fjxtmp,
+                               SIMD_float &fjytmp, SIMD_float &fjztmp,
+                               SIMD_float &fxtmp2, SIMD_float &fytmp2,
+                               SIMD_float &fztmp2, SIMD_float &fjxtmp2,
+                               SIMD_float &fjytmp2, SIMD_float &fjztmp2) {
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
     fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
@@ -1230,14 +1230,14 @@ namespace ip_simd {
     fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
   }
 
-  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx, 
-			       const SIMD_double &fjy, const SIMD_double &fjz,
-			       SIMD_double &fxtmp, SIMD_double &fytmp,
-			       SIMD_double &fztmp, SIMD_double &fjxtmp,
-			       SIMD_double &fjytmp, SIMD_double &fjztmp,
-			       SIMD_double &fxtmp2, SIMD_double &fytmp2,
-			       SIMD_double &fztmp2, SIMD_double &fjxtmp2,
-			       SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
+  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx,
+                               const SIMD_double &fjy, const SIMD_double &fjz,
+                               SIMD_double &fxtmp, SIMD_double &fytmp,
+                               SIMD_double &fztmp, SIMD_double &fjxtmp,
+                               SIMD_double &fjytmp, SIMD_double &fjztmp,
+                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
+                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
+                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
     fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
@@ -1246,20 +1246,20 @@ namespace ip_simd {
     fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
   }
 
-  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, 
-			       const SIMD_float &fjy, const SIMD_float &fjz,
-			       SIMD_double &fxtmp, SIMD_double &fytmp,
-			       SIMD_double &fztmp, SIMD_double &fjxtmp,
-			       SIMD_double &fjytmp, SIMD_double &fjztmp,
-			       SIMD_double &fxtmp2, SIMD_double &fytmp2,
-			       SIMD_double &fztmp2, SIMD_double &fjxtmp2,
-			       SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
+  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
+                               const SIMD_float &fjy, const SIMD_float &fjz,
+                               SIMD_double &fxtmp, SIMD_double &fytmp,
+                               SIMD_double &fztmp, SIMD_double &fjxtmp,
+                               SIMD_double &fjytmp, SIMD_double &fjztmp,
+                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
+                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
+                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
     SIMD_mask kmask2 = kmask >> 8;
     SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx));
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjx,fjx,238)));
+                                _mm512_shuffle_f32x4(fjx,fjx,238)));
     fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd);
     fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd);
 
@@ -1267,7 +1267,7 @@ namespace ip_simd {
     fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd);
     fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjy,fjy,238)));
+                                _mm512_shuffle_f32x4(fjy,fjy,238)));
     fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd);
     fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd);
 
@@ -1275,22 +1275,22 @@ namespace ip_simd {
     fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd);
     fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjz,fjz,238)));
+                                _mm512_shuffle_f32x4(fjz,fjz,238)));
     fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd);
     fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
   }
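
Throughout these mixed-precision helpers the same widening idiom recurs: the low eight float lanes convert directly via _mm512_cvtps_pd, while _mm512_shuffle_f32x4 with immediate 238 (0xEE) moves the upper two 128-bit lanes down so the same cast exposes lanes 8-15; the matching mask for the upper half is obtained as kmask >> 8. A minimal self-contained sketch, assuming AVX-512F:

#include <immintrin.h>

// Split one 16-lane float vector into two 8-lane double vectors.
static inline void split_ps_to_pd(const __m512 v, __m512d &lo, __m512d &hi) {
  lo = _mm512_cvtps_pd(_mm512_castps512_ps256(v));            // lanes 0..7
  hi = _mm512_cvtps_pd(_mm512_castps512_ps256(
           _mm512_shuffle_f32x4(v, v, 238)));                 // lanes 8..15
}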
 
-  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, 
-			      const SIMD_float &fjy, const SIMD_float &fjz,
-			      const SIMD_float &fkx, const SIMD_float &fky,
-			      const SIMD_float &fkz,
-			      SIMD_float &fxtmp, SIMD_float &fytmp,
-			      SIMD_float &fztmp, SIMD_float &fjxtmp,
-			      SIMD_float &fjytmp, SIMD_float &fjztmp,
-			      SIMD_float &fxtmp2, SIMD_float &fytmp2,
-			      SIMD_float &fztmp2, SIMD_float &fjxtmp2,
-			      SIMD_float &fjytmp2, SIMD_float &fjztmp2,
-			      float *pr, const int offset) {
+  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
+                              const SIMD_float &fjy, const SIMD_float &fjz,
+                              const SIMD_float &fkx, const SIMD_float &fky,
+                              const SIMD_float &fkz,
+                              SIMD_float &fxtmp, SIMD_float &fytmp,
+                              SIMD_float &fztmp, SIMD_float &fjxtmp,
+                              SIMD_float &fjytmp, SIMD_float &fjztmp,
+                              SIMD_float &fxtmp2, SIMD_float &fytmp2,
+                              SIMD_float &fztmp2, SIMD_float &fjxtmp2,
+                              SIMD_float &fjytmp2, SIMD_float &fjztmp2,
+                              float *pr, const int offset) {
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
     fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
@@ -1312,17 +1312,17 @@ namespace ip_simd {
     SIMD_store(p, t);
   }
 
-  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx, 
-			      const SIMD_double &fjy, const SIMD_double &fjz,
-			      const SIMD_double &fkx, const SIMD_double &fky,
-			      const SIMD_double &fkz,
-			      SIMD_double &fxtmp, SIMD_double &fytmp,
-			      SIMD_double &fztmp, SIMD_double &fjxtmp,
-			      SIMD_double &fjytmp, SIMD_double &fjztmp,
-			      SIMD_double &fxtmp2, SIMD_double &fytmp2,
-			      SIMD_double &fztmp2, SIMD_double &fjxtmp2,
-			      SIMD_double &fjytmp2, SIMD_double &fjztmp2,
-			      double *pr, const int offset) {
+  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx,
+                              const SIMD_double &fjy, const SIMD_double &fjz,
+                              const SIMD_double &fkx, const SIMD_double &fky,
+                              const SIMD_double &fkz,
+                              SIMD_double &fxtmp, SIMD_double &fytmp,
+                              SIMD_double &fztmp, SIMD_double &fjxtmp,
+                              SIMD_double &fjytmp, SIMD_double &fjztmp,
+                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
+                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
+                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
+                              double *pr, const int offset) {
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
     fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
@@ -1344,17 +1344,17 @@ namespace ip_simd {
     SIMD_store(p, t);
   }
 
-  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, 
-			      const SIMD_float &fjy, const SIMD_float &fjz,
-			      const SIMD_float &fkx, const SIMD_float &fky,
-			      const SIMD_float &fkz,
-			      SIMD_double &fxtmp, SIMD_double &fytmp,
-			      SIMD_double &fztmp, SIMD_double &fjxtmp,
-			      SIMD_double &fjytmp, SIMD_double &fjztmp,
-			      SIMD_double &fxtmp2, SIMD_double &fytmp2,
-			      SIMD_double &fztmp2, SIMD_double &fjxtmp2,
-			      SIMD_double &fjytmp2, SIMD_double &fjztmp2,
-			      double *pr, const int foffset) {
+  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
+                              const SIMD_float &fjy, const SIMD_float &fjz,
+                              const SIMD_float &fkx, const SIMD_float &fky,
+                              const SIMD_float &fkz,
+                              SIMD_double &fxtmp, SIMD_double &fytmp,
+                              SIMD_double &fztmp, SIMD_double &fjxtmp,
+                              SIMD_double &fjytmp, SIMD_double &fjztmp,
+                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
+                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
+                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
+                              double *pr, const int foffset) {
     SIMD_mask kmask2 = kmask >> 8;
     const int offset = foffset >> 1;
     double *p = pr;
@@ -1368,9 +1368,9 @@ namespace ip_simd {
     fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd - delfdk);
     fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjx,fjx,238)));
+                                _mm512_shuffle_f32x4(fjx,fjx,238)));
     delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fkx,fkx,238)));
+                                _mm512_shuffle_f32x4(fkx,fkx,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + delfdk;
@@ -1387,9 +1387,9 @@ namespace ip_simd {
     fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd - delfdk);
     fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjy,fjy,238)));
+                                _mm512_shuffle_f32x4(fjy,fjy,238)));
     delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fky,fky,238)));
+                                _mm512_shuffle_f32x4(fky,fky,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + delfdk;
@@ -1406,9 +1406,9 @@ namespace ip_simd {
     fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd - delfdk);
     fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
     delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fjz,fjz,238)));
+                                _mm512_shuffle_f32x4(fjz,fjz,238)));
     delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(fkz,fkz,238)));
+                                _mm512_shuffle_f32x4(fkz,fkz,238)));
     p = p + offset;
     t = SIMD_load(p);
     t = t + delfdk;
@@ -1417,11 +1417,11 @@ namespace ip_simd {
     fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
   }
 
-  inline void SIMD_acc_energy3(const SIMD_mask &hmask, 
-			       const SIMD_float &evdwl, const int eatom,
-			       SIMD_float &sevdwl, SIMD_float &fwtmp,
-			       SIMD_float &fjtmp, SIMD_float &fwtmp2,
-			       SIMD_float &fjtmp2) {
+  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
+                               const SIMD_float &evdwl, const int eatom,
+                               SIMD_float &sevdwl, SIMD_float &fwtmp,
+                               SIMD_float &fjtmp, SIMD_float &fwtmp2,
+                               SIMD_float &fjtmp2) {
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl);
     if (eatom) {
       const SIMD_float hevdwl = evdwl * (float)0.5;
@@ -1430,11 +1430,11 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_acc_energy3(const SIMD_mask &hmask, 
-			       const SIMD_double &evdwl, const int eatom,
-			       SIMD_double &sevdwl, SIMD_double &fwtmp,
-			       SIMD_double &fjtmp, SIMD_double &fwtmp2,
-			       SIMD_double &fjtmp2) {
+  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
+                               const SIMD_double &evdwl, const int eatom,
+                               SIMD_double &sevdwl, SIMD_double &fwtmp,
+                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
+                               SIMD_double &fjtmp2) {
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl);
     if (eatom) {
       const SIMD_double hevdwl = evdwl * (double)0.5;
@@ -1443,11 +1443,11 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_acc_energy3(const SIMD_mask &hmask, 
-			       const SIMD_float &evdwl, const int eatom,
-			       SIMD_double &sevdwl, SIMD_double &fwtmp,
-			       SIMD_double &fjtmp, SIMD_double &fwtmp2,
-			       SIMD_double &fjtmp2) {
+  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
+                               const SIMD_float &evdwl, const int eatom,
+                               SIMD_double &sevdwl, SIMD_double &fwtmp,
+                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
+                               SIMD_double &fjtmp2) {
     SIMD_double evdwld;
     evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(evdwl));
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwld);
@@ -1458,7 +1458,7 @@ namespace ip_simd {
     }
     SIMD_mask hmask2 = hmask >> 8;
     evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(evdwl,evdwl,238)));
+                                _mm512_shuffle_f32x4(evdwl,evdwl,238)));
     sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, evdwld);
     if (eatom) {
       const SIMD_double hevdwl = evdwld * (double)0.5;
@@ -1467,48 +1467,48 @@ namespace ip_simd {
     }
   }
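
The SIMD_acc_energy3 family keeps a masked running sum of evdwl and, when per-atom energies are requested (eatom), credits half of each pair energy to both atoms' accumulators. A hedged single-precision sketch of the same pattern written directly with AVX-512F masked adds:

#include <immintrin.h>

static inline void acc_energy_sketch(__mmask16 hmask, __m512 evdwl,
                                     int eatom, __m512 &sevdwl,
                                     __m512 &fwtmp, __m512 &fjtmp) {
  sevdwl = _mm512_mask_add_ps(sevdwl, hmask, sevdwl, evdwl);
  if (eatom) {
    // Each atom of the pair receives half of the pair energy.
    const __m512 hevdwl = _mm512_mul_ps(evdwl, _mm512_set1_ps(0.5f));
    fwtmp = _mm512_mask_add_ps(fwtmp, hmask, fwtmp, hevdwl);
    fjtmp = _mm512_mask_add_ps(fjtmp, hmask, fjtmp, hevdwl);
  }
}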
 
-  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, 
-			     const int eatom, SIMD_float &sevdwl, 
-			     SIMD_float &fwtmp, SIMD_float &fjtmp, 
-			     SIMD_float &fwtmp2, SIMD_float &fjtmp2, 
-			     const SIMD_int &k, float *force) {
+  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
+                             const int eatom, SIMD_float &sevdwl,
+                             SIMD_float &fwtmp, SIMD_float &fjtmp,
+                             SIMD_float &fwtmp2, SIMD_float &fjtmp2,
+                             const SIMD_int &k, float *force) {
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
     if (eatom) {
       SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333);
       fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
       fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
       SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
-      SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, 
-						 k, force + 3, _MM_SCALE_1);
+      SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask,
+                                                 k, force + 3, _MM_SCALE_1);
       keng = keng + hevdwl;
       _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1);
     }
   }
 
   inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad,
-			     const int eatom, SIMD_double &sevdwl, 
-			     SIMD_double &fwtmp, SIMD_double &fjtmp, 
-			     SIMD_double &fwtmp2, SIMD_double &fjtmp2, 
-			     const SIMD_int &k, double *force) {
+                             const int eatom, SIMD_double &sevdwl,
+                             SIMD_double &fwtmp, SIMD_double &fjtmp,
+                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
+                             const SIMD_int &k, double *force) {
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
     if (eatom) {
       SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333);
       fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
       fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
       SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
-      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
-						    hmask, k, force + 3, 
-						    _MM_SCALE_2);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
+                                                    hmask, k, force + 3,
+                                                    _MM_SCALE_2);
       keng = keng + hevdwl;
       _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
     }
   }
 
-  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, 
-			     const int eatom, SIMD_double &sevdwl, 
-			     SIMD_double &fwtmp, SIMD_double &fjtmp, 
-			     SIMD_double &fwtmp2, SIMD_double &fjtmp2, 
-			     const SIMD_int &k, double *force) {
+  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
+                             const int eatom, SIMD_double &sevdwl,
+                             SIMD_double &fwtmp, SIMD_double &fjtmp,
+                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
+                             const SIMD_int &k, double *force) {
     SIMD_double facradd;
     facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad));
     sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd);
@@ -1517,15 +1517,15 @@ namespace ip_simd {
       fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
       fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
       SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
-      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
-						    hmask, k, force + 3, 
-						    _MM_SCALE_2);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
+                                                    hmask, k, force + 3,
+                                                    _MM_SCALE_2);
       keng = keng + hevdwl;
       _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
     }
     SIMD_mask hmask2 = hmask >> 8;
     facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(facrad,facrad,238)));
+                                _mm512_shuffle_f32x4(facrad,facrad,238)));
     sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, facradd);
     if (eatom) {
       SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
@@ -1533,20 +1533,20 @@ namespace ip_simd {
       fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
       SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
       SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
-      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
-						    hmask2, k2, force + 3, 
-						    _MM_SCALE_2);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(),
+                                                    hmask2, k2, force + 3,
+                                                    _MM_SCALE_2);
       keng = keng + hevdwl;
       _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
     }
   }
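
In SIMD_acc_three the per-atom energy apparently sits alongside the force components, so force + 3 addresses the energy slot of each interleaved record; after SIMD_conflict_pi_reduce1 folds duplicate k indices, the update is a masked gather, add, scatter. A minimal sketch of that step, assuming byte offsets in i as the calls above pass with _MM_SCALE_1:

#include <immintrin.h>

static inline void scatter_add_energy(float *force, __mmask16 m,
                                      __m512i i, __m512 hevdwl) {
  __m512 keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i,
                                         force + 3, 1);
  keng = _mm512_add_ps(keng, hevdwl);
  _mm512_mask_i32scatter_ps(force + 3, m, i, keng, 1);
}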
 
-  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, 
-				 const float ev_pre,
-  	         const SIMD_float &fpair, const SIMD_float &delx,
-		 const SIMD_float &dely,  const SIMD_float &delz,
-		 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
-		 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
+  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
+                                 const float ev_pre,
+                 const SIMD_float &fpair, const SIMD_float &delx,
+                 const SIMD_float &dely,  const SIMD_float &delz,
+                 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
+                 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
     if (vflag == 1) {
       const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
       sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
@@ -1558,12 +1558,12 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, 
-				 const double ev_pre,
-  	         const SIMD_double &fpair, const SIMD_double &delx,
-		 const SIMD_double &dely,  const SIMD_double &delz,
-		 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
-		 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
+  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
+                                 const double ev_pre,
+                 const SIMD_double &fpair, const SIMD_double &delx,
+                 const SIMD_double &dely,  const SIMD_double &delz,
+                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
+                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
     if (vflag == 1) {
       const SIMD_double prefpair = SIMD_set(ev_pre) * fpair;
       sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
@@ -1575,12 +1575,12 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, 
-				 const float ev_pre,
-  	         const SIMD_float &fpair, const SIMD_float &delx,
-		 const SIMD_float &dely,  const SIMD_float &delz,
-		 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
-		 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
+  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
+                                 const float ev_pre,
+                 const SIMD_float &fpair, const SIMD_float &delx,
+                 const SIMD_float &dely,  const SIMD_float &delz,
+                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
+                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
     if (vflag == 1) {
       const SIMD_mask m2 = m >> 8;
       const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
@@ -1588,55 +1588,55 @@ namespace ip_simd {
       SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv0 = SIMD_add(sv0, m, sv0, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv0 = SIMD_add(sv0, m2, sv0, dpaird);
 
       dpair = dely * dely * prefpair;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv1 = SIMD_add(sv1, m, sv1, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv1 = SIMD_add(sv1, m2, sv1, dpaird);
 
       dpair = delz * delz * prefpair;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv2 = SIMD_add(sv2, m, sv2, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv2 = SIMD_add(sv2, m2, sv2, dpaird);
 
       dpair = delx * dely * prefpair;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv3 = SIMD_add(sv3, m, sv3, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv3 = SIMD_add(sv3, m2, sv3, dpaird);
 
       dpair = delx * delz * prefpair;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv4 = SIMD_add(sv4, m, sv4, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv4 = SIMD_add(sv4, m2, sv4, dpaird);
 
       dpair = dely * delz * prefpair;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv5 = SIMD_add(sv5, m, sv5, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv5 = SIMD_add(sv5, m2, sv5, dpaird);
     }
   }
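
All SIMD_ev_tally_nbor variants accumulate the six independent components of the symmetric per-pair virial tensor. For reference, the scalar recipe they vectorize:

static inline void v_tally_scalar(double ev_pre, double fpair,
                                  double delx, double dely, double delz,
                                  double sv[6]) {
  const double pre = ev_pre * fpair;
  sv[0] += delx * delx * pre;  // xx
  sv[1] += dely * dely * pre;  // yy
  sv[2] += delz * delz * pre;  // zz
  sv[3] += delx * dely * pre;  // xy
  sv[4] += delx * delz * pre;  // xz
  sv[5] += dely * delz * pre;  // yz
}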
 
-  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, 
-		 const SIMD_float &fj0, const SIMD_float &fj1,  
-   	         const SIMD_float &fj2, const SIMD_float &fk0,
-   	         const SIMD_float &fk1, const SIMD_float &fk2,
+  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
+                 const SIMD_float &fj0, const SIMD_float &fj1,
+                 const SIMD_float &fj2, const SIMD_float &fk0,
+                 const SIMD_float &fk1, const SIMD_float &fk2,
                  const SIMD_float &delx, const SIMD_float &dely,
                  const SIMD_float &delz, const SIMD_float &delr2x,
                  const SIMD_float &delr2y, const SIMD_float &delr2z,
-		 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
-		 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
+                 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
+                 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
     if (vflag == 1) {
       sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
       sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
@@ -1647,15 +1647,15 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, 
-		 const SIMD_double &fj0, const SIMD_double &fj1,  
-		 const SIMD_double &fj2, const SIMD_double &fk0,  
-		 const SIMD_double &fk1, const SIMD_double &fk2,  
-		 const SIMD_double &delx, const SIMD_double &dely,
-		 const SIMD_double &delz, const SIMD_double &delr2x,
-		 const SIMD_double &delr2y, const SIMD_double &delr2z,
-		 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
-		 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
+  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
+                 const SIMD_double &fj0, const SIMD_double &fj1,
+                 const SIMD_double &fj2, const SIMD_double &fk0,
+                 const SIMD_double &fk1, const SIMD_double &fk2,
+                 const SIMD_double &delx, const SIMD_double &dely,
+                 const SIMD_double &delz, const SIMD_double &delr2x,
+                 const SIMD_double &delr2y, const SIMD_double &delr2z,
+                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
+                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
     if (vflag == 1) {
       sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
       sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
@@ -1666,62 +1666,62 @@ namespace ip_simd {
     }
   }
 
-  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, 
-		 const SIMD_float &fj0, const SIMD_float &fj1,  
-   	         const SIMD_float &fj2, const SIMD_float &fk0,
-   	         const SIMD_float &fk1, const SIMD_float &fk2,
+  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
+                 const SIMD_float &fj0, const SIMD_float &fj1,
+                 const SIMD_float &fj2, const SIMD_float &fk0,
+                 const SIMD_float &fk1, const SIMD_float &fk2,
                  const SIMD_float &delx, const SIMD_float &dely,
                  const SIMD_float &delz, const SIMD_float &delr2x,
                  const SIMD_float &delr2y, const SIMD_float &delr2z,
-		 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
-		 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
+                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
+                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
     if (vflag == 1) {
       const SIMD_mask m2 = m >> 8;
       SIMD_float dpair = delx * fj0 + delr2x * fk0;
       SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv0 = SIMD_add(sv0, m, sv0, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(dpair,dpair,238)));
+                                _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv0 = SIMD_add(sv0, m2, sv0, dpaird);
 
       dpair = dely * fj1 + delr2y * fk1;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv1 = SIMD_add(sv1, m, sv1, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			      _mm512_shuffle_f32x4(dpair,dpair,238)));
+                              _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv1 = SIMD_add(sv1, m2, sv1, dpaird);
 
       dpair = delz * fj2 + delr2z * fk2;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv2 = SIMD_add(sv2, m, sv2, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			      _mm512_shuffle_f32x4(dpair,dpair,238)));
+                              _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv2 = SIMD_add(sv2, m2, sv2, dpaird);
 
       dpair = delx * fj1 + delr2x * fk1;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv3 = SIMD_add(sv3, m, sv3, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			      _mm512_shuffle_f32x4(dpair,dpair,238)));
+                              _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv3 = SIMD_add(sv3, m2, sv3, dpaird);
 
       dpair = delx * fj2 + delr2x * fk2;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv4 = SIMD_add(sv4, m, sv4, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			      _mm512_shuffle_f32x4(dpair,dpair,238)));
+                              _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv4 = SIMD_add(sv4, m2, sv4, dpaird);
 
       dpair = dely * fj2 + delr2y * fk2;
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
       sv5 = SIMD_add(sv5, m, sv5, dpaird);
       dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
-			      _mm512_shuffle_f32x4(dpair,dpair,238)));
+                              _mm512_shuffle_f32x4(dpair,dpair,238)));
       sv5 = SIMD_add(sv5, m2, sv5, dpaird);
     }
   }
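
The 3-body variants combine the force on atom j along (delx,dely,delz) with the force on atom k along (delr2x,delr2y,delr2z). A scalar analogue of the accumulation performed under vflag == 1:

static inline void v_tally3_scalar(const double fj[3], const double fk[3],
                                   const double del[3], const double delr2[3],
                                   double sv[6]) {
  sv[0] += del[0] * fj[0] + delr2[0] * fk[0];  // xx
  sv[1] += del[1] * fj[1] + delr2[1] * fk[1];  // yy
  sv[2] += del[2] * fj[2] + delr2[2] * fk[2];  // zz
  sv[3] += del[0] * fj[1] + delr2[0] * fk[1];  // xy
  sv[4] += del[0] * fj[2] + delr2[0] * fk[2];  // xz
  sv[5] += del[1] * fj[2] + delr2[1] * fk[2];  // yz
}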
 
-  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, 
+  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
          float *force, const SIMD_int &joffset, SIMD_float &amx,
          SIMD_float &amy, SIMD_float &amz, SIMD_float &fxtmp,
          SIMD_float &fytmp, SIMD_float &fztmp, SIMD_float &fxtmp2,
@@ -1733,10 +1733,10 @@ namespace ip_simd {
     SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
   }
 
-  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, 
+  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
          double *force, const SIMD_int &joffset, SIMD_double &amx,
          SIMD_double &amy, SIMD_double &amz, SIMD_double &fxtmp,
-         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, 
+         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
          SIMD_double &fytmp2, SIMD_double &fztmp2) {
     fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx);
     fytmp = SIMD_add(fytmp, rmask, fytmp, amy);
@@ -1745,10 +1745,10 @@ namespace ip_simd {
     SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
   }
 
-  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, 
+  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
          double *force, const SIMD_int &joffset, SIMD_float &amx,
          SIMD_float &amy, SIMD_float &amz, SIMD_double &fxtmp,
-         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, 
+         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
          SIMD_double &fytmp2, SIMD_double &fztmp2) {
     SIMD_double amxd, amyd, amzd;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
@@ -1762,7 +1762,7 @@ namespace ip_simd {
 
     SIMD_mask rmask2 = rmask >> 8;
     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
-				_mm512_shuffle_f32x4(amx,amx,238)));
+                                _mm512_shuffle_f32x4(amx,amx,238)));
     fxtmp2 = SIMD_add(fxtmp2, rmask2, fxtmp2, amxd);
     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                 _mm512_shuffle_f32x4(amy,amy,238)));
@@ -1776,57 +1776,57 @@ namespace ip_simd {
   }
 
   inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
-				 const SIMD_int &i, const SIMD_float &fx,
-				 const SIMD_float &fy, const SIMD_float &fz,
-				 const int EFLAG, const int eatom,
-				 const SIMD_float &fwtmp) {
+                                 const SIMD_int &i, const SIMD_float &fx,
+                                 const SIMD_float &fy, const SIMD_float &fz,
+                                 const int EFLAG, const int eatom,
+                                 const SIMD_float &fwtmp) {
     SIMD_float jfrc;
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
+                                    _MM_SCALE_1);
     jfrc = jfrc + fx;
     _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
-    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, 
-				    _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
+                                    _MM_SCALE_1);
     jfrc = jfrc + fy;
     _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
-				    _MM_SCALE_1);
+                                    _MM_SCALE_1);
     jfrc = jfrc + fz;
     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
     if (EFLAG) {
       if (eatom) {
-	jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
-					_MM_SCALE_1);
-	jfrc = jfrc + fwtmp;
-	_mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1);
+        jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
+                                        _MM_SCALE_1);
+        jfrc = jfrc + fwtmp;
+        _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1);
       }
     }
   }
 
   inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
-				 const SIMD_int &i, const SIMD_double &fx,
-				 const SIMD_double &fy, const SIMD_double &fz,
-				 const int EFLAG, const int eatom,
-				 const SIMD_double &fwtmp) {
+                                 const SIMD_int &i, const SIMD_double &fx,
+                                 const SIMD_double &fy, const SIMD_double &fz,
+                                 const int EFLAG, const int eatom,
+                                 const SIMD_double &fwtmp) {
     SIMD_double jfrc;
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force,
+                                      _MM_SCALE_2);
     jfrc = jfrc + fx;
     _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
-    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, 
-				      _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1,
+                                      _MM_SCALE_2);
     jfrc = jfrc + fy;
     _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
     jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
-				      _MM_SCALE_2);
+                                      _MM_SCALE_2);
     jfrc = jfrc + fz;
     _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
     if (EFLAG) {
       if (eatom) {
-	jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, 
-					  force + 3, _MM_SCALE_2);
-	jfrc = jfrc + fwtmp;
-	_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
+        jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i,
+                                          force + 3, _MM_SCALE_2);
+        jfrc = jfrc + fwtmp;
+        _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
       }
     }
   }
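
SIMD_iforce_update is a chain of masked read-modify-write gathers and scatters, one per force component. A hedged restatement of a single component update in portable AVX-512F form (the patch itself uses the icc-specific _mm512_mask_i32logather_pd/_mm512_mask_i32loscatter_pd variants, which take the low eight 32-bit indices of a 512-bit vector and pre-scaled offsets; the standard intrinsics take a __m256i index):

#include <immintrin.h>

static inline void force_update_x(__mmask8 m, double *force,
                                  __m256i idx, __m512d fx) {
  __m512d jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m, idx,
                                          force, 8);  // element indices
  jfrc = _mm512_add_pd(jfrc, fx);
  _mm512_mask_i32scatter_pd(force, m, idx, jfrc, 8);
}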
@@ -1834,8 +1834,8 @@ namespace ip_simd {
   #ifdef SW_GATHER_TEST
   template <class atom_t>
   inline void SIMD_atom_gather(const SIMD_mask &m, const atom_t *atom,
-			       const SIMD_int &i, SIMD_float &x, SIMD_float &y,
-			       SIMD_float &z, SIMD_int &type) {
+                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
+                               SIMD_float &z, SIMD_int &type) {
     int jv_scalar[16] __attribute__((aligned(64)));
     int jm_scalar[16] __attribute__((aligned(64)));
     _mm512_store_epi32(jv_scalar, i);
@@ -1846,65 +1846,65 @@ namespace ip_simd {
     pl1 = _mm512_loadu_ps((float *)((char *)atom + js));
     js = jv_scalar[1];
     pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
-							js)), 1);
+                                                        js)), 1);
     js = jv_scalar[2];
     pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
-							js)), 2);
+                                                        js)), 2);
     js = jv_scalar[3];
     pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
-							js)), 3);
-    
+                                                        js)), 3);
+
     js = jv_scalar[4];
     pl2 = _mm512_loadu_ps((float *)((char *)atom + js));
     js = jv_scalar[5];
     pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
-							js)), 1);
+                                                        js)), 1);
     js = jv_scalar[6];
     pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
-							js)), 2);
+                                                        js)), 2);
     js = jv_scalar[7];
     pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
-							js)), 3);
-    
+                                                        js)), 3);
+
     js = jv_scalar[8];
     pl3 = _mm512_loadu_ps((float *)((char *)atom + js));
     js = jv_scalar[9];
     pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
-							js)), 1);
+                                                        js)), 1);
     js = jv_scalar[10];
     pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
-							js)), 2);
+                                                        js)), 2);
     js = jv_scalar[11];
     pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
-							js)), 3);
-    
+                                                        js)), 3);
+
     js = jv_scalar[12];
     pl4 = _mm512_loadu_ps((float *)((char *)atom + js));
     js = jv_scalar[13];
     pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
-							js)), 1);
+                                                        js)), 1);
     js = jv_scalar[14];
     pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
-							js)), 2);
+                                                        js)), 2);
     js = jv_scalar[15];
     pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
-							js)), 3);
-    
+                                                        js)), 3);
+
     SIMD_int c0 = _mm512_setr_epi32(0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c,
-				    0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d);
+                                    0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d);
     SIMD_int c1 = _mm512_setr_epi32(0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d,
-				    0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c);
+                                    0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c);
     SIMD_int c2 = _mm512_setr_epi32(0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e,
-				    0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f);
+                                    0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f);
     SIMD_int c3 = _mm512_setr_epi32(0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f,
-				    0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e);
+                                    0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e);
     SIMD_mask k_1 = _mm512_int2mask(65280);
 
     SIMD_float sl1 = _mm512_permutex2var_ps(pl3, c0, pl4);
     SIMD_float sl2 = _mm512_permutex2var_ps(pl1, c1, pl2);
     SIMD_float sl3 = _mm512_permutex2var_ps(pl3, c2, pl4);
     SIMD_float sl4 = _mm512_permutex2var_ps(pl1, c3, pl2);
-    
+
     x = _mm512_shuffle_f32x4(sl2, sl1, 78);
     z = _mm512_shuffle_f32x4(sl4, sl3, 78);
     y = _mm512_mask_blend_ps(k_1, sl2, sl1);
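
The SW_GATHER_TEST path above replaces sixteen scattered element gathers with four-atom 512-bit loads followed by an in-register transpose built from _mm512_permutex2var_ps. A small illustration of that two-source permute, which selects each output lane from the 32-entry pool {a[0..15], b[16..31]}:

#include <immintrin.h>

static inline __m512 interleave_even_lanes(__m512 a, __m512 b) {
  const __m512i idx = _mm512_setr_epi32(0, 16, 2, 18, 4, 20, 6, 22,
                                        8, 24, 10, 26, 12, 28, 14, 30);
  return _mm512_permutex2var_ps(a, idx, b);  // a0,b0,a2,b2,...
}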
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
index 403b74d8fe..547fadb6e9 100644
--- a/src/USER-INTEL/math_extra_intel.h
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -18,110 +18,110 @@
 #ifndef LMP_MATH_EXTRA_INTEL_H
 #define LMP_MATH_EXTRA_INTEL_H
 
-#define ME_quat_to_mat_trans(quat, mat)		\
-{						\
-  flt_t quat_w = quat.w;			\
-  flt_t quat_i = quat.i;			\
-  flt_t quat_j = quat.j;			\
-  flt_t quat_k = quat.k;			\
-  flt_t w2 = quat_w * quat_w;			\
-  flt_t i2 = quat_i * quat_i;			\
-  flt_t j2 = quat_j * quat_j;			\
-  flt_t k2 = quat_k * quat_k;			\
-  flt_t twoij = (flt_t)2.0 * quat_i * quat_j;	\
-  flt_t twoik = (flt_t)2.0 * quat_i * quat_k;	\
-  flt_t twojk = (flt_t)2.0 * quat_j * quat_k;	\
-  flt_t twoiw = (flt_t)2.0 * quat_i * quat_w;	\
-  flt_t twojw = (flt_t)2.0 * quat_j * quat_w;	\
-  flt_t twokw = (flt_t)2.0 * quat_k * quat_w;	\
-  						\
-  mat##_0 = w2 + i2 - j2 - k2;			\
-  mat##_3 = twoij - twokw;			\
-  mat##_6 = twojw + twoik;			\
-  						\
-  mat##_1 = twoij + twokw;			\
-  mat##_4 = w2 - i2 + j2 - k2;			\
-  mat##_7 = twojk - twoiw;			\
-  						\
-  mat##_2 = twoik - twojw;			\
-  mat##_5 = twojk + twoiw;			\
-  mat##_8 = w2 - i2 - j2 + k2;			\
+#define ME_quat_to_mat_trans(quat, mat)         \
+{                                               \
+  flt_t quat_w = quat.w;                        \
+  flt_t quat_i = quat.i;                        \
+  flt_t quat_j = quat.j;                        \
+  flt_t quat_k = quat.k;                        \
+  flt_t w2 = quat_w * quat_w;                   \
+  flt_t i2 = quat_i * quat_i;                   \
+  flt_t j2 = quat_j * quat_j;                   \
+  flt_t k2 = quat_k * quat_k;                   \
+  flt_t twoij = (flt_t)2.0 * quat_i * quat_j;   \
+  flt_t twoik = (flt_t)2.0 * quat_i * quat_k;   \
+  flt_t twojk = (flt_t)2.0 * quat_j * quat_k;   \
+  flt_t twoiw = (flt_t)2.0 * quat_i * quat_w;   \
+  flt_t twojw = (flt_t)2.0 * quat_j * quat_w;   \
+  flt_t twokw = (flt_t)2.0 * quat_k * quat_w;   \
+                                                \
+  mat##_0 = w2 + i2 - j2 - k2;                  \
+  mat##_3 = twoij - twokw;                      \
+  mat##_6 = twojw + twoik;                      \
+                                                \
+  mat##_1 = twoij + twokw;                      \
+  mat##_4 = w2 - i2 + j2 - k2;                  \
+  mat##_7 = twojk - twoiw;                      \
+                                                \
+  mat##_2 = twoik - twojw;                      \
+  mat##_5 = twojk + twoiw;                      \
+  mat##_8 = w2 - i2 - j2 + k2;                  \
 }
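
ME_quat_to_mat_trans expands a unit quaternion (w,i,j,k) into the transpose of the usual rotation matrix, kept as nine suffixed scalars so the vectorizer can hold them in registers. The same math as a plain function, for reference:

inline void quat_to_mat_trans(double w, double i, double j, double k,
                              double m[9]) {
  const double w2 = w * w, i2 = i * i, j2 = j * j, k2 = k * k;
  const double twoij = 2.0 * i * j, twoik = 2.0 * i * k, twojk = 2.0 * j * k;
  const double twoiw = 2.0 * i * w, twojw = 2.0 * j * w, twokw = 2.0 * k * w;
  m[0] = w2 + i2 - j2 - k2;  m[3] = twoij - twokw;      m[6] = twojw + twoik;
  m[1] = twoij + twokw;      m[4] = w2 - i2 + j2 - k2;  m[7] = twojk - twoiw;
  m[2] = twoik - twojw;      m[5] = twojk + twoiw;      m[8] = w2 - i2 - j2 + k2;
}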
 
 /* ----------------------------------------------------------------------
    diagonal matrix times a full matrix
 ------------------------------------------------------------------------- */
 
-#define ME_diag_times3(d, m, ans)			\
-  {							\
-  ans##_0 = d[0] * m##_0;				\
-  ans##_1 = d[0] * m##_1;				\
-  ans##_2 = d[0] * m##_2;				\
-  ans##_3 = d[1] * m##_3;				\
-  ans##_4 = d[1] * m##_4;				\
-  ans##_5 = d[1] * m##_5;				\
-  ans##_6 = d[2] * m##_6;				\
-  ans##_7 = d[2] * m##_7;				\
-  ans##_8 = d[2] * m##_8;				\
+#define ME_diag_times3(d, m, ans)                       \
+  {                                                     \
+  ans##_0 = d[0] * m##_0;                               \
+  ans##_1 = d[0] * m##_1;                               \
+  ans##_2 = d[0] * m##_2;                               \
+  ans##_3 = d[1] * m##_3;                               \
+  ans##_4 = d[1] * m##_4;                               \
+  ans##_5 = d[1] * m##_5;                               \
+  ans##_6 = d[2] * m##_6;                               \
+  ans##_7 = d[2] * m##_7;                               \
+  ans##_8 = d[2] * m##_8;                               \
 }
 
-#define ME_diag_times3a(d, m, ans)			\
-  {							\
-  ans##_0 = d##_0 * m##_0;				\
-  ans##_1 = d##_0 * m##_1;				\
-  ans##_2 = d##_0 * m##_2;				\
-  ans##_3 = d##_1 * m##_3;				\
-  ans##_4 = d##_1 * m##_4;				\
-  ans##_5 = d##_1 * m##_5;				\
-  ans##_6 = d##_2 * m##_6;				\
-  ans##_7 = d##_2 * m##_7;				\
-  ans##_8 = d##_2 * m##_8;				\
+#define ME_diag_times3a(d, m, ans)                      \
+  {                                                     \
+  ans##_0 = d##_0 * m##_0;                              \
+  ans##_1 = d##_0 * m##_1;                              \
+  ans##_2 = d##_0 * m##_2;                              \
+  ans##_3 = d##_1 * m##_3;                              \
+  ans##_4 = d##_1 * m##_4;                              \
+  ans##_5 = d##_1 * m##_5;                              \
+  ans##_6 = d##_2 * m##_6;                              \
+  ans##_7 = d##_2 * m##_7;                              \
+  ans##_8 = d##_2 * m##_8;                              \
 }
 
 /* ----------------------------------------------------------------------
    multiply the transpose of mat1 times mat2
 ------------------------------------------------------------------------- */
 
-#define ME_transpose_times3(m1, m2, ans)                	\
-{								\
-  ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6;	\
-  ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7;	\
-  ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8;	\
-  ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6;	\
-  ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7;	\
-  ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8;	\
-  ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6;	\
-  ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7;	\
-  ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8;	\
+#define ME_transpose_times3(m1, m2, ans)                        \
+{                                                               \
+  ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6;      \
+  ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7;      \
+  ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8;      \
+  ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6;      \
+  ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7;      \
+  ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8;      \
+  ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6;      \
+  ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7;      \
+  ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8;      \
 }
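
ME_transpose_times3 forms ans = m1^T * m2 with all three matrices flattened into suffixed scalars _0.._8 in row-major order. The equivalent loop form:

inline void transpose_times3(const double m1[9], const double m2[9],
                             double ans[9]) {
  for (int r = 0; r < 3; ++r)
    for (int c = 0; c < 3; ++c)
      ans[3*r + c] = m1[r]     * m2[c]
                   + m1[3 + r] * m2[3 + c]
                   + m1[6 + r] * m2[6 + c];
}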
 
 /* ----------------------------------------------------------------------
    normalize a vector, return in ans
 ------------------------------------------------------------------------- */
 
-#define ME_normalize3(v0, v1, v2, ans)	        	\
-{							\
-  flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2);	\
-  ans##_0 = v0 * scale;					\
-  ans##_1 = v1 * scale;					\
-  ans##_2 = v2 * scale;					\
+#define ME_normalize3(v0, v1, v2, ans)                  \
+{                                                       \
+  flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2);   \
+  ans##_0 = v0 * scale;                                 \
+  ans##_1 = v1 * scale;                                 \
+  ans##_2 = v2 * scale;                                 \
 }
 
 /* ----------------------------------------------------------------------
    add two matrices
 ------------------------------------------------------------------------- */
 
-#define ME_plus3(m1, m2, ans)			\
-{						\
-  ans##_0 = m1##_0 + m2##_0;			\
-  ans##_1 = m1##_1 + m2##_1;			\
-  ans##_2 = m1##_2 + m2##_2;			\
-  ans##_3 = m1##_3 + m2##_3;			\
-  ans##_4 = m1##_4 + m2##_4;			\
-  ans##_5 = m1##_5 + m2##_5;			\
-  ans##_6 = m1##_6 + m2##_6;			\
-  ans##_7 = m1##_7 + m2##_7;			\
-  ans##_8 = m1##_8 + m2##_8;			\
+#define ME_plus3(m1, m2, ans)                   \
+{                                               \
+  ans##_0 = m1##_0 + m2##_0;                    \
+  ans##_1 = m1##_1 + m2##_1;                    \
+  ans##_2 = m1##_2 + m2##_2;                    \
+  ans##_3 = m1##_3 + m2##_3;                    \
+  ans##_4 = m1##_4 + m2##_4;                    \
+  ans##_5 = m1##_5 + m2##_5;                    \
+  ans##_6 = m1##_6 + m2##_6;                    \
+  ans##_7 = m1##_7 + m2##_7;                    \
+  ans##_8 = m1##_8 + m2##_8;                    \
 }
 
 /* ----------------------------------------------------------------------
@@ -135,7 +135,7 @@
    determinant of a matrix
 ------------------------------------------------------------------------- */
 
-#define ME_det3(m)				    \
+#define ME_det3(m)                                  \
   ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \
     m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \
     m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 )
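
ME_det3 is the standard cofactor expansion; grouping its six terms makes that explicit:

inline double det3(const double m[9]) {
  return m[0] * (m[4] * m[8] - m[5] * m[7])
       - m[3] * (m[1] * m[8] - m[2] * m[7])
       + m[6] * (m[1] * m[5] - m[2] * m[4]);
}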
@@ -144,8 +144,8 @@
    row vector times matrix
 ------------------------------------------------------------------------- */
 
-#define ME_vecmat(v, m, ans)				    \
-{							    \
+#define ME_vecmat(v, m, ans)                                \
+{                                                           \
   ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6;  \
   ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7;  \
   ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8;  \
@@ -155,214 +155,214 @@
    cross product of 2 vectors
 ------------------------------------------------------------------------- */
 
-#define ME_cross3(v1, v2, ans)			\
-{						\
-  ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1;	\
-  ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2;	\
-  ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0;	\
+#define ME_cross3(v1, v2, ans)                  \
+{                                               \
+  ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1;  \
+  ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2;  \
+  ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0;  \
 }
 
 /* ----------------------------------------------------------------------
    cross product of 2 vectors
 ------------------------------------------------------------------------- */
 
-#define ME_mv0_cross3(m1, v2, ans)		\
-{						\
-  ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1;	\
-  ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2;	\
-  ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0;	\
+#define ME_mv0_cross3(m1, v2, ans)              \
+{                                               \
+  ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1;  \
+  ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2;  \
+  ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0;  \
 }
 
-#define ME_mv1_cross3(m1, v2, ans)		\
-{						\
-  ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1;	\
-  ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2;	\
-  ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0;	\
+#define ME_mv1_cross3(m1, v2, ans)              \
+{                                               \
+  ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1;  \
+  ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2;  \
+  ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0;  \
 }
 
-#define ME_mv2_cross3(m1, v2, ans)		\
-{						\
-  ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1;	\
-  ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2;	\
-  ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0;	\
+#define ME_mv2_cross3(m1, v2, ans)              \
+{                                               \
+  ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1;  \
+  ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2;  \
+  ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0;  \
 }
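
The three ME_mv*_cross3 variants cross row 0, 1, or 2 of a flattened 3x3 matrix with a vector. A single indexed scalar form covering all three:

inline void row_cross3(const double m[9], int row, const double v[3],
                       double ans[3]) {
  const double a0 = m[3*row], a1 = m[3*row + 1], a2 = m[3*row + 2];
  ans[0] = a1 * v[2] - a2 * v[1];
  ans[1] = a2 * v[0] - a0 * v[2];
  ans[2] = a0 * v[1] - a1 * v[0];
}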
 
 
 #define ME_compute_eta_torque(m1, m2, s1, ans)                              \
-{								            \
-  flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7-		    \
-    m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5-				    \
-    m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8;				    \
-  den = (flt_t)1.0 / den;						    \
-									    \
+{                                                                           \
+  flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7-                    \
+    m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5-                              \
+    m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8;                              \
+  den = (flt_t)1.0 / den;                                                   \
+                                                                            \
   ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0-   \
- 		   m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+    \
-		   m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8-	 	    \
-		   m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+		    \
-		   m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den;	    \
-									    \
-  ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+		    \
-		   (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5-    \
-		   (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2-    \
-		   m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+		    \
-		   m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den;	    \
-  									    \
+                   m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+    \
+                   m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8-               \
+                   m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+               \
+                   m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den;          \
+                                                                            \
+  ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+              \
+                   (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5-    \
+                   (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2-    \
+                   m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+               \
+                   m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den;          \
+                                                                            \
   ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4-              \
-		   m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1-		    \
-		   m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+		    \
-		   (flt_t)2.0*m1##_4*m1##_0*m2##_2-                         \
-		   (flt_t)2.0*m1##_3*m2##_2*m1##_1+			    \
-		   m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den;	    \
-									    \
+                   m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1-               \
+                   m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+               \
+                   (flt_t)2.0*m1##_4*m1##_0*m2##_2-                         \
+                   (flt_t)2.0*m1##_3*m2##_2*m1##_1+                         \
+                   m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den;          \
+                                                                            \
   ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+  \
-		   m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+    \
-		   m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8-		    \
-		   m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- 		    \
-		   m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den;	    \
-									    \
-  ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+		    \
-		   (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5-    \
-		   (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+    \
-		   m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2-		    \
-		   m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den;	    \
-									    \
-  ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4-		    \
-		   m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+		    \
-		   (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+    \
-		   m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4-		    \
-		   (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)*   \
-    den;							   	    \
-									    \
-  ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+  	    \
-		   (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+    \
-		   m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5-    \
-		   m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7-		    \
-		   m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den;	    \
-									    \
-  ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7-		    \
-		    (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+   \
-		    (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8-   \
-		    m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+		    \
-		    m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den;	    \
-									    \
-  ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4-		    \
-		   m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7-		    \
-		   m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+		    \
-		   (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+    \
+                   m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+    \
+                   m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8-               \
+                   m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4-               \
+                   m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den;          \
+                                                                            \
+  ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+              \
+                   (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5-    \
+                   (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+    \
+                   m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2-               \
+                   m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den;          \
+                                                                            \
+  ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4-              \
+                   m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+               \
+                   (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+    \
+                   m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4-               \
+                   (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)*   \
+    den;                                                                    \
+                                                                            \
+  ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+             \
+                   (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+    \
+                   m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5-    \
+                   m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7-               \
+                   m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den;          \
+                                                                            \
+  ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7-             \
+                    (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+   \
+                    (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8-   \
+                    m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+              \
+                    m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den;         \
+                                                                            \
+  ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4-              \
+                   m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7-               \
+                   m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+               \
+                   (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+    \
                     m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)*  \
-    den;								    \
+    den;                                                                    \
 }
 
-#define ME_vcopy4(dst,src)			\
-  dst##_0 = src##_0;				\
-  dst##_1 = src##_1;				\
-  dst##_2 = src##_2;				\
+#define ME_vcopy4(dst,src)                      \
+  dst##_0 = src##_0;                            \
+  dst##_1 = src##_1;                            \
+  dst##_2 = src##_2;                            \
   dst##_3 = src##_3;
 
-#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error)	\
-{							\
-  flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5;	\
-  flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t;	\
-							\
-  aug_3 = v_0;						\
-  aug_0 = m1##_0;					\
-  aug_1 = m1##_1;					\
-  aug_2 = m1##_2;					\
-  aug_7 = v_1;						\
-  aug_4 = m1##_3;					\
-  aug_5 = m1##_4;					\
-  aug_6 = m1##_5;					\
-  aug_11 = v_2;						\
-  aug_8 = m1##_6;					\
-  aug_9 = m1##_7;					\
-  aug_10 = m1##_8;					\
-							\
-  if (fabs(aug_4) > fabs(aug_0)) {			\
-    flt_t swapt;					\
-    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
-    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
-    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
-    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
-  }							\
-  if (fabs(aug_8) > fabs(aug_0)) {			\
-    flt_t swapt;					\
-    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
+#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error)     \
+{                                                       \
+  flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5;       \
+  flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t;  \
+                                                        \
+  aug_3 = v_0;                                          \
+  aug_0 = m1##_0;                                       \
+  aug_1 = m1##_1;                                       \
+  aug_2 = m1##_2;                                       \
+  aug_7 = v_1;                                          \
+  aug_4 = m1##_3;                                       \
+  aug_5 = m1##_4;                                       \
+  aug_6 = m1##_5;                                       \
+  aug_11 = v_2;                                         \
+  aug_8 = m1##_6;                                       \
+  aug_9 = m1##_7;                                       \
+  aug_10 = m1##_8;                                      \
+                                                        \
+  if (fabs(aug_4) > fabs(aug_0)) {                      \
+    flt_t swapt;                                        \
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;        \
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;        \
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;        \
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;        \
+  }                                                     \
+  if (fabs(aug_8) > fabs(aug_0)) {                      \
+    flt_t swapt;                                        \
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;        \
     swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;        \
     swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;      \
     swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;      \
-  }							\
-							\
-  if (aug_0 != (flt_t)0.0) {				\
-  } else if (aug_4 != (flt_t)0.0) {			\
-    flt_t swapt;					\
-    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
-    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
-    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
-    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
-  } else if (aug_8 != (flt_t)0.0) {			\
-    flt_t swapt;					\
-    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
-    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;	\
-    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;	\
-    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;	\
-  } else						\
-    error = 1;						\
-							\
-  t = aug_4 / aug_0;					\
-  aug_5 -= t * aug_1;					\
-  aug_6 -= t * aug_2;					\
-  aug_7 -= t * aug_3;					\
-  t = aug_8 / aug_0;					\
-  aug_9 -= t * aug_1;					\
-  aug_10 -= t * aug_2;					\
-  aug_11 -= t * aug_3;					\
-							\
-  if (fabs(aug_9) > fabs(aug_5)) {			\
-    flt_t swapt;					\
-    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;	\
-    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
-    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
-    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
-  }							\
-							\
-  if (aug_5 != (flt_t)0.0) {				\
-  } else if (aug_9 != (flt_t)0.0) {			\
-    flt_t swapt;					\
+  }                                                     \
+                                                        \
+  if (aug_0 != (flt_t)0.0) {                            \
+  } else if (aug_4 != (flt_t)0.0) {                     \
+    flt_t swapt;                                        \
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;        \
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;        \
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;        \
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;        \
+  } else if (aug_8 != (flt_t)0.0) {                     \
+    flt_t swapt;                                        \
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;        \
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;        \
+    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;      \
+    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;      \
+  } else                                                \
+    error = 1;                                          \
+                                                        \
+  t = aug_4 / aug_0;                                    \
+  aug_5 -= t * aug_1;                                   \
+  aug_6 -= t * aug_2;                                   \
+  aug_7 -= t * aug_3;                                   \
+  t = aug_8 / aug_0;                                    \
+  aug_9 -= t * aug_1;                                   \
+  aug_10 -= t * aug_2;                                  \
+  aug_11 -= t * aug_3;                                  \
+                                                        \
+  if (fabs(aug_9) > fabs(aug_5)) {                      \
+    flt_t swapt;                                        \
+    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;        \
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;        \
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;      \
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;      \
+  }                                                     \
+                                                        \
+  if (aug_5 != (flt_t)0.0) {                            \
+  } else if (aug_9 != (flt_t)0.0) {                     \
+    flt_t swapt;                                        \
     swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;        \
-    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
-    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
-    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
-  }							\
-							\
-  t = aug_9 / aug_5;					\
-  aug_10 -= t * aug_6;					\
-  aug_11 -= t * aug_7;					\
-							\
-  if (aug_10 == (flt_t)0.0)				\
-    error = 1;						\
-							\
-  ans##_2 = aug_11/aug_10;				\
-  t = (flt_t)0.0;					\
-  t += aug_6 * ans##_2;					\
-  ans##_1 = (aug_7-t) / aug_5;				\
-  t = (flt_t)0.0;					\
-  t += aug_1 * ans##_1;					\
-  t += aug_2 * ans##_2;					\
-  ans##_0 = (aug_3 - t) / aug_0;			\
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;        \
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;      \
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;      \
+  }                                                     \
+                                                        \
+  t = aug_9 / aug_5;                                    \
+  aug_10 -= t * aug_6;                                  \
+  aug_11 -= t * aug_7;                                  \
+                                                        \
+  if (aug_10 == (flt_t)0.0)                             \
+    error = 1;                                          \
+                                                        \
+  ans##_2 = aug_11/aug_10;                              \
+  t = (flt_t)0.0;                                       \
+  t += aug_6 * ans##_2;                                 \
+  ans##_1 = (aug_7-t) / aug_5;                          \
+  t = (flt_t)0.0;                                       \
+  t += aug_1 * ans##_1;                                 \
+  t += aug_2 * ans##_2;                                 \
+  ans##_0 = (aug_3 - t) / aug_0;                        \
 }
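
The ME_mldivide3 macro above is a fully unrolled Gaussian elimination with
partial pivoting: aug_0..aug_11 are the rows of the augmented 3x4 system
[m1 | v], and singularity is reported through the 'error' argument. As a
reading aid, here is the same algorithm in plain array form (a sketch, not
code from this patch):

#include <cmath>

// Sketch: solve the 3x3 system A*x = b by Gaussian elimination with
// partial pivoting -- the algorithm ME_mldivide3 unrolls above.
static bool mldivide3_sketch(const double A[3][3], const double b[3],
                             double x[3]) {
  double aug[3][4];
  for (int i = 0; i < 3; i++) {
    for (int j = 0; j < 3; j++) aug[i][j] = A[i][j];
    aug[i][3] = b[i];
  }
  for (int c = 0; c < 2; c++) {
    int p = c;                              // largest pivot in column c
    for (int r = c + 1; r < 3; r++)
      if (std::fabs(aug[r][c]) > std::fabs(aug[p][c])) p = r;
    for (int j = 0; j < 4; j++) {           // swap pivot row into place
      double t = aug[c][j]; aug[c][j] = aug[p][j]; aug[p][j] = t;
    }
    if (aug[c][c] == 0.0) return false;     // singular
    for (int r = c + 1; r < 3; r++) {       // eliminate below the pivot
      double t = aug[r][c] / aug[c][c];
      for (int j = c; j < 4; j++) aug[r][j] -= t * aug[c][j];
    }
  }
  if (aug[2][2] == 0.0) return false;       // singular
  x[2] = aug[2][3] / aug[2][2];             // back substitution
  x[1] = (aug[1][3] - aug[1][2] * x[2]) / aug[1][1];
  x[0] = (aug[0][3] - aug[0][1] * x[1] - aug[0][2] * x[2]) / aug[0][0];
  return true;
}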
 
 /* ----------------------------------------------------------------------
    normalize a quaternion
 ------------------------------------------------------------------------- */
 
-#define ME_qnormalize(q)						\
-{									\
-  double norm = 1.0 /							\
-    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);	\
-  q##_w *= norm;							\
-  q##_i *= norm;							\
-  q##_j *= norm;							\
-  q##_k *= norm;							\
+#define ME_qnormalize(q)                                                \
+{                                                                       \
+  double norm = 1.0 /                                                   \
+    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);        \
+  q##_w *= norm;                                                        \
+  q##_i *= norm;                                                        \
+  q##_j *= norm;                                                        \
+  q##_k *= norm;                                                        \
 }
 
 /* ----------------------------------------------------------------------
@@ -373,106 +373,106 @@
      and divide by principal moments
 ------------------------------------------------------------------------- */
 
-#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)	\
-{									\
-  double wbody_0, wbody_1, wbody_2;					\
-  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8;	\
-									\
-  double w2 = quat##_w * quat##_w;					\
-  double i2 = quat##_i * quat##_i;					\
-  double j2 = quat##_j * quat##_j;					\
-  double k2 = quat##_k * quat##_k;					\
-  double twoij = 2.0 * quat##_i * quat##_j;				\
-  double twoik = 2.0 * quat##_i * quat##_k;				\
-  double twojk = 2.0 * quat##_j * quat##_k;				\
-  double twoiw = 2.0 * quat##_i * quat##_w;				\
-  double twojw = 2.0 * quat##_j * quat##_w;				\
-  double twokw = 2.0 * quat##_k * quat##_w;				\
-    									\
-  rot##_0 = w2 + i2 - j2 - k2;					        \
-  rot##_1 = twoij - twokw;						\
-  rot##_2 = twojw + twoik;						\
-	  								\
-  rot##_3 = twoij + twokw;					        \
-  rot##_4 = w2 - i2 + j2 - k2;				                \
-  rot##_5 = twojk - twoiw;					        \
-									\
-  rot##_6 = twoik - twojw;				                \
-  rot##_7 = twojk + twoiw;				                \
-  rot##_8 = w2 - i2 - j2 + k2;			                        \
-									\
+#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)     \
+{                                                                       \
+  double wbody_0, wbody_1, wbody_2;                                     \
+  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \
+                                                                        \
+  double w2 = quat##_w * quat##_w;                                      \
+  double i2 = quat##_i * quat##_i;                                      \
+  double j2 = quat##_j * quat##_j;                                      \
+  double k2 = quat##_k * quat##_k;                                      \
+  double twoij = 2.0 * quat##_i * quat##_j;                             \
+  double twoik = 2.0 * quat##_i * quat##_k;                             \
+  double twojk = 2.0 * quat##_j * quat##_k;                             \
+  double twoiw = 2.0 * quat##_i * quat##_w;                             \
+  double twojw = 2.0 * quat##_j * quat##_w;                             \
+  double twokw = 2.0 * quat##_k * quat##_w;                             \
+                                                                        \
+  rot##_0 = w2 + i2 - j2 - k2;                                          \
+  rot##_1 = twoij - twokw;                                              \
+  rot##_2 = twojw + twoik;                                              \
+                                                                        \
+  rot##_3 = twoij + twokw;                                              \
+  rot##_4 = w2 - i2 + j2 - k2;                                          \
+  rot##_5 = twojk - twoiw;                                              \
+                                                                        \
+  rot##_6 = twoik - twojw;                                              \
+  rot##_7 = twojk + twoiw;                                              \
+  rot##_8 = w2 - i2 - j2 + k2;                                          \
+                                                                        \
   wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2;              \
   wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2;              \
   wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2;              \
-									\
-  wbody_0 *= moments_0;							\
-  wbody_1 *= moments_1;							\
-  wbody_2 *= moments_2;							\
-									\
+                                                                        \
+  wbody_0 *= moments_0;                                                 \
+  wbody_1 *= moments_1;                                                 \
+  wbody_2 *= moments_2;                                                 \
+                                                                        \
   w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2;          \
   w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2;          \
   w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2;          \
 }
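
In words, ME_mq_to_omega builds the rotation matrix R of the quaternion,
maps the space-frame angular momentum into the body frame (wbody = R^T m),
scales it, and rotates back (w = R wbody). Note the macro multiplies by
moments_0..2 although the comment says "divide by principal moments", so
callers presumably pass reciprocal moments. A compact sketch under that
assumption (array form; names hypothetical):

// Sketch of ME_mq_to_omega: w = R * diag(imom) * R^T * m, q = {w,i,j,k},
// where imom[] is assumed to hold RECIPROCAL principal moments of inertia.
static void mq_to_omega_sketch(const double m[3], const double q[4],
                               const double imom[3], double w[3]) {
  const double w2 = q[0]*q[0], i2 = q[1]*q[1], j2 = q[2]*q[2], k2 = q[3]*q[3];
  const double R[3][3] = {
    {w2+i2-j2-k2,               2.0*(q[1]*q[2]-q[3]*q[0]), 2.0*(q[2]*q[0]+q[1]*q[3])},
    {2.0*(q[1]*q[2]+q[3]*q[0]), w2-i2+j2-k2,               2.0*(q[2]*q[3]-q[1]*q[0])},
    {2.0*(q[1]*q[3]-q[2]*q[0]), 2.0*(q[2]*q[3]+q[1]*q[0]), w2-i2-j2+k2}};
  double wb[3];
  for (int i = 0; i < 3; i++)   // body frame: wb = (R^T * m) scaled by 1/I
    wb[i] = (R[0][i]*m[0] + R[1][i]*m[1] + R[2][i]*m[2]) * imom[i];
  for (int i = 0; i < 3; i++)   // back to the space frame: w = R * wb
    w[i] = R[i][0]*wb[0] + R[i][1]*wb[1] + R[i][2]*wb[2];
}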
 
-#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)	\
-{									\
-  angmomin[0] += dtf * torque[0];					\
-  double angmom_0 = angmomin[0];					\
-  angmomin[1] += dtf * torque[1];					\
-  double angmom_1 = angmomin[1];					\
-  angmomin[2] += dtf * torque[2];					\
-  double angmom_2 = angmomin[2];					\
-									\
-  double quat_w = quatin[0];						\
-  double quat_i = quatin[1];						\
-  double quat_j = quatin[2];						\
-  double quat_k = quatin[3];						\
-									\
-  double omega_0, omega_1, omega_2;					\
-  ME_mq_to_omega(angmom,quat,i0,i1,i2,omega);				\
-									\
-  double wq_0, wq_1, wq_2, wq_3;					\
-  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;		\
-  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;		\
-  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;		\
-  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;		\
-									\
-  double qfull_w, qfull_i, qfull_j, qfull_k;				\
-  qfull_w = quat_w + dtq * wq_0;					\
-  qfull_i = quat_i + dtq * wq_1;					\
-  qfull_j = quat_j + dtq * wq_2;					\
-  qfull_k = quat_k + dtq * wq_3;					\
-  ME_qnormalize(qfull);							\
-									\
-  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;				\
-  qhalf_w = quat_w + 0.5*dtq * wq_0;					\
-  qhalf_i = quat_i + 0.5*dtq * wq_1;					\
-  qhalf_j = quat_j + 0.5*dtq * wq_2;					\
-  qhalf_k = quat_k + 0.5*dtq * wq_3;					\
-  ME_qnormalize(qhalf);							\
-  									\
-  ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega);				\
-  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;		\
-  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;		\
-  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;		\
-  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;		\
-									\
-  qhalf_w += 0.5*dtq * wq_0;						\
-  qhalf_i += 0.5*dtq * wq_1;						\
-  qhalf_j += 0.5*dtq * wq_2;						\
-  qhalf_k += 0.5*dtq * wq_3;						\
-  ME_qnormalize(qhalf);							\
-									\
-  quat_w = 2.0*qhalf_w - qfull_w;					\
-  quat_i = 2.0*qhalf_i - qfull_i;					\
-  quat_j = 2.0*qhalf_j - qfull_j;					\
-  quat_k = 2.0*qhalf_k - qfull_k;					\
-  ME_qnormalize(quat);							\
-									\
-  quatin[0] = quat_w;							\
-  quatin[1] = quat_i;							\
-  quatin[2] = quat_j;							\
-  quatin[3] = quat_k;							\
+#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)    \
+{                                                                       \
+  angmomin[0] += dtf * torque[0];                                       \
+  double angmom_0 = angmomin[0];                                        \
+  angmomin[1] += dtf * torque[1];                                       \
+  double angmom_1 = angmomin[1];                                        \
+  angmomin[2] += dtf * torque[2];                                       \
+  double angmom_2 = angmomin[2];                                        \
+                                                                        \
+  double quat_w = quatin[0];                                            \
+  double quat_i = quatin[1];                                            \
+  double quat_j = quatin[2];                                            \
+  double quat_k = quatin[3];                                            \
+                                                                        \
+  double omega_0, omega_1, omega_2;                                     \
+  ME_mq_to_omega(angmom,quat,i0,i1,i2,omega);                           \
+                                                                        \
+  double wq_0, wq_1, wq_2, wq_3;                                        \
+  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;             \
+  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;              \
+  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;              \
+  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;              \
+                                                                        \
+  double qfull_w, qfull_i, qfull_j, qfull_k;                            \
+  qfull_w = quat_w + dtq * wq_0;                                        \
+  qfull_i = quat_i + dtq * wq_1;                                        \
+  qfull_j = quat_j + dtq * wq_2;                                        \
+  qfull_k = quat_k + dtq * wq_3;                                        \
+  ME_qnormalize(qfull);                                                 \
+                                                                        \
+  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;                            \
+  qhalf_w = quat_w + 0.5*dtq * wq_0;                                    \
+  qhalf_i = quat_i + 0.5*dtq * wq_1;                                    \
+  qhalf_j = quat_j + 0.5*dtq * wq_2;                                    \
+  qhalf_k = quat_k + 0.5*dtq * wq_3;                                    \
+  ME_qnormalize(qhalf);                                                 \
+                                                                        \
+  ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega);                          \
+  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;          \
+  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;           \
+  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;           \
+  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;           \
+                                                                        \
+  qhalf_w += 0.5*dtq * wq_0;                                            \
+  qhalf_i += 0.5*dtq * wq_1;                                            \
+  qhalf_j += 0.5*dtq * wq_2;                                            \
+  qhalf_k += 0.5*dtq * wq_3;                                            \
+  ME_qnormalize(qhalf);                                                 \
+                                                                        \
+  quat_w = 2.0*qhalf_w - qfull_w;                                       \
+  quat_i = 2.0*qhalf_i - qfull_i;                                       \
+  quat_j = 2.0*qhalf_j - qfull_j;                                       \
+  quat_k = 2.0*qhalf_k - qfull_k;                                       \
+  ME_qnormalize(quat);                                                  \
+                                                                        \
+  quatin[0] = quat_w;                                                   \
+  quatin[1] = quat_i;                                                   \
+  quatin[2] = quat_j;                                                   \
+  quatin[3] = quat_k;                                                   \
 }
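
ME_omega_richardson above advances the angular momentum a half force step,
then propagates the quaternion twice -- one full step (qfull) and two half
steps (qhalf) -- and combines them as q = 2*qhalf - qfull, which cancels
the leading local error term of the underlying first-order update; every
intermediate is renormalized. A toy illustration of the same cancellation
on y' = y with explicit Euler (not LAMMPS code):

#include <cmath>
#include <cstdio>

static double euler(double y, double h) { return y + h * y; }  // one step

int main() {
  const double h = 0.1, exact = std::exp(h);        // y(0) = 1, y(h) = e^h
  const double full  = euler(1.0, h);               // one full step
  const double half2 = euler(euler(1.0, h/2), h/2); // two half steps
  const double rich  = 2.0 * half2 - full;          // Richardson combination
  std::printf("err full %.1e  half2 %.1e  rich %.1e\n",
              full - exact, half2 - exact, rich - exact);
  return 0;
}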
 
 #endif
diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp
index bff3d53636..c5574a78c7 100644
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@@ -51,11 +51,11 @@ NBinIntel::~NBinIntel() {
     const int * bins = this->bins;
     const int * _atombin = this->_atombin;
     const int * _binpacked = this->_binpacked;
-    #pragma offload_transfer target(mic:_cop)	\
+    #pragma offload_transfer target(mic:_cop)   \
       nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1))
   }
   #endif
-}  
+}
 
 /* ----------------------------------------------------------------------
    setup for bin_atoms()
@@ -70,8 +70,8 @@ void NBinIntel::bin_atoms_setup(int nall)
     #ifdef _LMP_INTEL_OFFLOAD
     if (_offload_alloc) {
       const int * binhead = this->binhead;
-      #pragma offload_transfer target(mic:_cop)	\
-	nocopy(binhead:alloc_if(0) free_if(1))
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(binhead:alloc_if(0) free_if(1))
     }
     #endif
 
@@ -98,8 +98,8 @@ void NBinIntel::bin_atoms_setup(int nall)
       const int * bins = this->bins;
       const int * _atombin = this->_atombin;
       const int * _binpacked = this->_binpacked;
-      #pragma offload_transfer target(mic:_cop)	\
-	nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
     }
     #endif
     memory->destroy(bins);
@@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
     const flt_t dx = (INTEL_BIGP - bboxhi[0]);
     const flt_t dy = (INTEL_BIGP - bboxhi[1]);
     const flt_t dz = (INTEL_BIGP - bboxhi[2]);
-    if (dx * dx + dy * dy + dz * dz < 
-	static_cast<flt_t>(neighbor->cutneighmaxsq))
+    if (dx * dx + dy * dy + dz * dz <
+        static_cast<flt_t>(neighbor->cutneighmaxsq))
       error->one(FLERR,
-	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+        "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
   }
 
   // ---------- Grow and cast/pack buffers -------------
@@ -183,7 +183,7 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
   {
     int ifrom, ito, tid;
     IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
-			      sizeof(ATOM_T));
+                              sizeof(ATOM_T));
     buffers->thr_pack(ifrom, ito, 0);
   }
   _fix->stop_watch(TIME_PACK);
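
The {1e15,1e15,1e15} guard above exists because the packed coordinate
buffers appear to park padding entries (used to round atom counts up to
the vector width) at INTEL_BIGP, so the ordinary cutoff test rejects them
without masking; a real atom within cutneighmax of that corner would break
the trick. A minimal sketch of the idea (hypothetical helper, not this
file's API):

// Pad coordinate arrays to a multiple of the SIMD width with sentinel
// positions so remote that every distance test rejects them.
static const float BIGP = (float)1e15;

static int pad_to_width(float *x, float *y, float *z, int n, int width) {
  const int padded = ((n + width - 1) / width) * width;   // round up
  for (int i = n; i < padded; i++) x[i] = y[i] = z[i] = BIGP;
  return padded;   // loops can now run a whole number of vectors
}
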
diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp
index ae4f599176..06c10c080f 100644
--- a/src/USER-INTEL/npair_full_bin_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_intel.cpp
@@ -70,48 +70,48 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   #endif
 
   buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
-		     _fix->nbor_pack_width());
+                     _fix->nbor_pack_width());
 
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-			 neighbor->cutneighmax);
+                         neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_fix->three_body_neighbor()) {
     if (need_ic) {
       if (offload_noghost) {
-	bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
       } else {
-	bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
+        bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
       }
     } else {
       if (offload_noghost) {
-	bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
       } else {
-	bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
+        bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
       }
     }
   } else {
     if (need_ic) {
       if (offload_noghost) {
-	bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
       } else {
-	bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
+        bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
       }
     } else {
       if (offload_noghost) {
-	bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
       } else {
-	bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
-	bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
+        bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
+        bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
       }
     }
   }
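
All of the branching above is one-time dispatch: bin_newton takes its mode
flags (offload_noghost, need_ic, FULL, TRI, THREE) as non-type template
parameters, so every combination compiles to a separate kernel with the
flag tests folded away rather than evaluated per atom. The pattern,
reduced to two flags (a sketch, not this class's real signature):

// Compile-time flag dispatch: 'if (FLAG)' on a template constant is
// resolved by the compiler, so each instantiation is a lean kernel.
template <int NEED_IC, int THREE>
static void kernel(int n) {
  for (int i = 0; i < n; i++) {
    if (NEED_IC) { /* minimum-image check, compiled in only when 1 */ }
    if (THREE)   { /* three-body neighbor packing, only when 1 */ }
  }
}

static void dispatch(bool need_ic, bool three, int n) {
  if (need_ic) { if (three) kernel<1,1>(n); else kernel<1,0>(n); }
  else         { if (three) kernel<0,1>(n); else kernel<0,0>(n); }
}
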
diff --git a/src/USER-INTEL/npair_full_bin_intel.h b/src/USER-INTEL/npair_full_bin_intel.h
index 83f2c3cd4c..0f8a27b3b4 100644
--- a/src/USER-INTEL/npair_full_bin_intel.h
+++ b/src/USER-INTEL/npair_full_bin_intel.h
@@ -15,7 +15,7 @@
 
 NPairStyle(full/bin/intel,
            NPairFullBinIntel,
-           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | 
+           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
            NP_INTEL)
 #else
 
diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
index e7d5995cc5..c761557097 100644
--- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
@@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : 
+NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
   NPairIntel(lmp) {}
 
 /* ----------------------------------------------------------------------
@@ -75,14 +75,14 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-			 neighbor->cutneighmax);
+                         neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
       bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal, 
-					off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
       bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
       bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
@@ -90,7 +90,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   } else {
     if (offload_noghost) {
       bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, 
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal,
                                         off_end);
     } else {
       bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
@@ -98,7 +98,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
     }
   }
   #else
-  if (need_ic) 
+  if (need_ic)
     bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
   else
     bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
index 3c36458f06..d70f1ec589 100644
--- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
@@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : 
+NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
   NPairIntel(lmp) {}
 
 /* ----------------------------------------------------------------------
@@ -75,14 +75,14 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-			 neighbor->cutneighmax);
+                         neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
       bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal, 
-					off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
       bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
       bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
@@ -90,8 +90,8 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   } else {
     if (offload_noghost) {
       bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
-      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal, 
-					off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
       bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
       bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index 0412398796..b20b1dcd08 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -40,7 +40,7 @@ NPairIntel::~NPairIntel() {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_off_map_stencil) {
     const int * stencil = this->stencil;
-    #pragma offload_transfer target(mic:_cop)	\
+    #pragma offload_transfer target(mic:_cop)   \
       nocopy(stencil:alloc_if(0) free_if(1))
   }
   #endif
@@ -49,10 +49,10 @@ NPairIntel::~NPairIntel() {
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t, int offload_noghost, int need_ic,
-	  int FULL, int TRI, int THREE>
-void NPairIntel::bin_newton(const int offload, NeighList *list, 
-                            IntelBuffers<flt_t,acc_t> *buffers, 
-                            const int astart, const int aend, 
+          int FULL, int TRI, int THREE>
+void NPairIntel::bin_newton(const int offload, NeighList *list,
+                            IntelBuffers<flt_t,acc_t> *buffers,
+                            const int astart, const int aend,
                             const int offload_end) {
 
   if (aend-astart == 0) return;
@@ -66,7 +66,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
   if (THREE == 0 && offload) {
     if (INTEL_MIC_NBOR_PAD > 1)
       pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  } else 
+  } else
   #endif
     if (THREE == 0 && INTEL_NBOR_PAD > 1)
       pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
@@ -120,7 +120,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
     overflow = _fix->get_off_overflow_flag();
     _fix->stop_watch(TIME_HOST_NEIGHBOR);
     _fix->start_watch(TIME_OFFLOAD_LATENCY);
-  } else 
+  } else
   #endif
   {
     tnum = comm->nthreads;
@@ -193,8 +193,8 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       int end = stencil[k] + 1;
       for (int kk = k + 1; kk < nstencil; kk++) {
         if (stencil[kk-1]+1 == stencil[kk]) {
-	  end++;
-	  k++;
+          end++;
+          k++;
         } else break;
       }
       binend[nstencilp] = end;
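
The loop above collapses the sorted stencil of bin offsets into runs of
consecutive values so the j-gather below can stream whole
[binstart, binend) ranges instead of visiting stencil bins one at a time.
Equivalent array-form sketch:

// Collapse a sorted stencil into [start,end) runs of consecutive offsets,
// as the loop above fills binstart[]/binend[].
static int compress_stencil(const int *stencil, int nstencil,
                            int *starts, int *ends) {
  int nruns = 0;
  for (int k = 0; k < nstencil; k++) {
    starts[nruns] = stencil[k];
    int end = stencil[k] + 1;
    while (k + 1 < nstencil && stencil[k] + 1 == stencil[k + 1]) {
      ++k; ++end;
    }
    ends[nruns++] = end;   // e.g. {4,5,6,9,11,12} -> [4,7) [9,10) [11,13)
  }
  return nruns;
}
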
@@ -214,16 +214,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       int tid, ifrom, ito;
 
       if (THREE) {
-	IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
+        IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
       } else {
-	IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+        IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
       }
       ifrom += astart;
       ito += astart;
       int e_ito = ito;
       if (THREE && ito == num) {
-	int imod = ito % pack_width;
-	if (imod) e_ito += pack_width - imod;
+        int imod = ito % pack_width;
+        if (imod) e_ito += pack_width - imod;
       }
       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
 
@@ -251,313 +251,313 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       // loop over all atoms in other bins in stencil, store every pair
       int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
       if (THREE) {
-	lane = 0;
-	max_chunk = 0;
+        lane = 0;
+        max_chunk = 0;
       }
       for (int i = ifrom; i < ito; i++) {
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const int itype = x[i].w;
-	tagint itag;
-	if (THREE) itag = tag[i];
+        tagint itag;
+        if (THREE) itag = tag[i];
         const int ioffset = ntypes * itype;
 
         const int ibin = atombin[i];
-	if (ibin != oldbin) {
-	  oldbin = ibin;
-	  ncount = 0;
-	  for (int k = 0; k < nstencilp; k++) {
-	    const int bstart = binhead[ibin + binstart[k]];
-	    const int bend = binhead[ibin + binend[k]];
+        if (ibin != oldbin) {
+          oldbin = ibin;
+          ncount = 0;
+          for (int k = 0; k < nstencilp; k++) {
+            const int bstart = binhead[ibin + binstart[k]];
+            const int bend = binhead[ibin + binend[k]];
             #if defined(LMP_SIMD_COMPILER)
-	    #pragma vector aligned
-	    #pragma simd
-	    #endif
-	    for (int jj = bstart; jj < bend; jj++)
-	      tj[ncount++] = binpacked[jj];
-	  }
+            #pragma vector aligned
+            #pragma simd
+            #endif
+            for (int jj = bstart; jj < bend; jj++)
+              tj[ncount++] = binpacked[jj];
+          }
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
-	  #pragma simd
-	  #endif
-	  for (int u = 0; u < ncount; u++) {
-	    const int j = tj[u];
-	    tx[u] = x[j].x;
-	    ty[u] = x[j].y;
-	    tz[u] = x[j].z;
-	    tjtype[u] = x[j].w;
-	  }
-
-	  if (FULL == 0 || TRI == 1) {
-	    icount = 0;
-	    istart = ncount;
-	    const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-	    int nedge = istart % alignb;
-	    if (nedge) istart + (alignb - nedge);
-	    itx = tx + istart;
-	    ity = ty + istart;
-	    itz = tz + istart;
-	    itj = tj + istart;
-	    itjtype = tjtype + istart;
+          #pragma simd
+          #endif
+          for (int u = 0; u < ncount; u++) {
+            const int j = tj[u];
+            tx[u] = x[j].x;
+            ty[u] = x[j].y;
+            tz[u] = x[j].z;
+            tjtype[u] = x[j].w;
+          }
+
+          if (FULL == 0 || TRI == 1) {
+            icount = 0;
+            istart = ncount;
+            const int alignb = INTEL_DATA_ALIGN / sizeof(int);
+            int nedge = istart % alignb;
+            if (nedge) istart += (alignb - nedge);
+            itx = tx + istart;
+            ity = ty + istart;
+            itz = tz + istart;
+            itj = tj + istart;
+            itjtype = tjtype + istart;
 
             const int bstart = binhead[ibin];
-	    const int bend = binhead[ibin + 1];
+            const int bend = binhead[ibin + 1];
             #if defined(LMP_SIMD_COMPILER)
-	    #pragma vector aligned
-	    #pragma simd
-	    #endif
-	    for (int jj = bstart; jj < bend; jj++) {
-	      const int j = binpacked[jj];
-	      itj[icount] = j;
-	      itx[icount] = x[j].x;
-	      ity[icount] = x[j].y;
-	      itz[icount] = x[j].z;
-	      itjtype[icount] = x[j].w;
-	      icount++;
-	    }
-	    if (icount + istart > obound) *overflow = 1;
-	  } else
-	    if (ncount > obound) *overflow = 1;
-	}
-
-	// ---------------------- Loop over i bin
+            #pragma vector aligned
+            #pragma simd
+            #endif
+            for (int jj = bstart; jj < bend; jj++) {
+              const int j = binpacked[jj];
+              itj[icount] = j;
+              itx[icount] = x[j].x;
+              ity[icount] = x[j].y;
+              itz[icount] = x[j].z;
+              itjtype[icount] = x[j].w;
+              icount++;
+            }
+            if (icount + istart > obound) *overflow = 1;
+          } else
+            if (ncount > obound) *overflow = 1;
+        }
+
+        // ---------------------- Loop over i bin
 
         int n = 0;
-	if (FULL == 0 || TRI == 1) {
+        if (FULL == 0 || TRI == 1) {
           #if defined(LMP_SIMD_COMPILER)
           #pragma vector aligned
-	  #pragma ivdep
-	  #endif
-	  for (int u = 0; u < icount; u++) {
-	    int addme = 1;
-	    int j = itj[u];
-
-	    // Cutoff Check
-	    const flt_t delx = xtmp - itx[u];
-	    const flt_t dely = ytmp - ity[u];
-	    const flt_t delz = ztmp - itz[u];
-	    const int jtype = itjtype[u];
-	    const flt_t rsq = delx * delx + dely * dely + delz * delz;
-	    if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
-	  
-	    // i bin (half) check and offload ghost check
-	    if (j < nlocal) {
-       	      const int ijmod = (i + j) % 2;
-	      if (i > j) {
-	        if (ijmod == 0) addme = 0;
-	      } else if (i < j) {
-	        if (ijmod == 1) addme = 0;
-	      } else 
- 		addme = 0;
+          #pragma ivdep
+          #endif
+          for (int u = 0; u < icount; u++) {
+            int addme = 1;
+            int j = itj[u];
+
+            // Cutoff Check
+            const flt_t delx = xtmp - itx[u];
+            const flt_t dely = ytmp - ity[u];
+            const flt_t delz = ztmp - itz[u];
+            const int jtype = itjtype[u];
+            const flt_t rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
+
+            // i bin (half) check and offload ghost check
+            if (j < nlocal) {
+              const int ijmod = (i + j) % 2;
+              if (i > j) {
+                if (ijmod == 0) addme = 0;
+              } else if (i < j) {
+                if (ijmod == 1) addme = 0;
+              } else
+                addme = 0;
               #ifdef _LMP_INTEL_OFFLOAD
-	      if (offload_noghost && i < offload_end) addme = 0;
-	      #endif
-	    } else {
+              if (offload_noghost && i < offload_end) addme = 0;
+              #endif
+            } else {
               #ifdef _LMP_INTEL_OFFLOAD
-	      if (offload_noghost && offload) addme = 0;
-	      #endif
-	      if (itz[u] < ztmp) addme = 0;
-	      if (itz[u] == ztmp) {
+              if (offload_noghost && offload) addme = 0;
+              #endif
+              if (itz[u] < ztmp) addme = 0;
+              if (itz[u] == ztmp) {
                 if (ity[u] < ytmp) addme = 0;
                 if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
               }
-            } 
-
-	    if (need_ic) {
-	      int no_special;
-	      ominimum_image_check(no_special, delx, dely, delz);
-	      if (no_special)
-		j = -j - 1;
-	    }
-
-	    if (addme)
-	      neighptr[n++] = j;
-	  }
-	} // if FULL==0
-
-	// ---------------------- Loop over other bins
-
-	int n2, *neighptr2;
-	if (THREE) {
-	  n = pack_offset;
-	  n2 = pack_offset + maxnbors;
-	  neighptr2 = neighptr;
-	}
-	#if defined(LMP_SIMD_COMPILER)
+            }
+
+            if (need_ic) {
+              int no_special;
+              ominimum_image_check(no_special, delx, dely, delz);
+              if (no_special)
+                j = -j - 1;
+            }
+
+            if (addme)
+              neighptr[n++] = j;
+          }
+        } // if FULL==0
+
+        // ---------------------- Loop over other bins
+
+        int n2, *neighptr2;
+        if (THREE) {
+          n = pack_offset;
+          n2 = pack_offset + maxnbors;
+          neighptr2 = neighptr;
+        }
+        #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
-	#endif
-	for (int u = 0; u < ncount; u++) {
-	  int addme = 1;
+        #endif
+        for (int u = 0; u < ncount; u++) {
+          int addme = 1;
           int j = tj[u];
 
-	  if (FULL)
-	    if (i == j) addme = 0;
+          if (FULL)
+            if (i == j) addme = 0;
 
-	  // Cutoff Check
+          // Cutoff Check
           const flt_t delx = xtmp - tx[u];
           const flt_t dely = ytmp - ty[u];
           const flt_t delz = ztmp - tz[u];
-	  const int jtype = tjtype[u];
+          const int jtype = tjtype[u];
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
-	  
-	  // Triclinic
-	  if (TRI) {
-	    if (tz[u] < ztmp) addme = 0;
-	    if (tz[u] == ztmp) {
-	      if (ty[u] < ytmp) addme = 0;
-	      if (ty[u] == ytmp) {
-	        if (tx[u] < xtmp) addme = 0;
+
+          // Triclinic
+          if (TRI) {
+            if (tz[u] < ztmp) addme = 0;
+            if (tz[u] == ztmp) {
+              if (ty[u] < ytmp) addme = 0;
+              if (ty[u] == ytmp) {
+                if (tx[u] < xtmp) addme = 0;
                 if (tx[u] == xtmp && j <= i) addme = 0;
               }
-	    }
-	  }
+            }
+          }
 
-	  // offload ghost check
+          // offload ghost check
           #ifdef _LMP_INTEL_OFFLOAD
-	  if (offload_noghost) {
-	    if (j < nlocal) {
-	      if (i < offload_end) addme = 0;
+          if (offload_noghost) {
+            if (j < nlocal) {
+              if (i < offload_end) addme = 0;
             } else if (offload) addme = 0;
-	  }
-	  #endif
-
-	  int pj;
-	  if (THREE) pj = j;
-	  if (need_ic) {
-	    int no_special;
-	    ominimum_image_check(no_special, delx, dely, delz);
-	    if (no_special)
-	      j = -j - 1;
-	  }
-
-	  if (THREE) {
-	    const int jtag = tag[pj];
-	    int flist = 0;
-	    if (itag > jtag) {
-	      if ((itag+jtag) % 2 == 0) flist = 1;
-	    } else if (itag < jtag) {
-	      if ((itag+jtag) % 2 == 1) flist = 1;
-	    } else {
-	      if (tz[u] < ztmp) flist = 1;
-	      else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
-	      else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) 
-	        flist = 1;
-	    }
-	    if (addme) {
-	      if (flist)
-		neighptr2[n2++] = j;
-	      else
-		neighptr[n++] = j;
-	    }
-	  } else {
-	    if (addme)
-	      neighptr[n++] = j;
-	  }
-	} // for u
+          }
+          #endif
+
+          int pj;
+          if (THREE) pj = j;
+          if (need_ic) {
+            int no_special;
+            ominimum_image_check(no_special, delx, dely, delz);
+            if (no_special)
+              j = -j - 1;
+          }
+
+          if (THREE) {
+            const int jtag = tag[pj];
+            int flist = 0;
+            if (itag > jtag) {
+              if ((itag+jtag) % 2 == 0) flist = 1;
+            } else if (itag < jtag) {
+              if ((itag+jtag) % 2 == 1) flist = 1;
+            } else {
+              if (tz[u] < ztmp) flist = 1;
+              else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
+              else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
+                flist = 1;
+            }
+            if (addme) {
+              if (flist)
+                neighptr2[n2++] = j;
+              else
+                neighptr[n++] = j;
+            }
+          } else {
+            if (addme)
+              neighptr[n++] = j;
+          }
+        } // for u
 
         #ifndef _LMP_INTEL_OFFLOAD
-	if (exclude) {
-	  int alln = n;
-	  if (THREE) n = pack_offset;
-	  else n = 0;
-	  for (int u = pack_offset; u < alln; u++) {
-	    const int j = neighptr[u];
-	    int pj = j;
-	    if (need_ic)
-	      if (pj < 0) pj = -j - 1;
-	    const int jtype = x[pj].w;
-	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-	    neighptr[n++] = j;
+        if (exclude) {
+          int alln = n;
+          if (THREE) n = pack_offset;
+          else n = 0;
+          for (int u = pack_offset; u < alln; u++) {
+            const int j = neighptr[u];
+            int pj = j;
+            if (need_ic)
+              if (pj < 0) pj = -j - 1;
+            const int jtype = x[pj].w;
+            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+            neighptr[n++] = j;
+          }
+          if (THREE) {
+            alln = n2;
+            n2 = pack_offset + maxnbors;
+            for (int u = pack_offset + maxnbors; u < alln; u++) {
+              const int j = neighptr[u];
+              int pj = j;
+              if (need_ic)
+                if (pj < 0) pj = -j - 1;
+              const int jtype = x[pj].w;
+              if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+              neighptr[n2++] = j;
+            }
           }
-	  if (THREE) {
-	    alln = n2;
-	    n2 = pack_offset + maxnbors;
-	    for (int u = pack_offset + maxnbors; u < alln; u++) {
-	      const int j = neighptr[u];
-	      int pj = j;
-	      if (need_ic)
-		if (pj < 0) pj = -j - 1;
-	      const int jtype = x[pj].w;
-	      if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
-	      neighptr[n2++] = j;
-	    }
-	  }
         }
-	#endif
-	int ns;
-	if (THREE) {
-	  int alln = n;
-	  ns = n - pack_offset;
-	  atombin[i] = ns;
-	  n = lane;
-	  for (int u = pack_offset; u < alln; u++) {
-	    neighptr[n] = neighptr[u];
-	    n += pack_width;
-	  }
-	  ns += n2 - pack_offset - maxnbors;
-	  for (int u = pack_offset + maxnbors; u < n2; u++) {
-	    neighptr[n] = neighptr[u];
-	    n += pack_width;
-	  }
-	  if (ns > maxnbors) *overflow = 1;
-	} else
-	  if (n > maxnbors) *overflow = 1;
+        #endif
+        int ns;
+        if (THREE) {
+          int alln = n;
+          ns = n - pack_offset;
+          atombin[i] = ns;
+          n = lane;
+          for (int u = pack_offset; u < alln; u++) {
+            neighptr[n] = neighptr[u];
+            n += pack_width;
+          }
+          ns += n2 - pack_offset - maxnbors;
+          for (int u = pack_offset + maxnbors; u < n2; u++) {
+            neighptr[n] = neighptr[u];
+            n += pack_width;
+          }
+          if (ns > maxnbors) *overflow = 1;
+        } else
+          if (n > maxnbors) *overflow = 1;
 
         ilist[i] = i;
         cnumneigh[i] = ct;
-	if (THREE) {
-	  cnumneigh[i] += lane;
-	  numneigh[i] = ns;
-	} else {
-	  int edge = (n % pad_width);
-	  if (edge) {
-	    const int pad_end = n + (pad_width - edge);
+        if (THREE) {
+          cnumneigh[i] += lane;
+          numneigh[i] = ns;
+        } else {
+          int edge = (n % pad_width);
+          if (edge) {
+            const int pad_end = n + (pad_width - edge);
             #if defined(LMP_SIMD_COMPILER)
-	    #pragma vector aligned
+            #pragma vector aligned
             #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
-	            avg=INTEL_COMPILE_WIDTH/2
+                    avg=INTEL_COMPILE_WIDTH/2
             #endif
             for ( ; n < pad_end; n++)
               neighptr[n] = e_nall;
           }
-	  numneigh[i] = n;
-	}
-
-	if (THREE) {
-  	  if (ns > max_chunk) max_chunk = ns;
-	  lane++;
-	  if (lane == pack_width) {
-	    ct += max_chunk * pack_width;
-	    const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-	    const int edge = (ct % alignb);
-	    if (edge) ct += alignb - edge;
-	    neighptr = firstneigh + ct;
-	    max_chunk = 0;
-	    pack_offset = maxnbors * pack_width;
-	    lane = 0;
-	    if (ct + obound > list_size) {
-	      if (i < ito - 1) {
-		*overflow = 1;
-		ct = (ifrom + tid * 2) * maxnbors;
-	      }
-	    }
-	  }
-	} else {
-	  ct += n;
-	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-	  const int edge = (ct % alignb);
-	  if (edge) ct += alignb - edge;
-	  neighptr = firstneigh + ct;
-	  if (ct + obound > list_size) {
-	    if (i < ito - 1) {
-	      *overflow = 1;
-	      ct = (ifrom + tid * 2) * maxnbors;
-	    }
-	  }
-	}
+          numneigh[i] = n;
+        }
+
+        if (THREE) {
+          if (ns > max_chunk) max_chunk = ns;
+          lane++;
+          if (lane == pack_width) {
+            ct += max_chunk * pack_width;
+            const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+            const int edge = (ct % alignb);
+            if (edge) ct += alignb - edge;
+            neighptr = firstneigh + ct;
+            max_chunk = 0;
+            pack_offset = maxnbors * pack_width;
+            lane = 0;
+            if (ct + obound > list_size) {
+              if (i < ito - 1) {
+                *overflow = 1;
+                ct = (ifrom + tid * 2) * maxnbors;
+              }
+            }
+          }
+        } else {
+          ct += n;
+          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+          const int edge = (ct % alignb);
+          if (edge) ct += alignb - edge;
+          neighptr = firstneigh + ct;
+          if (ct + obound > list_size) {
+            if (i < ito - 1) {
+              *overflow = 1;
+              ct = (ifrom + tid * 2) * maxnbors;
+            }
+          }
+        }
       }
 
       if (*overflow == 1)
@@ -568,16 +568,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
       int ghost_offset = 0, nall_offset = e_nall;
       if (separate_buffers) {
-	for (int i = ifrom; i < ito; ++i) {
+        for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
-	  #if __INTEL_COMPILER+0 > 1499
-	  #pragma vector aligned
+          #if __INTEL_COMPILER+0 > 1499
+          #pragma vector aligned
           #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
-	  #endif
-	  for (int jj = 0; jj < jnum; jj++) {
- 	    int j = jlist[jj];
-	    if (need_ic && j < 0) j = -j - 1;
+          #endif
+          for (int jj = 0; jj < jnum; jj++) {
+            int j = jlist[jj];
+            if (need_ic && j < 0) j = -j - 1;
             if (j < nlocal) {
               if (j < vlmin) vlmin = j;
               if (j > vlmax) vlmax = j;
@@ -585,33 +585,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
               if (j < vgmin) vgmin = j;
               if (j > vgmax) vgmax = j;
             }
-	  }
-	}
-	lmin = MIN(lmin,vlmin);
-	gmin = MIN(gmin,vgmin);
-	lmax = MAX(lmax,vlmax);
-	gmax = MAX(gmax,vgmax);
+          }
+        }
+        lmin = MIN(lmin,vlmin);
+        gmin = MIN(gmin,vgmin);
+        lmax = MAX(lmax,vlmax);
+        gmax = MAX(gmax,vgmax);
 
         #if defined(_OPENMP)
         #pragma omp critical
         #endif
         {
-  	  if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
-	  if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
-	  if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
-	  if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
         }
-	#pragma omp barrier
-	
-	int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
- 	if (nghost < 0) nghost = 0;
-	if (offload) {
-	  ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-	  nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-	} else {
-	  ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-	  nall_offset = nlocal + nghost;
-	}
       } // if separate_buffers
       #endif
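
When separate host/coprocessor buffers are active, the scan above finds
the index ranges each list actually references; jlist entries are then
shifted by ghost_offset so the used ghosts land directly after the locals
in the packed arrays, and the e_nall padding index is redirected to
nall_offset. The per-entry remap in isolation (sketch, ignoring the
special-bond bit packing):

// Remap one neighbor index into the compacted local + used-ghost range,
// mirroring the jlist fixups below; e_nall is the sentinel padding index.
static inline int remap_index(int j, int nlocal, int ghost_offset,
                              int e_nall, int nall_offset) {
  if (j < nlocal) return j;              // locals keep their index
  if (j == e_nall) return nall_offset;   // padding sentinel
  return j - ghost_offset;               // shift used ghosts down
}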
 
@@ -620,67 +620,67 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
 
-	  if (THREE) {
-	    const int trip = jnum * pack_width;
+          if (THREE) {
+            const int trip = jnum * pack_width;
             for (int jj = 0; jj < trip; jj+=pack_width) {
               const int j = jlist[jj];
-	      if (need_ic && j < 0) {
-	        which = 0;
-	        jlist[jj] = -j - 1;
+              if (need_ic && j < 0) {
+                which = 0;
+                jlist[jj] = -j - 1;
               } else
                 ofind_special(which, special, nspecial, i, tag[j]);
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j >= nlocal) {
-	        if (j == e_nall)
-		  jlist[jj] = nall_offset;
-	        else if (which) 
-		  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-	        else jlist[jj]-=ghost_offset;
+              #ifdef _LMP_INTEL_OFFLOAD
+              if (j >= nlocal) {
+                if (j == e_nall)
+                  jlist[jj] = nall_offset;
+                else if (which)
+                  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+                else jlist[jj]-=ghost_offset;
               } else
-	      #endif
+              #endif
               if (which) jlist[jj] = j ^ (which << SBBITS);
-	    }
-	  } else {
+            }
+          } else {
             #if defined(LMP_SIMD_COMPILER)
-	    #pragma vector aligned
+            #pragma vector aligned
             #pragma simd
-	    #endif 
+            #endif
             for (int jj = 0; jj < jnum; jj++) {
               const int j = jlist[jj];
-	      if (need_ic && j < 0) {
-	        which = 0;
-	        jlist[jj] = -j - 1;
+              if (need_ic && j < 0) {
+                which = 0;
+                jlist[jj] = -j - 1;
               } else
                 ofind_special(which, special, nspecial, i, tag[j]);
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j >= nlocal) {
-	        if (j == e_nall)
-		  jlist[jj] = nall_offset;
-	        else if (which) 
-		  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-	        else jlist[jj]-=ghost_offset;
+              #ifdef _LMP_INTEL_OFFLOAD
+              if (j >= nlocal) {
+                if (j == e_nall)
+                  jlist[jj] = nall_offset;
+                else if (which)
+                  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+                else jlist[jj]-=ghost_offset;
               } else
-	      #endif
+              #endif
               if (which) jlist[jj] = j ^ (which << SBBITS);
             }
-	  }
-	} // for i
+          }
+        } // for i
       } // if molecular
       #ifdef _LMP_INTEL_OFFLOAD
       else if (separate_buffers) {
-	for (int i = ifrom; i < ito; ++i) {
+        for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
-	  int jj = 0;
-	  #pragma vector aligned
-	  #pragma simd
-	  for (jj = 0; jj < jnum; jj++) {
-	    if (jlist[jj] >= nlocal) {
- 	      if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-	      else jlist[jj] -= ghost_offset;
-	    }
-	  }
-	}
+          int jj = 0;
+          #pragma vector aligned
+          #pragma simd
+          for (jj = 0; jj < jnum; jj++) {
+            if (jlist[jj] >= nlocal) {
+              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+              else jlist[jj] -= ghost_offset;
+            }
+          }
+        }
       }
       #endif
     } // end omp
@@ -704,9 +704,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       _fix->start_watch(TIME_PACK);
       _fix->set_neighbor_host_sizes();
       buffers->pack_sep_from_single(_fix->host_min_local(),
-				    _fix->host_used_local(),
-				    _fix->host_min_ghost(),
-				    _fix->host_used_ghost());
+                                    _fix->host_used_local(),
+                                    _fix->host_min_ghost(),
+                                    _fix->host_used_ghost());
       _fix->stop_watch(TIME_PACK);
     }
   }
@@ -732,9 +732,9 @@ void NPairIntel::grow_stencil()
     _off_map_stencil = stencil;
     const int * stencil = _off_map_stencil;
     const int maxstencil = ns->get_maxstencil();
-    #pragma offload_transfer target(mic:_cop)	\
+    #pragma offload_transfer target(mic:_cop)   \
       in(stencil:length(maxstencil) alloc_if(1) free_if(0))
-  }  
+  }
 }
 #endif
 
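For reference, the index rewriting in the hunks above relies on the packed neighbor encoding used throughout LAMMPS: a 2-bit special-bond code ("which") rides in the top bits of each neighbor index via j ^ (which << SBBITS), and consumers split it back apart with j & NEIGHMASK and j >> SBBITS & 3, exactly as the pair styles later in this patch do. A minimal self-contained sketch of that round trip; SBBITS and NEIGHMASK are restated here with their usual values only so the snippet compiles on its own (in practice which ranges over 0..3, and the masks are written to tolerate the sign bit the top codes set):

    #include <cassert>

    static const int SBBITS    = 30;          // shift for the 2-bit special-bond code
    static const int NEIGHMASK = 0x3FFFFFFF;  // low 30 bits hold the real index

    int main() {
      const int j = 123456, which = 1;          // hypothetical neighbor and code
      const int packed = j ^ (which << SBBITS); // XOR works: top bits start at 0
      assert((packed & NEIGHMASK) == j);        // index recovered
      assert((packed >> SBBITS & 3) == which);  // special-bond code recovered
      return 0;
    }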
diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h
index 51574a252c..55a529b2cb 100644
--- a/src/USER-INTEL/npair_intel.h
+++ b/src/USER-INTEL/npair_intel.h
@@ -84,8 +84,8 @@ class NPairIntel : public NPair {
   FixIntel *_fix;
 
   template <class flt_t, class acc_t, int, int, int, int, int>
-  void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, 
-		  const int, const int, const int offload_end = 0);
+  void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
+                  const int, const int, const int offload_end = 0);
 
   #ifdef _LMP_INTEL_OFFLOAD
   int _cop;
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index cdea9e76c4..07beae1e41 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -55,7 +55,7 @@ PairBuckCoulCutIntel::~PairBuckCoulCutIntel()
 void PairBuckCoulCutIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
-    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), 
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@@ -70,8 +70,8 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairBuckCoulCutIntel::compute(int eflag, int vflag,
-				   IntelBuffers<flt_t,acc_t> *buffers,
-				   const ForceConst<flt_t> &fc)
+                                   IntelBuffers<flt_t,acc_t> *buffers,
+                                   const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -94,13 +94,13 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
     #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				packthreads, sizeof(ATOM_T));
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
-  
+
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
@@ -127,9 +127,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
-				IntelBuffers<flt_t,acc_t> *buffers,
-				const ForceConst<flt_t> &fc,
-				const int astart, const int aend)
+                                IntelBuffers<flt_t,acc_t> *buffers,
+                                const ForceConst<flt_t> &fc,
+                                const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -160,8 +160,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -198,8 +198,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
     *timer_compute = MIC_Wtime();
     #endif
 
-    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
-			      f_stride, x, q);
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
@@ -233,20 +233,20 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
         acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
-  
+
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
         if (NEWTON_PAIR == 0)
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
@@ -262,19 +262,19 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
-	  
-          #ifdef INTEL_VMASK 
+
+          #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_coulsq) {
           #endif
             forcecoul = qqrd2e * qtmp*q[j]/r;
-            if (EFLAG) 
+            if (EFLAG)
               ecoul = forcecoul;
             if (sbindex){
               const flt_t factor_coul = special_coul[sbindex];
               forcecoul *= factor_coul;
               if(EFLAG)
                 ecoul *= factor_coul;
-              
+
             }
           #ifdef INTEL_VMASK
           }
@@ -282,7 +282,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
           if (rsq >= c_cuti[jtype].cut_coulsq)
             { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
           #endif
-          
+
           #ifdef INTEL_VMASK
           if (rsq < c_cuti[jtype].cut_ljsq) {
           #endif
@@ -290,14 +290,14 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
             flt_t rexp = exp(-r * c_forcei[jtype].rhoinv);
             forcebuck = r * rexp * c_forcei[jtype].buck1 -
               r6inv * c_forcei[jtype].buck2;
-            if (EFLAG) 
+            if (EFLAG)
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
                 c_energyi[jtype].offset;
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
-              if (EFLAG) 
+              if (EFLAG)
                 evdwl *= factor_lj;
             }
           #ifdef INTEL_VMASK
@@ -311,51 +311,51 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
           if (rsq < c_cuti[jtype].cutsq) {
           #endif
             const flt_t fpair = (forcecoul + forcebuck) * r2inv;
-	    const flt_t fpx = fpair * delx;
-	    fxtmp += fpx;
-	    if (NEWTON_PAIR) f[j].x -= fpx;
-	    const flt_t fpy = fpair * dely;
-	    fytmp += fpy;
-	    if (NEWTON_PAIR) f[j].y -= fpy;
-	    const flt_t fpz = fpair * delz;
-	    fztmp += fpz;
-	    if (NEWTON_PAIR) f[j].z -= fpz;
-
-            
-	    if (EFLAG) {
-	      sevdwl += evdwl;
-	      secoul += ecoul;
-	      if (eatom) {
-		fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-		if (NEWTON_PAIR) 
-		  f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
+
+
+            if (EFLAG) {
+              sevdwl += evdwl;
+              secoul += ecoul;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                if (NEWTON_PAIR)
+                  f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               }
-	    }
-	    if (NEWTON_PAIR == 0)
+            }
+            if (NEWTON_PAIR == 0)
               IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
         if (NEWTON_PAIR) {
-	  f[i].x += fxtmp;
-	  f[i].y += fytmp;
-	  f[i].z += fztmp;
-	} else {
-	  f[i].x = fxtmp;
-	  f[i].y = fytmp;
-	  f[i].z = fztmp;
-	}
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
         IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@@ -364,12 +364,12 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -410,7 +410,7 @@ void PairBuckCoulCutIntel::init_style()
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
-  
+
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
@@ -492,9 +492,9 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
-							   const int ntable,
-							   Memory *memory,
-							   const int cop) {
+                                                           const int ntable,
+                                                           Memory *memory,
+                                                           const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -505,12 +505,12 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       c_cut_t * oc_cut = c_cut[0];
 
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
-          oc_energy != NULL && ospecial_coul != NULL && 
+          oc_energy != NULL && ospecial_coul != NULL &&
           _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
           nocopy(oc_force, oc_energy: alloc_if(0) free_if(1))        \
-          nocopy(oc_cut: alloc_if(0) free_if(1)) 
+          nocopy(oc_cut: alloc_if(0) free_if(1))
       }
       #endif
 
@@ -534,7 +534,7 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       c_cut_t * oc_cut = c_cut[0];
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL &&
-          oc_energy != NULL && ospecial_coul != NULL &&  
+          oc_energy != NULL && ospecial_coul != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
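The inner loop reindented above evaluates the Buckingham pair term E(r) = A exp(-r/rho) - C/r^6 from precomputed constants, following the convention of the stock buck styles where buck1 = A/rho and buck2 = 6C. A plain scalar reference of the same energy and force algebra, with made-up coefficients, useful for checking the vectorized code against:

    #include <cmath>

    // One Buckingham pair: E(r) = A*exp(-r/rho) - C/r^6.
    // fpair is |F|/r, so the force components are fpair * (delx, dely, delz).
    void buck_pair(double rsq, double A, double rho, double C,
                   double &evdwl, double &fpair) {
      const double r     = std::sqrt(rsq);
      const double r2inv = 1.0 / rsq;
      const double r6inv = r2inv * r2inv * r2inv;
      const double rexp  = std::exp(-r / rho);
      const double buck1 = A / rho, buck2 = 6.0 * C; // precomputed per type pair
      evdwl = A * rexp - C * r6inv;                  // pair energy (offset omitted)
      fpair = (r * rexp * buck1 - r6inv * buck2) * r2inv;
    }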
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.h b/src/USER-INTEL/pair_buck_coul_cut_intel.h
index 42a55ac21f..7204323903 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.h
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.h
@@ -51,8 +51,8 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
 
   template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
@@ -75,7 +75,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
     ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
 
     void set_ntypes(const int ntypes, const int ntable, Memory *memory,
-		    const int cop);
+                    const int cop);
 
    private:
     int _ntypes, _ntable, _cop;
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index a9aee1e53e..995e2e8583 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -55,7 +55,7 @@ PairBuckCoulLongIntel::~PairBuckCoulLongIntel()
 void PairBuckCoulLongIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
-    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), 
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@@ -70,8 +70,8 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::compute(int eflag, int vflag,
-				    IntelBuffers<flt_t,acc_t> *buffers,
-				    const ForceConst<flt_t> &fc)
+                                    IntelBuffers<flt_t,acc_t> *buffers,
+                                    const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -85,7 +85,7 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
 
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
-    
+
     int packthreads;
     if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
     else packthreads = 1;
@@ -94,13 +94,13 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
     #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				packthreads, sizeof(ATOM_T));
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
-  
+
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
@@ -127,9 +127,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
-				 IntelBuffers<flt_t,acc_t> *buffers,
-				 const ForceConst<flt_t> &fc,
-				 const int astart, const int aend)
+                                 IntelBuffers<flt_t,acc_t> *buffers,
+                                 const ForceConst<flt_t> &fc,
+                                 const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -175,8 +175,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -213,7 +213,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
-    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)	\
+    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)    \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -224,8 +224,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     *timer_compute = MIC_Wtime();
     #endif
 
-    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
-			      f_stride, x, q);
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
@@ -260,24 +260,24 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
         const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
-        const flt_t * _noalias const rho_invi = rho_inv + ptr_off; 
+        const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
 
         const int   * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
-	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
-	if (NEWTON_PAIR == 0)
-	  if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
-	int ej = 0;
+        int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
@@ -287,33 +287,33 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-	  const int jtype = x[j].w;
+          const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
-	  
+
           if (rsq < c_forcei[jtype].cutsq) {
-	    trsq[ej]=rsq;
-	    tdelx[ej]=delx;
-	    tdely[ej]=dely;
-	    tdelz[ej]=delz;
-	    tjtype[ej]=jtype;
-	    tj[ej]=jlist[jj];
-	    ej++;
-	  }
-	}
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=jtype;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-		                 sv0, sv1, sv2, sv3, sv4, sv5)
+                                 sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcebuck, evdwl, ecoul;
           forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
 
-	  const int j = tj[jj] & NEIGHMASK;
+          const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
-	  const int jtype = tjtype[jj];
-	  const flt_t rsq = trsq[jj];
+          const int jtype = tjtype[jj];
+          const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
           const flt_t r = (flt_t)1.0 / sqrt(r2inv);
 
@@ -321,52 +321,52 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
           if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
             const flt_t A1 =  0.254829592;
-	    const flt_t A2 = -0.284496736;
-	    const flt_t A3 =  1.421413741;
-	    const flt_t A4 = -1.453152027;
-	    const flt_t A5 =  1.061405429;
-	    const flt_t EWALD_F = 1.12837917;
-	    const flt_t INV_EWALD_P = 1.0 / 0.3275911;
-	    
-	    const flt_t grij = g_ewald * r;
-	    const flt_t expm2 = exp(-grij * grij);
-	    const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
-	    const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	    const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
-	    forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
-	    if (EFLAG) ecoul = prefactor * erfc;
-
-	    const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
-	      prefactor;
-	    forcecoul -= adjust;
-	    if (EFLAG) ecoul -= adjust;
-	    
+            const flt_t A2 = -0.284496736;
+            const flt_t A3 =  1.421413741;
+            const flt_t A4 = -1.453152027;
+            const flt_t A5 =  1.061405429;
+            const flt_t EWALD_F = 1.12837917;
+            const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+            const flt_t grij = g_ewald * r;
+            const flt_t expm2 = exp(-grij * grij);
+            const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+            const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+            forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+            if (EFLAG) ecoul = prefactor * erfc;
+
+            const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+              prefactor;
+            forcecoul -= adjust;
+            if (EFLAG) ecoul -= adjust;
+
           #ifdef INTEL_ALLOW_TABLE
           } else {
-	    float rsq_lookup = rsq;
-	    const int itable = (__intel_castf32_u32(rsq_lookup) &
-	      ncoulmask) >> ncoulshiftbits;
-	    const flt_t fraction = (rsq_lookup - table[itable].r) *
-	      table[itable].dr;
-	    
-	    const flt_t tablet = table[itable].f +
-	      fraction * table[itable].df;
-	    forcecoul = qtmp * q[j] * tablet;
-	    if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
-	      fraction * detable[itable]);
-	    if (sbindex) {
-	      const flt_t table2 = ctable[itable] +
-		fraction * dctable[itable];
-	      const flt_t prefactor = qtmp * q[j] * table2;
-	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
-		prefactor;
-	      forcecoul -= adjust;
-	      if (EFLAG) ecoul -= adjust;
+            float rsq_lookup = rsq;
+            const int itable = (__intel_castf32_u32(rsq_lookup) &
+              ncoulmask) >> ncoulshiftbits;
+            const flt_t fraction = (rsq_lookup - table[itable].r) *
+              table[itable].dr;
+
+            const flt_t tablet = table[itable].f +
+              fraction * table[itable].df;
+            forcecoul = qtmp * q[j] * tablet;
+            if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+              fraction * detable[itable]);
+            if (sbindex) {
+              const flt_t table2 = ctable[itable] +
+                fraction * dctable[itable];
+              const flt_t prefactor = qtmp * q[j] * table2;
+              const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
             }
           }
           #endif
 
-	  #ifdef INTEL_VMASK
+          #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cut_ljsq) {
           #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
@@ -389,7 +389,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
-	  const flt_t fpair = (forcecoul + forcebuck) * r2inv;
+          const flt_t fpair = (forcecoul + forcebuck) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
@@ -400,38 +400,38 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
-	  if (EFLAG) {
+          if (EFLAG) {
             sevdwl += evdwl;
-	    secoul += ecoul;
-	    if (eatom) {
-	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-	      if (NEWTON_PAIR) 
-		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+            secoul += ecoul;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
-	  }
-	  if (NEWTON_PAIR == 0)
-	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
-				  fpx, fpy, fpz);
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
         } // for jj
-	if (NEWTON_PAIR) {
-	  f[i].x += fxtmp;
-	  f[i].y += fytmp;
-	  f[i].z += fztmp;
-	} else {
-	  f[i].x = fxtmp;
-	  f[i].y = fytmp;
-	  f[i].z = fztmp;
-	}	  
-	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@@ -440,12 +440,12 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -486,7 +486,7 @@ void PairBuckCoulLongIntel::init_style()
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
-  
+
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
@@ -549,7 +549,7 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int j = 0; j < tp1; j++) {
       if (cutsq[i][j] < cut_ljsq[i][j])
         error->all(FLERR,
-	 "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
+         "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].buck1 = buck1[i][j];
@@ -603,9 +603,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
-							   const int ntable,
-							   Memory *memory,
-							   const int cop) {
+                                                           const int ntable,
+                                                           Memory *memory,
+                                                           const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -625,10 +625,10 @@ void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
           ospecial_coul != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
-	  nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
-	  nocopy(orho_inv: alloc_if(0) free_if(1)) \
-	  nocopy(otable: alloc_if(0) free_if(1)) \
-	  nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+          nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
+          nocopy(orho_inv: alloc_if(0) free_if(1)) \
+          nocopy(otable: alloc_if(0) free_if(1)) \
+          nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
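The non-tabulated branch in the eval() hunks above hard-codes the Abramowitz and Stegun 7.1.26 rational approximation of erfc (coefficients A1..A5 with p = 0.3275911, absolute error below about 1.5e-7 for x >= 0); EWALD_F = 2/sqrt(pi) then supplies d/dx erfc(x) = -(2/sqrt(pi)) exp(-x^2) for the force term. A standalone check of those constants against the library erfc:

    #include <cmath>
    #include <cstdio>

    // A&S 7.1.26: erfc(x) ~ t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5))))*exp(-x^2)
    // with t = 1/(1 + P*x); the same five coefficients as the kernel above.
    double erfc_as(double x) {
      const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
                   A4 = -1.453152027, A5 =  1.061405429, P  = 0.3275911;
      const double t = 1.0 / (1.0 + P * x);
      return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) *
             std::exp(-x * x);
    }

    int main() {
      for (double x = 0.0; x <= 4.0; x += 0.5)
        std::printf("x=%.1f  approx=%.8f  std=%.8f\n",
                    x, erfc_as(x), std::erfc(x));
      return 0;
    }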
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.h b/src/USER-INTEL/pair_buck_coul_long_intel.h
index ec2cdba177..ec37c699c8 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.h
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.h
@@ -50,8 +50,8 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
 
   template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
@@ -76,7 +76,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
     ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
 
     void set_ntypes(const int ntypes, const int ntable, Memory *memory,
-		    const int cop);
+                    const int cop);
 
    private:
     int _ntypes, _ntable, _cop;
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index bbfc7225dd..8c63d2e62d 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -48,7 +48,7 @@ PairBuckIntel::~PairBuckIntel()
 void PairBuckIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED)
-    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), 
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@@ -63,8 +63,8 @@ void PairBuckIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairBuckIntel::compute(int eflag, int vflag,
-			    IntelBuffers<flt_t,acc_t> *buffers,
-			    const ForceConst<flt_t> &fc)
+                            IntelBuffers<flt_t,acc_t> *buffers,
+                            const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -87,13 +87,13 @@ void PairBuckIntel::compute(int eflag, int vflag,
     #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				packthreads, sizeof(ATOM_T));
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
-  
+
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
@@ -120,9 +120,9 @@ void PairBuckIntel::compute(int eflag, int vflag,
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckIntel::eval(const int offload, const int vflag,
-			 IntelBuffers<flt_t,acc_t> *buffers,
-			 const ForceConst<flt_t> &fc,
-			 const int astart, const int aend)
+                         IntelBuffers<flt_t,acc_t> *buffers,
+                         const ForceConst<flt_t> &fc,
+                         const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -147,8 +147,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -160,7 +160,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
   int *overflow = fix->get_off_overflow_flag();
   double *timer_compute = fix->off_watch_pair();
   // Redeclare as local variables for offload
- 
+
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload)                 \
     in(special_lj:length(0) alloc_if(0) free_if(0)) \
@@ -182,8 +182,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
     *timer_compute = MIC_Wtime();
     #endif
 
-    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
-			      f_stride, x, 0);
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl =  (acc_t)0;
@@ -215,23 +215,23 @@ void PairBuckIntel::eval(const int offload, const int vflag,
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
-	acc_t sevdwl,  sv0, sv1, sv2, sv3, sv4, sv5;
+        acc_t sevdwl,  sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl =  (acc_t)0;
-	if (NEWTON_PAIR == 0)
+        if (EFLAG) fwtmp = sevdwl =  (acc_t)0;
+        if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
-          
+
           flt_t  forcebuck, evdwl;
           forcebuck = evdwl =  (flt_t)0.0;
 
@@ -245,7 +245,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
           const flt_t r = sqrt(rsq);
           const flt_t r2inv = (flt_t)1.0 / rsq;
-	  
+
           #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cutsq) {
           #endif
@@ -257,7 +257,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
             #ifndef INTEL_VMASK
             if (rsq > c_forcei[jtype].cutsq)
               forcebuck =(flt_t)0.0;
-            #endif 
+            #endif
             if (EFLAG) {
               evdwl = rexp * c_energyi[jtype].a -
                 r6inv * c_energyi[jtype].c -
@@ -272,67 +272,67 @@ void PairBuckIntel::eval(const int offload, const int vflag,
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcebuck *= factor_lj;
-              if (EFLAG) 
+              if (EFLAG)
                 evdwl *= factor_lj;
             }
             const flt_t fpair =  forcebuck * r2inv;
-	    const flt_t fpx = fpair * delx;
-	    fxtmp += fpx;
-	    if (NEWTON_PAIR) f[j].x -= fpx;
-	    const flt_t fpy = fpair * dely;
-	    fytmp += fpy;
-	    if (NEWTON_PAIR) f[j].y -= fpy;
-	    const flt_t fpz = fpair * delz;
-	    fztmp += fpz;
-	    if (NEWTON_PAIR) f[j].z -= fpz;
-
-	    if (EFLAG) {
-	      sevdwl += evdwl;
-	      if (eatom) {
-		fwtmp += (flt_t)0.5 * evdwl;
-		if (NEWTON_PAIR) 
-		  f[j].w += (flt_t)0.5 * evdwl;
-	      }
-	    }
-	    if (NEWTON_PAIR == 0)
-	      IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
+
+            if (EFLAG) {
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+                  f[j].w += (flt_t)0.5 * evdwl;
+              }
+            }
+            if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
-	if (NEWTON_PAIR) {
-	  f[i].x += fxtmp;
-	  f[i].y += fytmp;
-	  f[i].z += fztmp;
-	} else {
-	  f[i].x = fxtmp;
-	  f[i].y = fytmp;
-	  f[i].z = fztmp;
-	}
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
         IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
-      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;	
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
       ev_global[0] = oevdwl;
       ev_global[1] = (acc_t)0;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -371,7 +371,7 @@ void PairBuckIntel::init_style()
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
-  
+
   fix->pair_init_check();
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
@@ -442,7 +442,7 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t>
-void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, 
+void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                   Memory *memory,
                                                   const int cop) {
   if ( (ntypes != _ntypes ) ) {
@@ -452,8 +452,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
 
-      if (ospecial_lj != NULL && oc_force != NULL && 
-          oc_energy != NULL  && 
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL  &&
           _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: alloc_if(0) free_if(1)) \
@@ -476,8 +476,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       c_force_t * oc_force = c_force[0];
       c_energy_t * oc_energy = c_energy[0];
       int tp1sq = ntypes*ntypes;
-      if (ospecial_lj != NULL && oc_force != NULL && 
-          oc_energy != NULL &&  
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL &&
           cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
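The set_ntypes() hunks in this file and the two before it all follow the same Intel LEO offload buffer lifecycle: alloc_if(1) free_if(0) creates a persistent card-side copy, alloc_if(0) free_if(1) releases it, and nocopy suppresses data motion. A minimal sketch of the three states, with hypothetical names; the pragmas are honored only by Intel compilers built with offload support and are ignored elsewhere:

    // Hypothetical persistent buffer on coprocessor number cop.
    void card_alloc(double *buf, int n, int cop) {
      #pragma offload_transfer target(mic:cop) \
        in(buf : length(n) alloc_if(1) free_if(0))  // allocate + copy, keep
    }

    void card_update(double *buf, int n, int cop) {
      #pragma offload_transfer target(mic:cop) \
        in(buf : length(n) alloc_if(0) free_if(0))  // refresh the existing copy
    }

    void card_free(double *buf, int cop) {
      #pragma offload_transfer target(mic:cop) \
        nocopy(buf : alloc_if(0) free_if(1))        // free, no data motion
    }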
diff --git a/src/USER-INTEL/pair_buck_intel.h b/src/USER-INTEL/pair_buck_intel.h
index e699a1611e..ab5e135262 100644
--- a/src/USER-INTEL/pair_buck_intel.h
+++ b/src/USER-INTEL/pair_buck_intel.h
@@ -50,8 +50,8 @@ private:
 
   template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
@@ -59,7 +59,7 @@ private:
 
   template <class flt_t>
   class ForceConst {
-  
+
   public:
     typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t;
     typedef struct { flt_t a, c, offset, pad; } c_energy_t;
@@ -78,7 +78,7 @@ private:
     int _ntypes, _cop;
     Memory *_memory;
   };
-  
+
   ForceConst<float> force_const_single;
   ForceConst<double> force_const_double;
 };
diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp
index 541f9745cb..b97128bf9f 100644
--- a/src/USER-INTEL/pair_eam_intel.cpp
+++ b/src/USER-INTEL/pair_eam_intel.cpp
@@ -74,8 +74,8 @@ void PairEAMIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairEAMIntel::compute(int eflag, int vflag,
-			   IntelBuffers<flt_t,acc_t> *buffers,
-			   const ForceConst<flt_t> &fc)
+                           IntelBuffers<flt_t,acc_t> *buffers,
+                           const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag, vflag);
@@ -111,37 +111,37 @@ void PairEAMIntel::compute(int eflag, int vflag,
   if (_onetype) {
     if (eflag) {
       if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
     if (eflag) {
       if (force->newton_pair) {
-	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
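The eight-way eval<...> ladder reindented above is how these styles turn runtime flags into compile-time constants: each combination of ONETYPE, EFLAG, and NEWTON_PAIR instantiates a separate specialization, so the flag tests inside the vectorized loops fold away as dead code. A stripped-down sketch of the pattern:

    // Flags become template parameters; the branches below are resolved at
    // compile time in each instantiation, leaving branch-free inner loops.
    template <int EFLAG, int NEWTON>
    void eval(int n, double *energy) {
      for (int i = 0; i < n; ++i) {
        // ... force computation ...
        if (EFLAG)  energy[i] += 1.0;  // eliminated when EFLAG == 0
        if (NEWTON) { /* scatter half the work to neighbors */ }
      }
    }

    void dispatch(bool eflag, bool newton, int n, double *e) {
      if (eflag) { if (newton) eval<1,1>(n, e); else eval<1,0>(n, e); }
      else       { if (newton) eval<0,1>(n, e); else eval<0,0>(n, e); }
    }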
@@ -151,8 +151,8 @@ void PairEAMIntel::compute(int eflag, int vflag,
 
 template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairEAMIntel::eval(const int offload, const int vflag,
-			IntelBuffers<flt_t,acc_t> *buffers,
-			const ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t,acc_t> *buffers,
+                        const ForceConst<flt_t> &fc,
                         const int astart, const int aend)
 {
   const int inum = aend - astart;
@@ -251,8 +251,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     #endif
     {
       int iifrom, iito, tid;
-      IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, 
-			      INTEL_VECTOR_WIDTH);
+      IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads,
+                              INTEL_VECTOR_WIDTH);
       iifrom += astart;
       iito += astart;
 
@@ -264,8 +264,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       else foff = 0;
       double * _noalias const trho = rho + foff;
       if (NEWTON_PAIR) {
-	memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
-	memset(trho, 0, nall * sizeof(double));
+        memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+        memset(trho, 0, nall * sizeof(double));
       }
 
       const int toffs = tid * ccache_stride;
@@ -280,108 +280,108 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       int rhor_joff, frho_ioff;
       if (ONETYPE) {
         const int ptr_off=_onetype * ntypes + _onetype;
-	oscale = scale_f[ptr_off];
-	int rhor_ioff = istride * _onetype;
-	rhor_joff = rhor_ioff + _onetype * jstride;
-	frho_ioff = fstride * _onetype;
+        oscale = scale_f[ptr_off];
+        int rhor_ioff = istride * _onetype;
+        rhor_joff = rhor_ioff + _onetype * jstride;
+        frho_ioff = fstride * _onetype;
       }
       for (int i = iifrom; i < iito; ++i) {
         int itype, rhor_ioff;
-	if (!ONETYPE) {
+        if (!ONETYPE) {
           itype = x[i].w;
-	  rhor_ioff = istride * itype;
-	}
-	const int * _noalias const jlist = firstneigh + cnumneigh[i];
-	const int jnum = numneigh[i];
+          rhor_ioff = istride * itype;
+        }
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
 
-	const flt_t xtmp = x[i].x;
-	const flt_t ytmp = x[i].y;
-	const flt_t ztmp = x[i].z;
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
 
-	acc_t rhoi = (acc_t)0.0;
-	int ej = 0;
+        acc_t rhoi = (acc_t)0.0;
+        int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
-	#endif
-	for (int jj = 0; jj < jnum; jj++) {
-	  const int j = jlist[jj] & NEIGHMASK;
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-	  const flt_t rsq = delx*delx + dely*dely + delz*delz;
+          const flt_t rsq = delx*delx + dely*dely + delz*delz;
 
-	  if (rsq < fcutforcesq) {
-	    trsq[ej]=rsq;
-	    if (!ONETYPE) tjtype[ej]=x[j].w;
-	    tj[ej]=jlist[jj];
-	    ej++;
+          if (rsq < fcutforcesq) {
+            trsq[ej]=rsq;
+            if (!ONETYPE) tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj];
+            ej++;
           }
         }
 
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
+        #pragma vector aligned
         #pragma simd reduction(+:rhoi)
-	#endif
+        #endif
         for (int jj = 0; jj < ej; jj++) {
-	  int jtype;
-	  const int j = tj[jj] & NEIGHMASK;
-	  if (!ONETYPE) jtype = tjtype[jj];
-	  const flt_t rsq = trsq[jj];
-	  flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
-	  int m = static_cast<int> (p);
-	  m = MIN(m,nr-1);
-	  p -= m;
-	  p = MIN(p,(flt_t)1.0);
-	  if (!ONETYPE)
-	    rhor_joff = rhor_ioff + jtype * jstride;
-	  const int joff = rhor_joff + m;
-	  flt_t ra;
-	  ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
-		rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
-	  rhoi += ra;
-	  if (NEWTON_PAIR) {
-	    if (!ONETYPE) {
-	      const int ioff = jtype * istride + itype * jstride + m;
-	      ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
-		    rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
-	    }
-	    trho[j] += ra;
-	  }
+          int jtype;
+          const int j = tj[jj] & NEIGHMASK;
+          if (!ONETYPE) jtype = tjtype[jj];
+          const flt_t rsq = trsq[jj];
+          flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
+          int m = static_cast<int> (p);
+          m = MIN(m,nr-1);
+          p -= m;
+          p = MIN(p,(flt_t)1.0);
+          if (!ONETYPE)
+            rhor_joff = rhor_ioff + jtype * jstride;
+          const int joff = rhor_joff + m;
+          flt_t ra;
+          ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
+                rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
+          rhoi += ra;
+          if (NEWTON_PAIR) {
+            if (!ONETYPE) {
+              const int ioff = jtype * istride + itype * jstride + m;
+              ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
+                    rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
+            }
+            trho[j] += ra;
+          }
         } // for jj
-	if (NEWTON_PAIR)
-	  trho[i] += rhoi;
-	else
-	  trho[i] = rhoi;
+        if (NEWTON_PAIR)
+          trho[i] += rhoi;
+        else
+          trho[i] = rhoi;
       } // for i
 
       #if defined(_OPENMP)
       if (NEWTON_PAIR && nthreads > 1) {
         #pragma omp barrier
-        if (tid == 0) {  
+        if (tid == 0) {
           const int rcount = nall;
-	  if (nthreads == 2) {
+          if (nthreads == 2) {
             double *trho2 = rho + nmax;
-	    #pragma vector aligned
+            #pragma vector aligned
             #pragma simd
-	    for (int n = 0; n < rcount; n++)
-	      rho[n] += trho2[n];
+            for (int n = 0; n < rcount; n++)
+              rho[n] += trho2[n];
           } else if (nthreads == 4) {
             double *trho2 = rho + nmax;
-	    double *trho3 = trho2 + nmax;
-	    double *trho4 = trho3 + nmax;
-	    #pragma vector aligned
-	    #pragma simd
-	    for (int n = 0; n < rcount; n++)
-	      rho[n] += trho2[n] + trho3[n] + trho4[n];
+            double *trho3 = trho2 + nmax;
+            double *trho4 = trho3 + nmax;
+            #pragma vector aligned
+            #pragma simd
+            for (int n = 0; n < rcount; n++)
+              rho[n] += trho2[n] + trho3[n] + trho4[n];
           } else {
-	    double *trhon = rho + nmax;
-	    for (int t = 1; t < nthreads; t++) {
-  	      #pragma vector aligned
-	      #pragma simd
-	      for (int n = 0; n < rcount; n++)
-	        rho[n] += trhon[n];
-	      trhon += nmax;
+            double *trhon = rho + nmax;
+            for (int t = 1; t < nthreads; t++) {
+              #pragma vector aligned
+              #pragma simd
+              for (int n = 0; n < rcount; n++)
+                rho[n] += trhon[n];
+              trhon += nmax;
             }
           }
         }
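The barrier-guarded block above is the cross-thread reduction of the per-atom densities: with newton on, every thread accumulates into its own nmax-strided slice of rho, and thread 0 folds the slices together afterwards (the kernel unrolls the 2- and 4-thread cases by hand). A minimal sketch of the same scheme, assuming rho is sized nthreads * nmax:

    #include <omp.h>

    // rho must hold nthreads * nmax doubles; slice t starts at t * nmax.
    void accumulate_and_reduce(double *rho, int nall, int nmax) {
      #pragma omp parallel
      {
        const int tid      = omp_get_thread_num();
        const int nthreads = omp_get_num_threads();
        double *trho = rho + tid * nmax;   // this thread's private slice
        // ... each thread does trho[j] += contribution for its atoms ...
        #pragma omp barrier                // all slices complete
        if (tid == 0)
          for (int t = 1; t < nthreads; ++t)
            for (int n = 0; n < nall; ++n)
              rho[n] += rho[t * nmax + n];
      }
    }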
@@ -411,32 +411,32 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       #pragma simd reduction(+:tevdwl)
       #endif
       for (int i = iifrom; i < iito; ++i) {
-	int itype;
-	if (!ONETYPE) itype = x[i].w;
-	flt_t p = rho[i]*frdrho + (flt_t)1.0;
-	int m = static_cast<int> (p);
-	m = MAX(1,MIN(m,nrho-1));
-	p -= m;
-	p = MIN(p,(flt_t)1.0);
-	if (!ONETYPE) frho_ioff = itype * fstride;
-	const int ioff = frho_ioff + m;
-	fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + 
-	  frho_spline_f[ioff].c;
-	if (EFLAG) {
-	  flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + 
-		       frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d;
-	  if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax);
-	  if (!ONETYPE) {
-	    const int ptr_off=itype*ntypes + itype;
-	    oscale = scale_f[ptr_off];
-	  }
-	  phi *= oscale;
-	  tevdwl += phi;
-	  if (eatom) f[i].w += phi;
-	}
+        int itype;
+        if (!ONETYPE) itype = x[i].w;
+        flt_t p = rho[i]*frdrho + (flt_t)1.0;
+        int m = static_cast<int> (p);
+        m = MAX(1,MIN(m,nrho-1));
+        p -= m;
+        p = MIN(p,(flt_t)1.0);
+        if (!ONETYPE) frho_ioff = itype * fstride;
+        const int ioff = frho_ioff + m;
+        fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p +
+          frho_spline_f[ioff].c;
+        if (EFLAG) {
+          flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p +
+                       frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d;
+          if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax);
+          if (!ONETYPE) {
+            const int ptr_off=itype*ntypes + itype;
+            oscale = scale_f[ptr_off];
+          }
+          phi *= oscale;
+          tevdwl += phi;
+          if (eatom) f[i].w += phi;
+        }
       }
       if (EFLAG) oevdwl += tevdwl;
-      
+
 
       // communicate derivative of embedding function
 
@@ -447,7 +447,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       if (tid == 0)
         comm->forward_comm_pair(this);
       if (NEWTON_PAIR)
-	memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+        memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       #if defined(_OPENMP)
       #pragma omp barrier
@@ -458,94 +458,94 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 
       for (int i = iifrom; i < iito; ++i) {
         int itype, rhor_ioff;
-	const flt_t * _noalias scale_fi;
-	if (!ONETYPE) {
-	  itype = x[i].w;
-	  rhor_ioff = istride * itype;
-	  scale_fi = scale_f + itype*ntypes;
-	}
-	const int * _noalias const jlist = firstneigh + cnumneigh[i];
-	const int jnum = numneigh[i];
-
-	acc_t fxtmp, fytmp, fztmp, fwtmp;
-	acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
-
-	const flt_t xtmp = x[i].x;
-	const flt_t ytmp = x[i].y;
-	const flt_t ztmp = x[i].z;
-	fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        const flt_t * _noalias scale_fi;
+        if (!ONETYPE) {
+          itype = x[i].w;
+          rhor_ioff = istride * itype;
+          scale_fi = scale_f + itype*ntypes;
+        }
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
         if (NEWTON_PAIR == 0)
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
-	int ej = 0;
+        int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
-	#endif
-	for (int jj = 0; jj < jnum; jj++) {
-	  const int j = jlist[jj] & NEIGHMASK;
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-	  const flt_t rsq = delx*delx + dely*dely + delz*delz;
-
-	  if (rsq < fcutforcesq) {
-	    trsq[ej]=rsq;
-	    tdelx[ej]=delx;
-	    tdely[ej]=dely;
-	    tdelz[ej]=delz;
-	    if (!ONETYPE) tjtype[ej]=x[j].w;
-	    tj[ej]=jlist[jj];
-	    ej++;
-	  }
-	}
+          const flt_t rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq < fcutforcesq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            if (!ONETYPE) tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
 
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
+        #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-		                 sv0, sv1, sv2, sv3, sv4, sv5)
+                                 sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
-	  int jtype;
-	  const int j = tj[jj] & NEIGHMASK;
-	  if (!ONETYPE) jtype = tjtype[jj];
-	  const flt_t rsq = trsq[jj];
-	  const flt_t r = sqrt(rsq);
-	  flt_t p = r*frdr + (flt_t)1.0;
-	  int m = static_cast<int> (p);
-	  m = MIN(m,nr-1);
-	  p -= m;
-	  p = MIN(p,(flt_t)1.0);
-	  if (!ONETYPE)
-	    rhor_joff = rhor_ioff + jtype * jstride;
-	  const int joff = rhor_joff + m;
-	  const flt_t rhojp = (rhor_spline_f[joff].a*p + 
-			       rhor_spline_f[joff].b)*p + 
-	    rhor_spline_f[joff].c;
-	  flt_t rhoip;
-	  if (!ONETYPE) {
-	    const int ioff = jtype * istride + itype * jstride + m;
-	    rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + 
-	      rhor_spline_f[ioff].c;
-	  } else
-	    rhoip = rhojp;
-	  const flt_t z2p = (z2r_spline_t[joff].a*p + 
-			     z2r_spline_t[joff].b)*p + 
-	    z2r_spline_t[joff].c;
-	  const flt_t z2 = ((z2r_spline_t[joff].d*p + 
-			     z2r_spline_t[joff].e)*p + 
-			    z2r_spline_t[joff].f)*p + 
-	    z2r_spline_t[joff].g;
-	  
-	  const flt_t recip = (flt_t)1.0/r;
-	  const flt_t phi = z2*recip;
-	  const flt_t phip = z2p*recip - phi*recip;
-	  const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
-	  if (!ONETYPE)
-	    oscale = scale_fi[jtype];
-	  const flt_t fpair = -oscale*psip*recip;
-	  
+          int jtype;
+          const int j = tj[jj] & NEIGHMASK;
+          if (!ONETYPE) jtype = tjtype[jj];
+          const flt_t rsq = trsq[jj];
+          const flt_t r = sqrt(rsq);
+          flt_t p = r*frdr + (flt_t)1.0;
+          int m = static_cast<int> (p);
+          m = MIN(m,nr-1);
+          p -= m;
+          p = MIN(p,(flt_t)1.0);
+          if (!ONETYPE)
+            rhor_joff = rhor_ioff + jtype * jstride;
+          const int joff = rhor_joff + m;
+          const flt_t rhojp = (rhor_spline_f[joff].a*p +
+                               rhor_spline_f[joff].b)*p +
+            rhor_spline_f[joff].c;
+          flt_t rhoip;
+          if (!ONETYPE) {
+            const int ioff = jtype * istride + itype * jstride + m;
+            rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p +
+              rhor_spline_f[ioff].c;
+          } else
+            rhoip = rhojp;
+          const flt_t z2p = (z2r_spline_t[joff].a*p +
+                             z2r_spline_t[joff].b)*p +
+            z2r_spline_t[joff].c;
+          const flt_t z2 = ((z2r_spline_t[joff].d*p +
+                             z2r_spline_t[joff].e)*p +
+                            z2r_spline_t[joff].f)*p +
+            z2r_spline_t[joff].g;
+
+          const flt_t recip = (flt_t)1.0/r;
+          const flt_t phi = z2*recip;
+          const flt_t phip = z2p*recip - phi*recip;
+          const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
+          if (!ONETYPE)
+            oscale = scale_fi[jtype];
+          const flt_t fpair = -oscale*psip*recip;
+
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
@@ -556,20 +556,20 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
-	  if (EFLAG) {
-	    const flt_t evdwl = oscale*phi;
-	    sevdwl += evdwl;
-	    if (eatom) {
-	      fwtmp += (flt_t)0.5 * evdwl;
-	      if (NEWTON_PAIR)
-		f[j].w += (flt_t)0.5 * evdwl;
-	    }
-	  }
-	  if (NEWTON_PAIR == 0) 
-	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
-				  fpx, fpy, fpz);
+          if (EFLAG) {
+            const flt_t evdwl = oscale*phi;
+            sevdwl += evdwl;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl;
+            }
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
         } // for jj
-	if (NEWTON_PAIR) {
+        if (NEWTON_PAIR) {
           f[i].x += fxtmp;
           f[i].y += fytmp;
           f[i].z += fztmp;
@@ -577,19 +577,19 @@ void PairEAMIntel::eval(const int offload, const int vflag,
           f[i].x = fxtmp;
           f[i].y = fytmp;
           f[i].z = fztmp;
-	  sevdwl *= (acc_t)0.5;
+          sevdwl *= (acc_t)0.5;
         }
-	
+
         IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for i
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } /// omp
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       ev_global[0] = oevdwl;
@@ -597,13 +597,13 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
-      }	
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
@@ -665,7 +665,7 @@ void PairEAMIntel::init_style()
 
 template <class flt_t, class acc_t>
 void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
-				    IntelBuffers<flt_t,acc_t> *buffers)
+                                    IntelBuffers<flt_t,acc_t> *buffers)
 {
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
@@ -684,14 +684,14 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
-	cut = init_one(i,j);
-	cutneigh = cut + neighbor->skin;
-	cutsq[i][j] = cutsq[j][i] = cut*cut;
-	cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+        cut = init_one(i,j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
       }
     }
   }
-  
+
   _onetype=-1;
   double oldscale=-1;
   for (int i = 1; i < tp1; i++) {
@@ -709,32 +709,32 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int j = 1; j < tp1; j++) {
       fc.scale_f[i][j] = scale[i][j];
       if (type2rhor[i][j] >= 0) {
-	const int joff = ioff + j * fc.rhor_jstride();
-	for (int k = 0; k < nr + 1; k++) {
-	  if (type2rhor[j][i] != type2rhor[i][j])
-	    _onetype = 0;
+        const int joff = ioff + j * fc.rhor_jstride();
+        for (int k = 0; k < nr + 1; k++) {
+          if (type2rhor[j][i] != type2rhor[i][j])
+            _onetype = 0;
           else if (_onetype < 0)
-	    _onetype = i;
+            _onetype = i;
           if (oldscale < 0)
             oldscale = scale[i][j];
           else
-	    if (oldscale != scale[i][j])
-	      _onetype = 0;
-	  fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0];
-	  fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1];
-	  fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2];
-	  fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3];
-	  fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
-	  fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
-	  fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
-	  fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
-	  fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
-	  fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
-	  fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
-	  fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
-	  fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
-	  fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
-	}
+            if (oldscale != scale[i][j])
+              _onetype = 0;
+          fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0];
+          fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1];
+          fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2];
+          fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3];
+          fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
+          fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
+          fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
+          fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
+          fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
+          fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
+          fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
+          fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
+          fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
+          fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
+        }
       }
     }
   }
@@ -745,9 +745,9 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
-						 const int nr, const int nrho,
-						 Memory *memory,
-						 const int cop) {
+                                                 const int nr, const int nrho,
+                                                 Memory *memory,
+                                                 const int cop) {
   if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
     if (_ntypes > 0) {
       _memory->destroy(rhor_spline_f);
@@ -780,7 +780,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
 /* ---------------------------------------------------------------------- */
 
 int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
-				    int pbc_flag, int *pbc)
+                                    int pbc_flag, int *pbc)
 {
   if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     return pack_forward_comm(n, list, buf, fp);
@@ -802,7 +802,7 @@ void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf)
 
 template<class flt_t>
 int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
-				    flt_t *fp_f)
+                                    flt_t *fp_f)
 {
   int i,j,m;
 
@@ -817,8 +817,8 @@ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf,
 /* ---------------------------------------------------------------------- */
 
 template<class flt_t>
-void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, 
-				       flt_t *fp_f)
+void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf,
+                                       flt_t *fp_f)
 {
   int i,m,last;
 
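
[A minimal sketch of the per-thread density reduction reindented in the hunks above: each thread accumulates into a private slab of rho (offset by nmax per thread) and, after a barrier, thread 0 folds all slabs into slab 0, as in the generic-nthreads branch. Slab length, thread layout, and the accumulated values are placeholders here, not the spline-interpolated densities of the real kernel.]

// sketch only -- names nmax/trho mirror the patch, values are illustrative
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
  const int nmax = 1024;                        // slab length (>= atom count)
  const int nthreads = omp_get_max_threads();
  std::vector<double> rho(nthreads * nmax, 0.0);

  #pragma omp parallel
  {
    const int tid = omp_get_thread_num();
    double *trho = rho.data() + tid * nmax;     // this thread's private slab
    for (int i = tid; i < nmax; i += nthreads)
      trho[i] += 1.0;                           // stand-in for rho accumulation

    #pragma omp barrier
    if (tid == 0) {                             // fold slabs 1..nthreads-1
      double *trhon = rho.data() + nmax;
      for (int t = 1; t < nthreads; t++) {
        for (int n = 0; n < nmax; n++)
          rho[n] += trhon[n];
        trhon += nmax;
      }
    }
  }
  std::printf("rho[0] = %g\n", rho[0]);
  return 0;
}
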
diff --git a/src/USER-INTEL/pair_eam_intel.h b/src/USER-INTEL/pair_eam_intel.h
index c7bb3b7bd0..f34e740bda 100644
--- a/src/USER-INTEL/pair_eam_intel.h
+++ b/src/USER-INTEL/pair_eam_intel.h
@@ -53,8 +53,8 @@ class PairEAMIntel : public PairEAM {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, 
-	    class acc_t>
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t,
+            class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
@@ -79,8 +79,8 @@ class PairEAMIntel : public PairEAM {
     ForceConst() : _ntypes(0), _nr(0)  {}
     ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
 
-    void set_ntypes(const int ntypes, const int nr, const int nrho, 
-		    Memory *memory, const int cop);
+    void set_ntypes(const int ntypes, const int nr, const int nrho,
+                    Memory *memory, const int cop);
     inline int rhor_jstride() const { return _nr; }
     inline int rhor_istride() const { return _nr * _ntypes; }
     inline int frho_stride() const { return _nrho; }
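
[The rhor_jstride()/rhor_istride() accessors above encode how the per-type-pair spline tables are flattened into one aligned array: entry (itype, jtype, m) lives at itype*istride + jtype*jstride + m, matching the joff/ioff arithmetic in eval(). A small sketch of that indexing; the 0-based types and sizes are assumptions of this sketch (LAMMPS itself sizes these arrays for 1-based types).]

// sketch only -- stride math mirrors rhor_istride()/rhor_jstride()
#include <vector>
#include <cassert>

struct SplineCoef { double a, b, c; };

int main() {
  const int ntypes = 3, nr = 8;            // illustrative sizes
  const int jstride = nr;                  // matches rhor_jstride()
  const int istride = nr * ntypes;         // matches rhor_istride()
  std::vector<SplineCoef> rhor_spline_f(ntypes * istride);

  const int itype = 1, jtype = 2, m = 5;   // one (type pair, table point)
  const int joff = istride * itype + jstride * jtype + m;
  assert(joff < (int)rhor_spline_f.size());
  rhor_spline_f[joff] = SplineCoef{1.0, 2.0, 3.0};
  return 0;
}
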
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index af96fcbb79..ed7dd424af 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -98,17 +98,17 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
-				sizeof(ATOM_T));
+                                sizeof(ATOM_T));
       if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
 
       for (int i = ifrom; i < ito; i++) {
-	int qi = ellipsoid[i];
-	if (qi > -1) {
-	  quat[i].w = bonus[qi].quat[0];
-	  quat[i].i = bonus[qi].quat[1];
-	  quat[i].j = bonus[qi].quat[2];
-	  quat[i].k = bonus[qi].quat[3];
-	}
+        int qi = ellipsoid[i];
+        if (qi > -1) {
+          quat[i].w = bonus[qi].quat[0];
+          quat[i].i = bonus[qi].quat[1];
+          quat[i].j = bonus[qi].quat[2];
+          quat[i].k = bonus[qi].quat[3];
+        }
       }
     }
     quat[nall].w = (flt_t)1.0;
@@ -161,65 +161,65 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   if (fix->separate_buffers()) {
     fix->start_watch(TIME_PACK);
     if (offload) {
-      #pragma omp parallel 
+      #pragma omp parallel
       {
         int ifrom, ito, tid;
-	int nthreads = comm->nthreads;
-	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
-				  nthreads, sizeof(ATOM_T));
-	if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
-	for (int i = ifrom; i < ito; i++) {
-	  int qi = ellipsoid[i];
-	  if (qi > -1) {
-	    quat[i].w = bonus[qi].quat[0];
-	    quat[i].i = bonus[qi].quat[1];
-	    quat[i].j = bonus[qi].quat[2];
-	    quat[i].k = bonus[qi].quat[3];
-	  }
-	}
-	int nghost = nall - nlocal;
-	if (nghost) {
-	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
-				 nthreads, sizeof(ATOM_T));
-	  int offset = 0;
-	  ifrom += nlocal;
-	  ito += nlocal;
-	  if (ago != 0) {
-	    offset = fix->offload_min_ghost() - nlocal;
-	    buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
-	  }
-	  for (int i = ifrom; i < ito; i++) {
-	    int qi = ellipsoid[i + offset];
-	    if (qi > -1) {
-	      quat[i].w = bonus[qi].quat[0];
-	      quat[i].i = bonus[qi].quat[1];
-	      quat[i].j = bonus[qi].quat[2];
-	      quat[i].k = bonus[qi].quat[3];
-	    }
-	  }
-	}
+        int nthreads = comm->nthreads;
+        IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,
+                                  nthreads, sizeof(ATOM_T));
+        if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
+        for (int i = ifrom; i < ito; i++) {
+          int qi = ellipsoid[i];
+          if (qi > -1) {
+            quat[i].w = bonus[qi].quat[0];
+            quat[i].i = bonus[qi].quat[1];
+            quat[i].j = bonus[qi].quat[2];
+            quat[i].k = bonus[qi].quat[3];
+          }
+        }
+        int nghost = nall - nlocal;
+        if (nghost) {
+          IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,
+                                 nthreads, sizeof(ATOM_T));
+          int offset = 0;
+          ifrom += nlocal;
+          ito += nlocal;
+          if (ago != 0) {
+            offset = fix->offload_min_ghost() - nlocal;
+            buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
+          }
+          for (int i = ifrom; i < ito; i++) {
+            int qi = ellipsoid[i + offset];
+            if (qi > -1) {
+              quat[i].w = bonus[qi].quat[0];
+              quat[i].i = bonus[qi].quat[1];
+              quat[i].j = bonus[qi].quat[2];
+              quat[i].k = bonus[qi].quat[3];
+            }
+          }
+        }
       }
     } else {
       if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
       for (int i = fix->host_min_local(); i < nlocal; i++) {
-	int qi = ellipsoid[i];
-	if (qi > -1) {
-	  quat[i].w = bonus[qi].quat[0];
-	  quat[i].i = bonus[qi].quat[1];
-	  quat[i].j = bonus[qi].quat[2];
-	  quat[i].k = bonus[qi].quat[3];
-	}
+        int qi = ellipsoid[i];
+        if (qi > -1) {
+          quat[i].w = bonus[qi].quat[0];
+          quat[i].i = bonus[qi].quat[1];
+          quat[i].j = bonus[qi].quat[2];
+          quat[i].k = bonus[qi].quat[3];
+        }
       }
       int offset = fix->host_min_ghost() - nlocal;
       if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
       for (int i = nlocal; i < nall; i++) {
-	int qi = ellipsoid[i + offset];
-	if (qi > -1) {
-	  quat[i].w = bonus[qi].quat[0];
-	  quat[i].i = bonus[qi].quat[1];
-	  quat[i].j = bonus[qi].quat[2];
-	  quat[i].k = bonus[qi].quat[3];
-	}
+        int qi = ellipsoid[i + offset];
+        if (qi > -1) {
+          quat[i].w = bonus[qi].quat[0];
+          quat[i].i = bonus[qi].quat[1];
+          quat[i].j = bonus[qi].quat[2];
+          quat[i].k = bonus[qi].quat[3];
+        }
       }
     }
     fix->stop_watch(TIME_PACK);
@@ -252,8 +252,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -303,26 +303,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     #ifdef _LMP_INTEL_OFFLOAD
     if (separate_flag) {
       if (separate_flag < 3) {
-	int all_local = nlocal;
-	int ghost_min = overflow[LMP_GHOST_MIN];
-	nlocal = overflow[LMP_LOCAL_MAX] + 1;
-	int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
-	if (nghost < 0) nghost = 0;
-	nall = nlocal + nghost;
-	separate_flag--;
-	int flength;
-	if (NEWTON_PAIR) flength = nall;
-	else flength = nlocal;
-	IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
-			     separate_flag);
-	if (nghost) {
-	  if (nlocal < all_local || ghost_min > all_local) {
-	    memmove(x + nlocal, x + ghost_min,
-		    (nall - nlocal) * sizeof(ATOM_T));
-	    memmove(quat + nlocal, quat + ghost_min,
-		    (nall - nlocal) * sizeof(QUAT_T));
-	  }
-	}
+        int all_local = nlocal;
+        int ghost_min = overflow[LMP_GHOST_MIN];
+        nlocal = overflow[LMP_LOCAL_MAX] + 1;
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
+        if (nghost < 0) nghost = 0;
+        nall = nlocal + nghost;
+        separate_flag--;
+        int flength;
+        if (NEWTON_PAIR) flength = nall;
+        else flength = nlocal;
+        IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),
+                             separate_flag);
+        if (nghost) {
+          if (nlocal < all_local || ghost_min > all_local) {
+            memmove(x + nlocal, x + ghost_min,
+                    (nall - nlocal) * sizeof(ATOM_T));
+            memmove(quat + nlocal, quat + ghost_min,
+                    (nall - nlocal) * sizeof(QUAT_T));
+          }
+        }
       }
       x[nall].x = (flt_t)INTEL_BIGP;
       x[nall].y = (flt_t)INTEL_BIGP;
@@ -395,17 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
         fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
 
-	if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
-	if (NEWTON_PAIR == 0)
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
+        if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
 
         bool multiple_forms = false;
         int packed_j = 0;
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-	#pragma ivdep
-	#endif
-	for (int jj = 0; jj < jnum; jj++) {
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
           int jm = jlist[jj];
           int j = jm & NEIGHMASK;
           const int jtype = x[j].w;
@@ -428,27 +428,27 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           } else
             multiple_forms = true;
         }
-	const int edge = (packed_j % pad_width);
-	if (edge) {
-	  const int packed_end = packed_j + (pad_width - edge);
+        const int edge = (packed_j % pad_width);
+        if (edge) {
+          const int packed_end = packed_j + (pad_width - edge);
           #if defined(LMP_SIMD_COMPILER)
           #pragma loop_count min=1, max=15, avg=8
           #endif
-	  for ( ; packed_j < packed_end; packed_j++)
-	    jlist_form[packed_j] = nall;
-	}
-	  
+          for ( ; packed_j < packed_end; packed_j++)
+            jlist_form[packed_j] = nall;
+        }
+
         // -------------------------------------------------------------
 
-	#ifdef INTEL_V512
-	__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
-	__assume(packed_j % 8 == 0);
-	__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
-	#endif
+        #ifdef INTEL_V512
+        __assume(packed_j % INTEL_VECTOR_WIDTH == 0);
+        __assume(packed_j % 8 == 0);
+        __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
+        #endif
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
-	#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
-	                         sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+        #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
+                                 sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
         #endif
         for (int jj = 0; jj < packed_j; jj++) {
           flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
@@ -458,15 +458,15 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
           flt_t rtor_0, rtor_1, rtor_2;
 
-	  const int sbindex = jlist_form[jj] >> SBBITS & 3;
-	  const int j = jlist_form[jj] & NEIGHMASK;
+          const int sbindex = jlist_form[jj] >> SBBITS & 3;
+          const int j = jlist_form[jj] & NEIGHMASK;
           flt_t factor_lj = special_lj[sbindex];
           const int jtype = jtype_form[jj];
-	  const flt_t sigma = ijci[jtype].sigma;
-	  const flt_t epsilon = ijci[jtype].epsilon;
-	  const flt_t shape2_0 = ic[jtype].shape2[0];
-	  const flt_t shape2_1 = ic[jtype].shape2[1];
-	  const flt_t shape2_2 = ic[jtype].shape2[2];
+          const flt_t sigma = ijci[jtype].sigma;
+          const flt_t epsilon = ijci[jtype].epsilon;
+          const flt_t shape2_0 = ic[jtype].shape2[0];
+          const flt_t shape2_1 = ic[jtype].shape2[1];
+          const flt_t shape2_2 = ic[jtype].shape2[2];
           flt_t one_eng, evdwl;
 
           ME_quat_to_mat_trans(quat[j], a2);
@@ -488,7 +488,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ME_plus3(g1, g2, g12);
           flt_t kappa_0, kappa_1, kappa_2;
           ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
-		       kappa, ierror);
+                       kappa, ierror);
 
           // tempv = G12^-1*r12hat
 
@@ -520,7 +520,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           flt_t iota_0, iota_1, iota_2;
           ME_plus3(b1, b2, b12);
           ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
-		       iota, ierror);
+                       iota, ierror);
 
           // tempv = G12^-1*r12hat
 
@@ -534,7 +534,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           // compute dUr/dr
 
           temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
-	    sigma;
+            sigma;
           temp1 = temp1 * (flt_t)24.0 * epsilon;
           flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
           flt_t dUr_0, dUr_1, dUr_2;
@@ -548,8 +548,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
           flt_t dchi_0, dchi_1, dchi_2;
           temp1 = ME_dot3(iota, r12hat);
-          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * 
-	    std::pow(chi, (mu - (flt_t)1.0) / mu);
+          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
+            std::pow(chi, (mu - (flt_t)1.0) / mu);
           dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
           dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
           dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
@@ -663,36 +663,36 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           temp3 = chi * eta;
 
           ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) *
-	    (flt_t)-1.0;
+            (flt_t)-1.0;
           ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) *
-	    (flt_t)-1.0;
+            (flt_t)-1.0;
           ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
-	    (flt_t)-1.0;
+            (flt_t)-1.0;
 
           if (NEWTON_PAIR) {
             rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
-	      (flt_t)-1.0;
+              (flt_t)-1.0;
             rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
-	      (flt_t)-1.0;
+              (flt_t)-1.0;
             rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) *
-	      (flt_t)-1.0;
+              (flt_t)-1.0;
           }
 
           one_eng = temp1 * chi;
-	  #ifndef INTEL_VMASK
-	  if (jlist_form[jj] == nall) {
-	    one_eng = (flt_t)0.0;
-	    fforce_0 = 0.0;
-	    fforce_1 = 0.0;
-	    fforce_2 = 0.0;
-	    ttor_0 = 0.0;
-	    ttor_1 = 0.0;
-	    ttor_2 = 0.0;
-	    rtor_0 = 0.0;
-	    rtor_1 = 0.0;
-	    rtor_2 = 0.0;
-	  }
-	  #endif
+          #ifndef INTEL_VMASK
+          if (jlist_form[jj] == nall) {
+            one_eng = (flt_t)0.0;
+            fforce_0 = 0.0;
+            fforce_1 = 0.0;
+            fforce_2 = 0.0;
+            ttor_0 = 0.0;
+            ttor_1 = 0.0;
+            ttor_2 = 0.0;
+            rtor_0 = 0.0;
+            rtor_1 = 0.0;
+            rtor_2 = 0.0;
+          }
+          #endif
 
           fforce_0 *= factor_lj;
           fforce_1 *= factor_lj;
@@ -701,53 +701,53 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ttor_1 *= factor_lj;
           ttor_2 *= factor_lj;
 
-	  #ifdef INTEL_VMASK
-	  if (jlist_form[jj] < nall) {
-	  #endif
-	    fxtmp += fforce_0;
-	    fytmp += fforce_1;
-	    fztmp += fforce_2;
-	    t1tmp += ttor_0;
-	    t2tmp += ttor_1;
-	    t3tmp += ttor_2;
-
-	    if (NEWTON_PAIR) {
-	      rtor_0 *= factor_lj;
-	      rtor_1 *= factor_lj;
-	      rtor_2 *= factor_lj;
-	      int jp = j * 2;
-	      f[jp].x -= fforce_0;
-	      f[jp].y -= fforce_1;
-	      f[jp].z -= fforce_2;
-	      jp++;
-	      f[jp].x += rtor_0;
-	      f[jp].y += rtor_1;
-	      f[jp].z += rtor_2;
-	    }
-
-	    if (EFLAG) {
-	      evdwl = factor_lj * one_eng;
-	      sevdwl += evdwl;
-	      if (eatom) {
-		fwtmp += (flt_t)0.5 * evdwl;
-		if (NEWTON_PAIR)
-		  f[j*2].w += (flt_t)0.5 * evdwl;
-	      }
-	    }
-
-	    if (NEWTON_PAIR == 0) {
-	      if (vflag == 1) {
-		sv0 += delx_form[jj] * fforce_0;
-		sv1 += dely_form[jj] * fforce_1;
-		sv2 += delz_form[jj] * fforce_2;
-		sv3 += delx_form[jj] * fforce_1;
-		sv4 += delx_form[jj] * fforce_2;
-		sv5 += dely_form[jj] * fforce_2;
-	      }
+          #ifdef INTEL_VMASK
+          if (jlist_form[jj] < nall) {
+          #endif
+            fxtmp += fforce_0;
+            fytmp += fforce_1;
+            fztmp += fforce_2;
+            t1tmp += ttor_0;
+            t2tmp += ttor_1;
+            t3tmp += ttor_2;
+
+            if (NEWTON_PAIR) {
+              rtor_0 *= factor_lj;
+              rtor_1 *= factor_lj;
+              rtor_2 *= factor_lj;
+              int jp = j * 2;
+              f[jp].x -= fforce_0;
+              f[jp].y -= fforce_1;
+              f[jp].z -= fforce_2;
+              jp++;
+              f[jp].x += rtor_0;
+              f[jp].y += rtor_1;
+              f[jp].z += rtor_2;
+            }
+
+            if (EFLAG) {
+              evdwl = factor_lj * one_eng;
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+                  f[j*2].w += (flt_t)0.5 * evdwl;
+              }
+            }
+
+            if (NEWTON_PAIR == 0) {
+              if (vflag == 1) {
+                sv0 += delx_form[jj] * fforce_0;
+                sv1 += dely_form[jj] * fforce_1;
+                sv2 += delz_form[jj] * fforce_2;
+                sv3 += delx_form[jj] * fforce_1;
+                sv4 += delx_form[jj] * fforce_2;
+                sv5 += dely_form[jj] * fforce_2;
+              }
             } // EVFLAG
-	  #ifdef INTEL_VMASK
-	  }
-	  #endif
+          #ifdef INTEL_VMASK
+          }
+          #endif
         } // for jj
 
         // -------------------------------------------------------------
@@ -756,29 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ierror = 2;
 
         int ip = i * 2;
-	if (NEWTON_PAIR) {
-	  f[ip].x += fxtmp;
-	  f[ip].y += fytmp;
-	  f[ip].z += fztmp;
-	  ip++;
-	  f[ip].x += t1tmp;
-	  f[ip].y += t2tmp;
-	  f[ip].z += t3tmp;
-	} else {
-	  f[ip].x = fxtmp;
-	  f[ip].y = fytmp;
-	  f[ip].z = fztmp;
-	  ip++;
-	  f[ip].x = t1tmp;
-	  f[ip].y = t2tmp;
-	  f[ip].z = t3tmp;
-	}
-
-	if (EFLAG) {
-	  oevdwl += sevdwl;
-	  if (eatom) f[i * 2].w += fwtmp;
-	}
-	if (NEWTON_PAIR == 0) {
+        if (NEWTON_PAIR) {
+          f[ip].x += fxtmp;
+          f[ip].y += fytmp;
+          f[ip].z += fztmp;
+          ip++;
+          f[ip].x += t1tmp;
+          f[ip].y += t2tmp;
+          f[ip].z += t3tmp;
+        } else {
+          f[ip].x = fxtmp;
+          f[ip].y = fytmp;
+          f[ip].z = fztmp;
+          ip++;
+          f[ip].x = t1tmp;
+          f[ip].y = t2tmp;
+          f[ip].z = t3tmp;
+        }
+
+        if (EFLAG) {
+          oevdwl += sevdwl;
+          if (eatom) f[i * 2].w += fwtmp;
+        }
+        if (NEWTON_PAIR == 0) {
           if (vflag == 1) {
             ov0 += sv0;
             ov1 += sv1;
@@ -792,30 +792,30 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       int o_range;
       if (NEWTON_PAIR) {
         o_range = nall;
-	if (offload == 0) o_range -= minlocal;
-	IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
+        if (offload == 0) o_range -= minlocal;
+        IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
                                sizeof(FORCE_T));
-	const int sto = iito * 8;
-	const int fst4 = f_stride * 4;
+        const int sto = iito * 8;
+        const int fst4 = f_stride * 4;
         #if defined(_OPENMP)
         #pragma omp barrier
         #endif
-	acc_t *f_scalar = &f_start[0].x;
+        acc_t *f_scalar = &f_start[0].x;
         acc_t *f_scalar2 = f_scalar + fst4;
-	for (int t = 1; t < nthreads; t++) {
+        for (int t = 1; t < nthreads; t++) {
           #if defined(LMP_SIMD_COMPILER)
-	  #pragma vector aligned
-	  #pragma simd
+          #pragma vector aligned
+          #pragma simd
           #endif
-	  for (int n = iifrom * 8; n < sto; n++)
-	    f_scalar[n] += f_scalar2[n];
-	  f_scalar2 += fst4;
+          for (int n = iifrom * 8; n < sto; n++)
+            f_scalar[n] += f_scalar2[n];
+          f_scalar2 += fst4;
         }
 
         if (vflag==2) {
-	  const ATOM_T * _noalias const xo = x + minlocal;
+          const ATOM_T * _noalias const xo = x + minlocal;
           #if defined(LMP_SIMD_COMPILER)
-	  #pragma novector
+          #pragma novector
           #endif
           for (int n = iifrom; n < iito; n++) {
             const int nt2 = n * 2;
@@ -826,7 +826,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
             ov4 += f_start[nt2].z * xo[n].x;
             ov5 += f_start[nt2].z * xo[n].y;
           }
-	}
+        }
       }
 
       if (ierror)
@@ -840,12 +840,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)-0.5;
-	ov1 *= (acc_t)-0.5;
-	ov2 *= (acc_t)-0.5;
-	ov3 *= (acc_t)-0.5;
-	ov4 *= (acc_t)-0.5;
-	ov5 *= (acc_t)-0.5;
+        ov0 *= (acc_t)-0.5;
+        ov1 *= (acc_t)-0.5;
+        ov2 *= (acc_t)-0.5;
+        ov3 *= (acc_t)-0.5;
+        ov4 *= (acc_t)-0.5;
+        ov5 *= (acc_t)-0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -982,7 +982,7 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                       const int one_length,
                                                       const int nthreads,
                                                       Memory *memory,
-						      const int cop) {
+                                                      const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       fc_packed3 *oic = ic;
@@ -999,9 +999,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       int * ojlist_form = jlist_form[0];
 
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
-	  orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
-	  odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
-	  _cop >= 0) {
+          orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
+          odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL &&
+          _cop >= 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
           nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
@@ -1033,14 +1033,14 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       memory->create(jlist_form, nthreads, one_length, "jlist_form");
 
       for (int zn = 0; zn < nthreads; zn++)
-	for (int zo = 0; zo < one_length; zo++) {
-	  rsq_form[zn][zo] = 10.0;
-	  delx_form[zn][zo] = 10.0;
-	  dely_form[zn][zo] = 10.0;
-	  delz_form[zn][zo] = 10.0;
-	  jtype_form[zn][zo] = 1;
-	  jlist_form[zn][zo] = 0;
-	}
+        for (int zo = 0; zo < one_length; zo++) {
+          rsq_form[zn][zo] = 10.0;
+          delx_form[zn][zo] = 10.0;
+          dely_form[zn][zo] = 10.0;
+          delz_form[zn][zo] = 10.0;
+          jtype_form[zn][zo] = 1;
+          jlist_form[zn][zo] = 0;
+        }
 
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * ospecial_lj = special_lj;
@@ -1057,9 +1057,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
 
       int tp1sq = ntypes*ntypes;
       if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
-	  oic != NULL && orsq_form != NULL && odelx_form != NULL &&
-	  odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
-	  ojlist_form !=NULL && cop >= 0) {
+          oic != NULL && orsq_form != NULL && odelx_form != NULL &&
+          odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL &&
+          ojlist_form !=NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
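
[The edge/pad_width logic reindented in the Gay-Berne hunks pads the packed neighbor list up to a multiple of the vector width with the sentinel index nall, which refers to a dummy atom parked at INTEL_BIGP, so the SIMD loop over packed_j entries needs no scalar remainder. A stripped-down sketch of just that padding step; the width, counts, and list contents are illustrative.]

// sketch only -- pad a packed list to a multiple of the SIMD width
#include <vector>

int main() {
  const int pad_width = 8;        // stand-in for the SIMD vector width
  const int nall = 100;           // sentinel: index of the dummy atom
  std::vector<int> jlist_form;

  for (int j = 0; j < 13; j++)    // 13 accepted neighbors (illustrative)
    jlist_form.push_back(j);

  int packed_j = (int)jlist_form.size();
  const int edge = packed_j % pad_width;
  if (edge) {
    const int packed_end = packed_j + (pad_width - edge);
    for (; packed_j < packed_end; packed_j++)
      jlist_form.push_back(nall); // padded entries hit the far-away dummy atom
  }
  // jlist_form.size() is now 16, a multiple of pad_width
  return 0;
}
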
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index 7548b6eea3..fe99525122 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -67,8 +67,8 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
-					IntelBuffers<flt_t,acc_t> *buffers,
-					const ForceConst<flt_t> &fc)
+                                        IntelBuffers<flt_t,acc_t> *buffers,
+                                        const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -125,9 +125,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc,
-				     const int astart, const int aend)
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc,
+                                     const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -177,8 +177,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -227,7 +227,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
-			      f_stride, x, q);
+                              f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
@@ -259,7 +259,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
       int * _noalias const tjtype = ccachej + toffs;
 
       for (int i = iifrom; i < iito; i += iip) {
-	//        const int i = ilist[ii];
+        //        const int i = ilist[ii];
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
@@ -270,175 +270,175 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
-	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
-	if (NEWTON_PAIR == 0)
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
-	int ej = 0;
+        int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-	#pragma ivdep
+        #pragma vector aligned
+        #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           const int j = jlist[jj] & NEIGHMASK;
-	  const flt_t delx = xtmp - x[j].x;
+          const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
-	  if (rsq < cut_coulsq) {
-	    trsq[ej]=rsq;
-	    tdelx[ej]=delx;
-	    tdely[ej]=dely;
-	    tdelz[ej]=delz;
-	    tjtype[ej]=x[j].w;
-	    tj[ej]=jlist[jj];
-	    ej++;
-	  }
-	}
+          if (rsq < cut_coulsq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
 
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
 
-	  const int j = tj[jj] & NEIGHMASK;
+          const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
-	  const int jtype = tjtype[jj];
-	  const flt_t rsq = trsq[jj];
+          const int jtype = tjtype[jj];
+          const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_ALLOW_TABLE
           if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
             const flt_t A1 =  0.254829592;
-	    const flt_t A2 = -0.284496736;
-	    const flt_t A3 =  1.421413741;
-	    const flt_t A4 = -1.453152027;
-	    const flt_t A5 =  1.061405429;
-	    const flt_t EWALD_F = 1.12837917;
-	    const flt_t INV_EWALD_P = 1.0 / 0.3275911;
-
-	    const flt_t r = (flt_t)1.0 / sqrt(r2inv);
-	    const flt_t grij = g_ewald * r;
-	    const flt_t expm2 = exp(-grij * grij);
-	    const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
-	    const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	    const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
-	    forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
-	    if (EFLAG) ecoul = prefactor * erfc;
-
-	    const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
-	      prefactor;
-	    forcecoul -= adjust;
-	    if (EFLAG) ecoul -= adjust;
+            const flt_t A2 = -0.284496736;
+            const flt_t A3 =  1.421413741;
+            const flt_t A4 = -1.453152027;
+            const flt_t A5 =  1.061405429;
+            const flt_t EWALD_F = 1.12837917;
+            const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+            const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+            const flt_t grij = g_ewald * r;
+            const flt_t expm2 = exp(-grij * grij);
+            const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+            const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+            forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+            if (EFLAG) ecoul = prefactor * erfc;
+
+            const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+              prefactor;
+            forcecoul -= adjust;
+            if (EFLAG) ecoul -= adjust;
 
           #ifdef INTEL_ALLOW_TABLE
-	  } else {
-	    float rsq_lookup = rsq;
-	    const int itable = (__intel_castf32_u32(rsq_lookup) &
-	                        ncoulmask) >> ncoulshiftbits;
-	    const flt_t fraction = (rsq_lookup - table[itable].r) *
-	      table[itable].dr;
-
-	    const flt_t tablet = table[itable].f +
-	      fraction * table[itable].df;
-	    forcecoul = qtmp * q[j] * tablet;
-	    if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
-	                       fraction * detable[itable]);
-	    if (sbindex) {
-   	      const flt_t table2 = ctable[itable] +
-		fraction * dctable[itable];
-	      const flt_t prefactor = qtmp * q[j] * table2;
-	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
-		prefactor;
-	      forcecoul -= adjust;
-	      if (EFLAG) ecoul -= adjust;
-	    }
+          } else {
+            float rsq_lookup = rsq;
+            const int itable = (__intel_castf32_u32(rsq_lookup) &
+                                ncoulmask) >> ncoulshiftbits;
+            const flt_t fraction = (rsq_lookup - table[itable].r) *
+              table[itable].dr;
+
+            const flt_t tablet = table[itable].f +
+              fraction * table[itable].df;
+            forcecoul = qtmp * q[j] * tablet;
+            if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                               fraction * detable[itable]);
+            if (sbindex) {
+              const flt_t table2 = ctable[itable] +
+                fraction * dctable[itable];
+              const flt_t prefactor = qtmp * q[j] * table2;
+              const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
           }
           #endif
 
-	  #ifdef INTEL_VMASK
-	  if (rsq < cut_ljsq) {
-	  #endif
+          #ifdef INTEL_VMASK
+          if (rsq < cut_ljsq) {
+          #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
             if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
 
-	    #ifdef INTEL_VMASK
-	    if (rsq > cut_lj_innersq) {
-	    #endif
+            #ifdef INTEL_VMASK
+            if (rsq > cut_lj_innersq) {
+            #endif
               const flt_t drsq = cut_ljsq - rsq;
               const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
               const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
                   inv_denom_lj;
               const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
               if (EFLAG) {
-		#ifndef INTEL_VMASK
-		if (rsq > cut_lj_innersq) {
-		#endif
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq) {
+                #endif
                   forcelj = forcelj * switch1 + evdwl * switch2;
                   evdwl *= switch1;
-		#ifndef INTEL_VMASK
-		}
-		#endif
+                #ifndef INTEL_VMASK
+                }
+                #endif
               } else {
                 const flt_t philj = r6inv * (lji[jtype].z*r6inv -
                     lji[jtype].w);
-		#ifndef INTEL_VMASK
-		if (rsq > cut_lj_innersq)
-		#endif
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq)
+                #endif
                   forcelj =  forcelj * switch1 + philj * switch2;
               }
-	    #ifdef INTEL_VMASK
-	    }
-	    #endif
+            #ifdef INTEL_VMASK
+            }
+            #endif
 
             if (sbindex) {
               const flt_t factor_lj = special_lj[sbindex];
               forcelj *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
-	  #ifdef INTEL_VMASK
-	  }
-	  #else
-	  if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
-	  #endif
-
-	  const flt_t fpair = (forcecoul + forcelj) * r2inv;
-	  const flt_t fpx = fpair * tdelx[jj];
-	  fxtmp += fpx;
-	  if (NEWTON_PAIR) f[j].x -= fpx;
-	  const flt_t fpy = fpair * tdely[jj];
-	  fytmp += fpy;
-	  if (NEWTON_PAIR) f[j].y -= fpy;
-	  const flt_t fpz = fpair * tdelz[jj];
-	  fztmp += fpz;
-	  if (NEWTON_PAIR) f[j].z -= fpz;
-
-	  if (EFLAG) {
-	    sevdwl += evdwl;
-	    secoul += ecoul;
-	    if (eatom) {
-	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-	      if (NEWTON_PAIR)
-		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-	    }
-	  }
-	  if (NEWTON_PAIR == 0)
-	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
-				  fpx, fpy, fpz);
+          #ifdef INTEL_VMASK
+          }
+          #else
+          if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+          #endif
+
+          const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+          if (EFLAG) {
+            sevdwl += evdwl;
+            secoul += ecoul;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+            }
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
         } // for jj
         if (NEWTON_PAIR) {
           f[i].x += fxtmp;
@@ -449,33 +449,33 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
           f[i].y = fytmp;
           f[i].z = fztmp;
         }
-	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) {
-	oevdwl *= (acc_t)0.5;
-	oecoul *= (acc_t)0.5;
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
       }
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -556,7 +556,7 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   double cut, cutneigh;
   if (cut_lj > cut_coul)
     error->all(FLERR,
-	 "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
+         "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
   for (int i = 1; i <= atom->ntypes; i++) {
     for (int j = i; j <= atom->ntypes; j++) {
       if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
@@ -637,7 +637,7 @@ template <class flt_t>
 void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                               const int ntable,
                                                               Memory *memory,
-							      const int cop) {
+                                                              const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -653,12 +653,12 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           otable != NULL && oetable != NULL && odetable != NULL &&
           octable != NULL && odctable != NULL && ospecial_coul != NULL &&
-	  cop >= 0) {
+          cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
-	  nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
-	  nocopy(otable: alloc_if(0) free_if(1)) \
-	  nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
+          nocopy(otable: alloc_if(0) free_if(1)) \
+          nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
 
@@ -694,7 +694,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
           otable !=NULL && oetable != NULL && odetable != NULL &&
           octable != NULL && odctable != NULL && ospecial_coul != NULL &&
-	  cop >= 0) {
+          cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
           nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
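
[The constants A1..A5 and INV_EWALD_P reindented above are the Abramowitz & Stegun 7.1.26 rational approximation of erfc(x); note that INV_EWALD_P/(INV_EWALD_P + grij) equals the textbook 1/(1 + p*x) with p = 0.3275911. A standalone check of that approximation against std::erfc -- a hypothetical driver, not LAMMPS code.]

// sketch only -- A&S 7.1.26 erfc approximation as used in the hunk above
#include <cmath>
#include <cstdio>

static double erfc_approx(double x) {
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741;
  const double A4 = -1.453152027, A5 =  1.061405429;
  const double INV_EWALD_P = 1.0 / 0.3275911;
  const double t = INV_EWALD_P / (INV_EWALD_P + x);   // = 1/(1 + p*x)
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}

int main() {
  for (double x = 0.0; x <= 2.0; x += 0.5)            // ~1e-7 absolute error
    std::printf("x=%.1f  approx=%.7f  std=%.7f\n",
                x, erfc_approx(x), std::erfc(x));
  return 0;
}
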
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
index cafc412a91..1b13d78497 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -50,8 +50,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
                const ForceConst<flt_t> &fc);
   template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
@@ -75,7 +75,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
     ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
 
     void set_ntypes(const int ntypes, const int ntable, Memory *memory,
-		    const int cop);
+                    const int cop);
 
    private:
     int _ntypes, _ntable, _cop;
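
[The switch1 expression reindented in the lj/charmm/coul/long hunks is the CHARMM switching polynomial S = (rc^2-r^2)^2 (rc^2 + 2 r^2 - 3 rin^2) / (rc^2 - rin^2)^3, written in terms of drsq and cut2; it tapers the LJ term smoothly from 1 at the inner cutoff to 0 at the outer one. A quick sketch evaluating it at the endpoints; the cutoff values are illustrative.]

// sketch only -- CHARMM switching polynomial from the hunk above
#include <cstdio>

int main() {
  const double cut_lj_innersq = 8.0 * 8.0;    // rin^2 (illustrative)
  const double cut_ljsq = 10.0 * 10.0;        // rc^2  (illustrative)
  const double denom = cut_ljsq - cut_lj_innersq;
  const double inv_denom_lj = 1.0 / (denom * denom * denom);

  for (double rsq : {cut_lj_innersq, 81.0, cut_ljsq}) {
    const double drsq = cut_ljsq - rsq;
    const double cut2 = (rsq - cut_lj_innersq) * drsq;
    const double switch1 = drsq * (drsq * drsq + 3.0 * cut2) * inv_denom_lj;
    std::printf("rsq=%6.1f  switch1=%.6f\n", rsq, switch1); // 1 ... 0
  }
  return 0;
}
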
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index 8a0bed2c01..e9775d6ec5 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -68,8 +68,8 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
 
 template <class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc)
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -92,7 +92,7 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-				packthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
@@ -124,9 +124,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
 
 template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
-				  IntelBuffers<flt_t,acc_t> *buffers,
-				  const ForceConst<flt_t> &fc,
-				  const int astart, const int aend)
+                                  IntelBuffers<flt_t,acc_t> *buffers,
+                                  const ForceConst<flt_t> &fc,
+                                  const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -171,8 +171,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -208,7 +208,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
     in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
-    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)	\
+    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)    \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -220,7 +220,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
-			      f_stride, x, q);
+                              f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
@@ -261,18 +261,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         const int jnum = numneigh[i];
 
         acc_t fxtmp,fytmp,fztmp,fwtmp;
-	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
 
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
-	if (NEWTON_PAIR == 0)
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
-	int ej = 0;
+        int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
         #pragma ivdep
@@ -282,91 +282,91 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-	  const int jtype = x[j].w;
+          const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
-	  if (rsq < c_forcei[jtype].cutsq) {
-	    trsq[ej]=rsq;
-	    tdelx[ej]=delx;
-	    tdely[ej]=dely;
-	    tdelz[ej]=delz;
-	    tjtype[ej]=jtype;
-	    tj[ej]=jlist[jj];
-	    ej++;
-	  }
-	}
+          if (rsq < c_forcei[jtype].cutsq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=jtype;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
 
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
+        #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-		                 sv0, sv1, sv2, sv3, sv4, sv5)
+                                 sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < ej; jj++) {
           flt_t forcecoul, forcelj, evdwl, ecoul;
           forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
 
-	  const int j = tj[jj] & NEIGHMASK;
+          const int j = tj[jj] & NEIGHMASK;
           const int sbindex = tj[jj] >> SBBITS & 3;
-	  const int jtype = tjtype[jj];
-	  const flt_t rsq = trsq[jj];
+          const int jtype = tjtype[jj];
+          const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
           #ifdef INTEL_ALLOW_TABLE
-	  if (!ncoultablebits || rsq <= tabinnersq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
-	    const flt_t A1 =  0.254829592;
-	    const flt_t A2 = -0.284496736;
-	    const flt_t A3 =  1.421413741;
-	    const flt_t A4 = -1.453152027;
-	    const flt_t A5 =  1.061405429;
-	    const flt_t EWALD_F = 1.12837917;
-	    const flt_t INV_EWALD_P = 1.0 / 0.3275911;
-	    
-	    const flt_t r = (flt_t)1.0 / sqrt(r2inv);
-	    const flt_t grij = g_ewald * r;
-	    const flt_t expm2 = exp(-grij * grij);
-	    const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
-	    const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-	    const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
-	    forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
-	    if (EFLAG) ecoul = prefactor * erfc;
-	    
-	    const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
-	      prefactor;
-	    forcecoul -= adjust;
-	    if (EFLAG) ecoul -= adjust;
+            const flt_t A1 =  0.254829592;
+            const flt_t A2 = -0.284496736;
+            const flt_t A3 =  1.421413741;
+            const flt_t A4 = -1.453152027;
+            const flt_t A5 =  1.061405429;
+            const flt_t EWALD_F = 1.12837917;
+            const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+            const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+            const flt_t grij = g_ewald * r;
+            const flt_t expm2 = exp(-grij * grij);
+            const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+            const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+            forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+            if (EFLAG) ecoul = prefactor * erfc;
+
+            const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+              prefactor;
+            forcecoul -= adjust;
+            if (EFLAG) ecoul -= adjust;
 
           #ifdef INTEL_ALLOW_TABLE
           } else {
-	    float rsq_lookup = rsq;
-	    const int itable = (__intel_castf32_u32(rsq_lookup) &
-				ncoulmask) >> ncoulshiftbits;
-	    const flt_t fraction = (rsq_lookup - table[itable].r) *
-	      table[itable].dr;
-
-	    const flt_t tablet = table[itable].f +
-	      fraction * table[itable].df;
-	    forcecoul = qtmp * q[j] * tablet;
-	    if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
-					      fraction * detable[itable]);
-	    if (sbindex) {
-	      const flt_t table2 = ctable[itable] +
-		fraction * dctable[itable];
-	      const flt_t prefactor = qtmp * q[j] * table2;
-	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
-		prefactor;
-	      forcecoul -= adjust;
-	      if (EFLAG) ecoul -= adjust;
-	    }
-	  }
+            float rsq_lookup = rsq;
+            const int itable = (__intel_castf32_u32(rsq_lookup) &
+                                ncoulmask) >> ncoulshiftbits;
+            const flt_t fraction = (rsq_lookup - table[itable].r) *
+              table[itable].dr;
+
+            const flt_t tablet = table[itable].f +
+              fraction * table[itable].df;
+            forcecoul = qtmp * q[j] * tablet;
+            if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                                              fraction * detable[itable]);
+            if (sbindex) {
+              const flt_t table2 = ctable[itable] +
+                fraction * dctable[itable];
+              const flt_t prefactor = qtmp * q[j] * table2;
+              const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          }
           #endif
 
-	  #ifdef INTEL_VMASK
-	  if (rsq < c_forcei[jtype].cut_ljsq) {
-	  #endif
+          #ifdef INTEL_VMASK
+          if (rsq < c_forcei[jtype].cut_ljsq) {
+          #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
             forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
-			       c_forcei[jtype].lj2);
+                               c_forcei[jtype].lj2);
             if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
                                       c_energyi[jtype].lj4) -
                                c_energyi[jtype].offset;
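The real-space Coulomb branch in the hunk above evaluates erfc via the Abramowitz & Stegun 7.1.26 polynomial instead of a library call. A minimal standalone sketch of that approximation, reusing the constants that appear in the hunk (note INV_EWALD_P is 1/p, so t = 1/(1 + p*x) as written here; everything else is illustrative):

#include <cmath>
#include <cstdio>

// Sketch of the inlined erfc approximation: A&S 7.1.26 polynomial with the
// same constants A1..A5 and p = 0.3275911 used in the pair kernel above.
static double erfc_approx(double x) {
  const double A1 =  0.254829592;
  const double A2 = -0.284496736;
  const double A3 =  1.421413741;
  const double A4 = -1.453152027;
  const double A5 =  1.061405429;
  const double t = 1.0 / (1.0 + 0.3275911 * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}

int main() {
  for (double x = 0.25; x <= 2.0; x += 0.25)   // compare against libm erfc
    printf("x=%4.2f approx=%.7f erfc=%.7f\n", x, erfc_approx(x), std::erfc(x));
  return 0;
}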
@@ -376,14 +376,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
               forcelj *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
-	  #ifdef INTEL_VMASK
-	  }
-	  #else
-	  if (rsq > c_forcei[jtype].cut_ljsq)
-	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
-	  #endif
-
-	  const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          #ifdef INTEL_VMASK
+          }
+          #else
+          if (rsq > c_forcei[jtype].cut_ljsq)
+            { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+          #endif
+
+          const flt_t fpair = (forcecoul + forcelj) * r2inv;
           const flt_t fpx = fpair * tdelx[jj];
           fxtmp += fpx;
           if (NEWTON_PAIR) f[j].x -= fpx;
@@ -394,58 +394,58 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
           fztmp += fpz;
           if (NEWTON_PAIR) f[j].z -= fpz;
 
-	  if (EFLAG) {
-	    sevdwl += evdwl;
-	    secoul += ecoul;
-	    if (eatom) {
-	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-	      if (NEWTON_PAIR)
-		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+          if (EFLAG) {
+            sevdwl += evdwl;
+            secoul += ecoul;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
-          } 
-	  if (NEWTON_PAIR == 0)
- 	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], 
-				  fpx, fpy, fpz);
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
         } // for jj
 
-	if (NEWTON_PAIR) {
+        if (NEWTON_PAIR) {
           f[i].x += fxtmp;
-	  f[i].y += fytmp;
-	  f[i].z += fztmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
         } else {
           f[i].x = fxtmp;
-	  f[i].y = fytmp;
-	  f[i].z = fztmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
         }
 
-	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) {
-	oevdwl *= (acc_t)0.5;
-	oecoul *= (acc_t)0.5;
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
       }
       ev_global[0] = oevdwl;
       ev_global[1] = oecoul;
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
-      }   
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
       ev_global[4] = ov2;
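The 0.5 scaling just above compensates for double counting: with NEWTON_PAIR == 0 each local pair is visited from both of its atoms, so accumulated energies and virials come out twice the physical value. A toy sketch with made-up numbers:

#include <cstdio>

// Sketch of the NEWTON_PAIR == 0 correction: the pair is tallied twice
// (once per atom), then halved before being stored in ev_global.
int main() {
  const double e_pair = 1.5;   // illustrative pair energy
  double acc = 0.0;
  acc += e_pair;               // visited from atom i
  acc += e_pair;               // visited again from atom j
  acc *= 0.5;                  // NEWTON_PAIR == 0 correction
  printf("accumulated %.2f, physical %.2f\n", acc, e_pair);
  return 0;
}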
@@ -547,8 +547,8 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
       if (cutsq[i][j] < cut_ljsq[i][j])
-	error->all(FLERR,
-	 "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
+        error->all(FLERR,
+         "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].lj1 = lj1[i][j];
@@ -598,9 +598,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 
 template <class flt_t>
 void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
-							   const int ntable,
-							   Memory *memory,
-							   const int cop) {
+                                                           const int ntable,
+                                                           Memory *memory,
+                                                           const int cop) {
   if ( (ntypes != _ntypes || ntable != _ntable) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -619,9 +619,9 @@ void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
           ospecial_coul != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
-	  nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
-	  nocopy(otable: alloc_if(0) free_if(1)) \
-	  nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+          nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
+          nocopy(otable: alloc_if(0) free_if(1)) \
+          nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
       }
       #endif
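The tabulated branch earlier in this file indexes its force table by masking and shifting the raw bits of the squared distance, which bins rsq geometrically (equal ratios map to equal index strides). A portable sketch of that trick; the mask and shift values here are illustrative assumptions, whereas LAMMPS derives the real ones from the configured number of table bits:

#include <cstdint>
#include <cstring>
#include <cstdio>

// Portable stand-in for __intel_castf32_u32 as used in the table lookup.
static uint32_t castf32_u32(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof u);   // reinterpret the float bit pattern
  return u;
}

int main() {
  const uint32_t ncoulmask = 0x7fffffffu;  // drop the sign bit (illustrative)
  const int ncoulshiftbits = 19;           // keep exponent + 4 mantissa bits
  const float samples[] = {0.25f, 0.5f, 1.0f, 2.0f, 4.0f, 8.0f};
  for (float rsq : samples)                // each doubling advances index by 16
    printf("rsq=%5.2f -> itable=%u\n", rsq,
           (castf32_u32(rsq) & ncoulmask) >> ncoulshiftbits);
  return 0;
}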
 
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
index 2b7d87c040..288a6a7bc4 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@@ -50,8 +50,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
                const ForceConst<flt_t> &fc);
   template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
@@ -76,7 +76,7 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
     ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
 
     void set_ntypes(const int ntypes, const int ntable, Memory *memory,
-		    const int cop);
+                    const int cop);
 
    private:
     int _ntypes, _ntable, _cop;
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index 8620646343..4871821842 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -96,37 +96,37 @@ void PairLJCutIntel::compute(int eflag, int vflag,
   if (_onetype) {
     if (eflag) {
       if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
     if (eflag) {
       if (force->newton_pair) {
-	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
@@ -161,8 +161,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -176,7 +176,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
     #endif
 
     IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
-			      f_stride, x, 0);
+                              f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = (acc_t)0;
@@ -200,23 +200,23 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 
       flt_t cutsq, lj1, lj2, lj3, lj4, offset;
       if (ONETYPE) {
-	cutsq = ljc12o[3].cutsq;
-	lj1 = ljc12o[3].lj1;
-	lj2 = ljc12o[3].lj2;
-	lj3 = lj34[3].lj3;
-	lj4 = lj34[3].lj4;
-	offset = ljc12o[3].offset;
+        cutsq = ljc12o[3].cutsq;
+        lj1 = ljc12o[3].lj1;
+        lj2 = ljc12o[3].lj2;
+        lj3 = lj34[3].lj3;
+        lj4 = lj34[3].lj4;
+        offset = ljc12o[3].offset;
       }
       for (int i = iifrom; i < iito; i += iip) {
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias ljc12oi;
         const FC_PACKED2_T * _noalias lj34i;
-	if (!ONETYPE) {
-	  itype = x[i].w;
+        if (!ONETYPE) {
+          itype = x[i].w;
           ptr_off = itype * ntypes;
           ljc12oi = ljc12o + ptr_off;
           lj34i = lj34 + ptr_off;
-	}
+        }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
@@ -228,113 +228,113 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-	if (NEWTON_PAIR == 0)
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
         #endif
         for (int jj = 0; jj < jnum; jj++) {
           flt_t forcelj, evdwl;
           forcelj = evdwl = (flt_t)0.0;
 
-	  int j, jtype, sbindex;
-	  if (!ONETYPE) {
-	    sbindex = jlist[jj] >> SBBITS & 3;
-	    j = jlist[jj] & NEIGHMASK;
-	  } else
-	    j = jlist[jj];
+          int j, jtype, sbindex;
+          if (!ONETYPE) {
+            sbindex = jlist[jj] >> SBBITS & 3;
+            j = jlist[jj] & NEIGHMASK;
+          } else
+            j = jlist[jj];
 
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
           if (!ONETYPE) {
-	    jtype = x[j].w;
+            jtype = x[j].w;
             cutsq = ljc12oi[jtype].cutsq;
-	  }
+          }
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           #ifdef INTEL_VMASK
           if (rsq < cutsq) {
-	  #endif
+          #endif
             flt_t factor_lj;
-	    if (!ONETYPE) factor_lj = special_lj[sbindex];
+            if (!ONETYPE) factor_lj = special_lj[sbindex];
             flt_t r2inv = 1.0 / rsq;
             flt_t r6inv = r2inv * r2inv * r2inv;
             #ifndef INTEL_VMASK
-	    if (rsq > cutsq) r6inv = (flt_t)0.0;
-	    #endif
-	    if (!ONETYPE) {
-	      lj1 = ljc12oi[jtype].lj1;
-	      lj2 = ljc12oi[jtype].lj2;
-	    }
+            if (rsq > cutsq) r6inv = (flt_t)0.0;
+            #endif
+            if (!ONETYPE) {
+              lj1 = ljc12oi[jtype].lj1;
+              lj2 = ljc12oi[jtype].lj2;
+            }
             forcelj = r6inv * (lj1 * r6inv - lj2);
             flt_t fpair;
-	    if (!ONETYPE)
-	      fpair = factor_lj * forcelj * r2inv;
-	    else
-	      fpair = forcelj * r2inv;
-
-	    const flt_t fpx = fpair * delx;
-	    fxtmp += fpx;
-	    if (NEWTON_PAIR) f[j].x -= fpx;
-	    const flt_t fpy = fpair * dely;
-	    fytmp += fpy;
-	    if (NEWTON_PAIR) f[j].y -= fpy;
-	    const flt_t fpz = fpair * delz;
-	    fztmp += fpz;
-	    if (NEWTON_PAIR) f[j].z -= fpz;
+            if (!ONETYPE)
+              fpair = factor_lj * forcelj * r2inv;
+            else
+              fpair = forcelj * r2inv;
+
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
 
             if (EFLAG) {
-	      if (!ONETYPE) {
-		lj3 = lj34i[jtype].lj3;
-		lj4 = lj34i[jtype].lj4;
-		offset = ljc12oi[jtype].offset;
-	      }
-	      evdwl = r6inv * (lj3 * r6inv - lj4);
+              if (!ONETYPE) {
+                lj3 = lj34i[jtype].lj3;
+                lj4 = lj34i[jtype].lj4;
+                offset = ljc12oi[jtype].offset;
+              }
+              evdwl = r6inv * (lj3 * r6inv - lj4);
               #ifdef INTEL_VMASK
-	      evdwl -= offset;
+              evdwl -= offset;
               #else
-	      if (rsq < cutsq) evdwl -= offset;
+              if (rsq < cutsq) evdwl -= offset;
               #endif
-	      if (!ONETYPE) evdwl *= factor_lj;
-	      sevdwl += evdwl;
-	      if (eatom) {
+              if (!ONETYPE) evdwl *= factor_lj;
+              sevdwl += evdwl;
+              if (eatom) {
                 fwtmp += (flt_t)0.5 * evdwl;
                 if (NEWTON_PAIR)
-		  f[j].w += (flt_t)0.5 * evdwl;
+                  f[j].w += (flt_t)0.5 * evdwl;
               }
-	    }
+            }
 
-	    if (NEWTON_PAIR == 0)
-	      IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
+            if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           } // if rsq
           #endif
         } // for jj
-	if (NEWTON_PAIR) {
-	  f[i].x += fxtmp;
-	  f[i].y += fytmp;
-	  f[i].z += fztmp;
-	} else {
-	  f[i].x = fxtmp;
-	  f[i].y = fytmp;
-	  f[i].z = fztmp;
-	}
-
-	IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
       IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end omp
 
     IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
@@ -343,12 +343,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
     }
     if (vflag) {
       if (NEWTON_PAIR == 0) {
-	ov0 *= (acc_t)0.5;
-	ov1 *= (acc_t)0.5;
-	ov2 *= (acc_t)0.5;
-	ov3 *= (acc_t)0.5;
-	ov4 *= (acc_t)0.5;
-	ov5 *= (acc_t)0.5;
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
       }
       ev_global[2] = ov0;
       ev_global[3] = ov1;
@@ -454,7 +454,7 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
 template <class flt_t>
 void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
                                                    Memory *memory,
-						   const int cop) {
+                                                   const int cop) {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
       fc_packed1 *oljc12o = ljc12o[0];
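The eval<...> ladder near the top of this file converts the runtime eflag/newton settings into template parameters once per call, so the hot loop is compiled with those branches folded away as constants. A minimal sketch of the pattern (names illustrative, not the LAMMPS API):

#include <cstdio>

// Runtime flags are mapped once onto template parameters; inside the loop
// the EFLAG/NEWTON_PAIR tests are compile-time constants the optimizer removes.
template <int EFLAG, int NEWTON_PAIR>
static double eval_sketch(int n) {
  double e = 0.0;
  for (int i = 0; i < n; i++) {
    double f = 1.0 / (1.0 + i);     // stand-in for the pair force
    if (EFLAG) e += f;              // folded away when EFLAG == 0
    if (NEWTON_PAIR) e += 0.5 * f;  // likewise a constant branch
  }
  return e;
}

int main() {
  int eflag = 1, newton_pair = 0;   // runtime settings
  double e;
  if (eflag) {
    if (newton_pair) e = eval_sketch<1,1>(8);
    else             e = eval_sketch<1,0>(8);
  } else {
    if (newton_pair) e = eval_sketch<0,1>(8);
    else             e = eval_sketch<0,0>(8);
  }
  printf("e = %g\n", e);
  return 0;
}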
diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
index 99c7045098..86929d41ea 100644
--- a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
@@ -1,50 +1,50 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: William McDoniel (RWTH Aachen University)
-------------------------------------------------------------------------- */
-
-#include <math.h>
-#include "pair_lj_long_coul_long_intel.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "group.h"
-#include "kspace.h"
-#include "memory.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "memory.h"
-#include "suffix.h"
-
-
-using namespace LAMMPS_NS;
-
-#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
-#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
-#define TABLE_T typename ForceConst<flt_t>::table_t
-
-PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
-  PairLJLongCoulLong(lmp)
-{
-  suffix_flag |= Suffix::INTEL;
-  respa_enable = 0;
-  cut_respa = NULL;
-}
-
-
-PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
-{
-}
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_long_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+
+
+using namespace LAMMPS_NS;
+
+#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
+#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
+  PairLJLongCoulLong(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+
+PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
+{
+}
diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.h b/src/USER-INTEL/pair_lj_long_coul_long_intel.h
index 42eef932ec..b7d3504ecd 100644
--- a/src/USER-INTEL/pair_lj_long_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.h
@@ -1,39 +1,39 @@
-/* -*- c++ -*- -----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: William McDoniel (RWTH Aachen University)
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
-
-#else
-
-#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
-#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
-
-#include "pair_lj_long_coul_long.h"
-#include "fix_intel.h"
-
-namespace LAMMPS_NS {
-  class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
-  public:
-    PairLJLongCoulLongIntel(class LAMMPS *);
-    virtual ~PairLJLongCoulLongIntel();
-
-  };
-}
-#endif
-#endif
+/* -*- c++ -*- -----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+
+#include "pair_lj_long_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+  class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
+  public:
+    PairLJLongCoulLongIntel(class LAMMPS *);
+    virtual ~PairLJLongCoulLongIntel();
+
+  };
+}
+#endif
+#endif
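The PAIR_CLASS/PairStyle block above follows the usual LAMMPS two-pass header idiom: compiled once with PAIR_CLASS defined, the header contributes only its style-registration line; compiled without it, it declares the class. A self-contained sketch of the idea, with a hypothetical demo/intel style standing in for the real style registry:

#include <cstdio>
#include <cstring>

// Sketch only: PairStyle(key,Class) expands, inside a factory, into a
// string-compare branch that constructs the matching class.
struct Pair { virtual ~Pair() {} };
struct PairDemoIntel : Pair {};

static Pair *pair_factory(const char *style) {
  if (0) return 0;
#define PairStyle(key, Class) \
  else if (strcmp(style, #key) == 0) return new Class();
  PairStyle(demo/intel, PairDemoIntel)   // stands in for the included headers
#undef PairStyle
  return 0;
}

int main() {
  Pair *p = pair_factory("demo/intel");
  printf("created: %s\n", p ? "yes" : "no");
  delete p;
  return 0;
}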
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 835f78664a..7a6b7afd92 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -77,7 +77,7 @@ void PairSWIntel::compute(int eflag, int vflag)
 {
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
-			  force_const_single);
+                          force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
@@ -131,37 +131,37 @@ void PairSWIntel::compute(int eflag, int vflag,
   if (_onetype) {
     if (_spq) {
       if (eflag) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
       if (eflag) {
-	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   } else {
     if (_spq) {
       if (eflag) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
       if (eflag) {
-	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   }
@@ -174,7 +174,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
-		       const int aend, const int pad_width)
+                       const int aend, const int pad_width)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -278,23 +278,23 @@ void PairSWIntel::eval(const int offload, const int vflag,
       flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       if (ONETYPE) {
         cutsq = p2[3].cutsq;
-	cut = p2f[3].cut;
-	sigma = p2f[3].sigma;
-	c1 = p2f2[3].c1;
-	c2 = p2f2[3].c2;
-	c3 = p2f2[3].c3;
-	c4 = p2f2[3].c4;
-	sigma_gamma = p2[3].sigma_gamma;
-	costheta = p3[7].costheta;
-	lambda_epsilon = p3[7].lambda_epsilon;
-	lambda_epsilon2 = p3[7].lambda_epsilon2;
-	if (SPQ == 0) {
+        cut = p2f[3].cut;
+        sigma = p2f[3].sigma;
+        c1 = p2f2[3].c1;
+        c2 = p2f2[3].c2;
+        c3 = p2f2[3].c3;
+        c4 = p2f2[3].c4;
+        sigma_gamma = p2[3].sigma_gamma;
+        costheta = p3[7].costheta;
+        lambda_epsilon = p3[7].lambda_epsilon;
+        lambda_epsilon2 = p3[7].lambda_epsilon2;
+        if (SPQ == 0) {
           powerp = p2f[3].powerp;
-	  powerq = p2f[3].powerq;
+          powerq = p2f[3].powerq;
         }
-	if (EFLAG) {
+        if (EFLAG) {
           c5 = p2e[3].c5;
-	  c6 = p2e[3].c6;
+          c6 = p2e[3].c6;
         }
       }
 
@@ -304,23 +304,23 @@ void PairSWIntel::eval(const int offload, const int vflag,
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
 
-	if (!ONETYPE) {
+        if (!ONETYPE) {
           itype = x[i].w;
-	  itype_offset = itype * ntypes;
-        } 
+          itype_offset = itype * ntypes;
+        }
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
-	const int jnumhalf = numneighhalf[i];
+        const int jnumhalf = numneighhalf[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
         acc_t sevdwl;
         fxtmp = fytmp = fztmp = (acc_t)0.0;
-	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
 
-	int ejnum = 0, ejnumhalf = 0;
-	#pragma vector aligned
-	#pragma ivdep
+        int ejnum = 0, ejnumhalf = 0;
+        #pragma vector aligned
+        #pragma ivdep
         for (int jj = 0; jj < jnum; jj++) {
           int j = jlist[jj];
           j &= NEIGHMASK;
@@ -329,115 +329,115 @@ void PairSWIntel::eval(const int offload, const int vflag,
           const flt_t delz = x[j].z - ztmp;
           int jtype, ijtype;
           if (!ONETYPE) {
-	    jtype = x[j].w;
-	    ijtype = itype_offset + jtype;
-	    cutsq = p2[ijtype].cutsq;
-	  } 
+            jtype = x[j].w;
+            ijtype = itype_offset + jtype;
+            cutsq = p2[ijtype].cutsq;
+          }
           const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
           if (rsq1 < cutsq) {
-	    tdelx[ejnum] = delx;
-	    tdely[ejnum] = dely;
-	    tdelz[ejnum] = delz;
-	    trsq[ejnum] = rsq1;
-	    tj[ejnum] = j;
-	    if (!ONETYPE) tjtype[ejnum] = jtype;
-	    ejnum++;
-	    if (jj < jnumhalf) ejnumhalf++;
-	  }
-	}
-	int ejnum_pad = ejnum;
-	
-	while ( (ejnum_pad % pad_width) != 0) {
-	  tdelx[ejnum_pad] = (flt_t)0.0;
-	  tdely[ejnum_pad] = (flt_t)0.0;
-	  tdelz[ejnum_pad] = (flt_t)0.0;
-	  trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
-	  tj[ejnum_pad] = nall;
-	  if (!ONETYPE) tjtype[ejnum_pad] = 0;
-	  ejnum_pad++;
-	}
-	
+            tdelx[ejnum] = delx;
+            tdely[ejnum] = dely;
+            tdelz[ejnum] = delz;
+            trsq[ejnum] = rsq1;
+            tj[ejnum] = j;
+            if (!ONETYPE) tjtype[ejnum] = jtype;
+            ejnum++;
+            if (jj < jnumhalf) ejnumhalf++;
+          }
+        }
+        int ejnum_pad = ejnum;
+
+        while ( (ejnum_pad % pad_width) != 0) {
+          tdelx[ejnum_pad] = (flt_t)0.0;
+          tdely[ejnum_pad] = (flt_t)0.0;
+          tdelz[ejnum_pad] = (flt_t)0.0;
+          trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
+          tj[ejnum_pad] = nall;
+          if (!ONETYPE) tjtype[ejnum_pad] = 0;
+          ejnum_pad++;
+        }
+
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
+        #pragma vector aligned
         #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
-	#endif
+        #endif
         for (int jj = 0; jj < ejnum_pad; jj++) {
           acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
           fjxtmp = fjytmp = fjztmp = (acc_t)0.0;
           if (EFLAG) fjtmp = (acc_t)0.0;
-	  int ijtype;
+          int ijtype;
 
-	  if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
+          if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
           const flt_t rsq1 = trsq[jj];
 
           const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
           const flt_t r1 = (flt_t)1.0/sqrt(rinvsq1);
-	  if (!ONETYPE) cut = p2f[ijtype].cut;
+          if (!ONETYPE) cut = p2f[ijtype].cut;
           const flt_t rainv1 = (flt_t)1.0 / (r1 - cut);
-	  
-	  // two-body interactions, skip half of them
-	  flt_t rp, rq;
-	  if (SPQ == 1) {
-	    rp = r1 * r1;
-	    rp *= rp;
-	    rp = (flt_t)1.0 / rp;
-	    rq = (flt_t)1.0;
-	  } else {
+
+          // two-body interactions, skip half of them
+          flt_t rp, rq;
+          if (SPQ == 1) {
+            rp = r1 * r1;
+            rp *= rp;
+            rp = (flt_t)1.0 / rp;
+            rq = (flt_t)1.0;
+          } else {
             if (!ONETYPE) {
               powerp = p2f[ijtype].powerp;
-	      powerq = p2f[ijtype].powerq;
+              powerq = p2f[ijtype].powerq;
             }
-	    rp = std::pow(r1, powerp);
-	    rq = std::pow(r1, powerq);
-	  }
+            rp = std::pow(r1, powerp);
+            rq = std::pow(r1, powerq);
+          }
 
-	  if (!ONETYPE) {
+          if (!ONETYPE) {
             sigma = p2f[ijtype].sigma;
-	    c1 = p2f2[ijtype].c1;
-	    c2 = p2f2[ijtype].c2;
-	    c3 = p2f2[ijtype].c3; 
-	    c4 = p2f2[ijtype].c4;
+            c1 = p2f2[ijtype].c1;
+            c2 = p2f2[ijtype].c2;
+            c3 = p2f2[ijtype].c3;
+            c4 = p2f2[ijtype].c4;
           }
 
-	  const flt_t rainvsq = rainv1 * rainv1 * r1;
-	  flt_t expsrainv = exp(sigma * rainv1);
-	  if (jj >= ejnumhalf) expsrainv = (flt_t)0.0;
-	  const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * 
-			       rainvsq) * expsrainv * rinvsq1;
-
-	  const flt_t delx = tdelx[jj];
-	  const flt_t dely = tdely[jj];
-	  const flt_t delz = tdelz[jj];
-	  const flt_t fpx = fpair * delx;
-	  fxtmp -= fpx;
-	  fjxtmp += fpx;
-	  const flt_t fpy = fpair * dely;
-	  fytmp -= fpy;
-	  fjytmp += fpy;
-	  const flt_t fpz = fpair * delz;
-	  fztmp -= fpz;
-	  fjztmp += fpz;
-
-	  if (EFLAG) {
-	    flt_t evdwl;
-	    if (!ONETYPE) {
-	      c5 = p2e[ijtype].c5;
-	      c6 = p2e[ijtype].c6;
+          const flt_t rainvsq = rainv1 * rainv1 * r1;
+          flt_t expsrainv = exp(sigma * rainv1);
+          if (jj >= ejnumhalf) expsrainv = (flt_t)0.0;
+          const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
+                               rainvsq) * expsrainv * rinvsq1;
+
+          const flt_t delx = tdelx[jj];
+          const flt_t dely = tdely[jj];
+          const flt_t delz = tdelz[jj];
+          const flt_t fpx = fpair * delx;
+          fxtmp -= fpx;
+          fjxtmp += fpx;
+          const flt_t fpy = fpair * dely;
+          fytmp -= fpy;
+          fjytmp += fpy;
+          const flt_t fpz = fpair * delz;
+          fztmp -= fpz;
+          fjztmp += fpz;
+
+          if (EFLAG) {
+            flt_t evdwl;
+            if (!ONETYPE) {
+              c5 = p2e[ijtype].c5;
+              c6 = p2e[ijtype].c6;
             }
-	    evdwl = (c5 * rp - c6 * rq) * expsrainv;
-	    sevdwl += evdwl;
-	    if (eatom) {
-	      fwtmp += (flt_t)0.5 * evdwl;
-	      fjtmp += (flt_t)0.5 * evdwl;
+            evdwl = (c5 * rp - c6 * rq) * expsrainv;
+            sevdwl += evdwl;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl;
+              fjtmp += (flt_t)0.5 * evdwl;
             }
-	  }
+          }
 
-	  /*---------------------------------------------*/
+          /*---------------------------------------------*/
 
-	  int ijkoff;
-	  if (!ONETYPE) {
+          int ijkoff;
+          if (!ONETYPE) {
             sigma_gamma = p2[ijtype].sigma_gamma;
-	    ijkoff = ijtype * ntypes;
+            ijkoff = ijtype * ntypes;
           }
 
           flt_t gsrainv1 = sigma_gamma * rainv1;
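The while loop in the hunk above pads the packed neighbor arrays up to a multiple of the vector width with sentinel entries that always fail the cutoff test, so the vectorized loop needs no scalar remainder handling. A small sketch of the same idiom, with illustrative values for pad_width, cutsq, and the sentinel index:

#include <cstdio>
#include <vector>

// Pad neighbor data to a multiple of the vector width; sentinel distances
// lie beyond the cutoff and the sentinel index points at a dummy atom slot.
int main() {
  const size_t pad_width = 8;
  const float cutsq = 4.0f;
  const int nall = 1000;                     // sentinel "ghost" index
  std::vector<float> trsq = {0.5f, 1.2f, 3.9f};
  std::vector<int> tj = {11, 42, 87};

  while (trsq.size() % pad_width != 0) {     // same test as the loop above
    trsq.push_back(cutsq + 1.0f);            // always fails the cutoff check
    tj.push_back(nall);                      // contributes zero force
  }
  for (size_t jj = 0; jj < trsq.size(); jj++)
    printf("jj=%zu j=%d rsq=%.1f %s\n", jj, tj[jj], trsq[jj],
           trsq[jj] < cutsq ? "real" : "pad");
  return 0;
}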
@@ -446,15 +446,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
           for (int kk = 0; kk < ejnum; kk++) {
             int iktype, ijktype;
-	    if (!ONETYPE) {
+            if (!ONETYPE) {
               iktype = tjtype[kk];
-	      ijktype = ijkoff + iktype;
-	      iktype += itype_offset;
-	      cut = p2[iktype].cut;
-	      sigma_gamma = p2[iktype].sigma_gamma;
-	      costheta = p3[ijktype].costheta;
-	      lambda_epsilon = p3[ijktype].lambda_epsilon;
-	      lambda_epsilon2 = p3[ijktype].lambda_epsilon2;
+              ijktype = ijkoff + iktype;
+              iktype += itype_offset;
+              cut = p2[iktype].cut;
+              sigma_gamma = p2[iktype].sigma_gamma;
+              costheta = p3[ijktype].costheta;
+              lambda_epsilon = p3[ijktype].lambda_epsilon;
+              lambda_epsilon2 = p3[ijktype].lambda_epsilon2;
             }
 
             flt_t delr2[3];
@@ -463,76 +463,76 @@ void PairSWIntel::eval(const int offload, const int vflag,
             delr2[2] = tdelz[kk];
             const flt_t rsq2 = trsq[kk];
 
-	    const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
-	    const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2);
-	    const flt_t rainv2 = (flt_t)1.0 / (r2 - cut);
-	    const flt_t gsrainv2 = sigma_gamma * rainv2;
-	    const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
-	    const flt_t expgsrainv2 = exp(gsrainv2);
+            const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
+            const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2);
+            const flt_t rainv2 = (flt_t)1.0 / (r2 - cut);
+            const flt_t gsrainv2 = sigma_gamma * rainv2;
+            const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
+            const flt_t expgsrainv2 = exp(gsrainv2);
 
-	    const flt_t rinv12 = (flt_t)1.0 / (r1 * r2);
-	    const flt_t cs = (delx * delr2[0] + dely * delr2[1] +
+            const flt_t rinv12 = (flt_t)1.0 / (r1 * r2);
+            const flt_t cs = (delx * delr2[0] + dely * delr2[1] +
                               delz * delr2[2]) * rinv12;
-	    const flt_t delcs = cs - costheta;
-	    const flt_t delcssq = delcs*delcs;
-
-	    flt_t kfactor;
-	    if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0;
-	    else kfactor = (flt_t)1.0;
-
-	    const flt_t facexp = expgsrainv1*expgsrainv2*kfactor;
-	    const flt_t facrad = lambda_epsilon * facexp * delcssq;
-	    const flt_t frad1 = facrad*gsrainvsq1;
-	    const flt_t frad2 = facrad*gsrainvsq2;
-	    const flt_t facang = lambda_epsilon2 * facexp * delcs;
-	    const flt_t facang12 = rinv12*facang;
-	    const flt_t csfacang = cs*facang;
-	    const flt_t csfac1 = rinvsq1*csfacang;
-
-	    const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12;
-	    const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12;
-	    const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12;
-
-	    fxtmp -= fjx;
-	    fytmp -= fjy;
-	    fztmp -= fjz;
-	    fjxtmp += fjx;
-	    fjytmp += fjy;
-	    fjztmp += fjz;
-
-	    if (EFLAG) {
-	      const flt_t evdwl = facrad * (flt_t)0.5;
-	      sevdwl += evdwl;
-	      if (eatom) {
-		fwtmp += (acc_t)0.33333333 * evdwl;
-		fjtmp += (acc_t)0.33333333 * facrad;
-	      }
-	    }
-	  } // for kk
-	  const int j = tj[jj];
+            const flt_t delcs = cs - costheta;
+            const flt_t delcssq = delcs*delcs;
+
+            flt_t kfactor;
+            if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0;
+            else kfactor = (flt_t)1.0;
+
+            const flt_t facexp = expgsrainv1*expgsrainv2*kfactor;
+            const flt_t facrad = lambda_epsilon * facexp * delcssq;
+            const flt_t frad1 = facrad*gsrainvsq1;
+            const flt_t frad2 = facrad*gsrainvsq2;
+            const flt_t facang = lambda_epsilon2 * facexp * delcs;
+            const flt_t facang12 = rinv12*facang;
+            const flt_t csfacang = cs*facang;
+            const flt_t csfac1 = rinvsq1*csfacang;
+
+            const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12;
+            const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12;
+            const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12;
+
+            fxtmp -= fjx;
+            fytmp -= fjy;
+            fztmp -= fjz;
+            fjxtmp += fjx;
+            fjytmp += fjy;
+            fjztmp += fjz;
+
+            if (EFLAG) {
+              const flt_t evdwl = facrad * (flt_t)0.5;
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (acc_t)0.33333333 * evdwl;
+                fjtmp += (acc_t)0.33333333 * facrad;
+              }
+            }
+          } // for kk
+          const int j = tj[jj];
           f[j].x += fjxtmp;
           f[j].y += fjytmp;
           f[j].z += fjztmp;
-          if (EFLAG) 
-	    if (eatom) f[j].w += fjtmp;
+          if (EFLAG)
+            if (eatom) f[j].w += fjtmp;
         } // for jj
 
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
 
-	if (EFLAG) {
-	  f[i].w += fwtmp;
-	  oevdwl += sevdwl;
-	}
+        if (EFLAG) {
+          f[i].w += fwtmp;
+          oevdwl += sevdwl;
+        }
       } // for ii
 
-      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, 
-			      x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
+                              x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
 
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       ev_global[0] = oevdwl;
@@ -561,7 +561,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     fix->add_result_array(f_start, 0, offload);
 }
 
-#else 
+#else
 
 /* ----------------------------------------------------------------------
 
@@ -577,8 +577,8 @@ authors for more details.
 template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
-                       const ForceConst<flt_t> &fc, const int astart, 
-		       const int aend, const int pad_width)
+                       const ForceConst<flt_t> &fc, const int astart,
+                       const int aend, const int pad_width)
 {
   typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
   typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
@@ -646,7 +646,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
     in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
     in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
-    in(ccache_stride3)						\
+    in(ccache_stride3)                                          \
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -669,9 +669,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
     #endif
     {
       int iifrom, iip, iito, tid;
-      IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads, 
-			       swidth);
-      
+      IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads,
+                               swidth);
+
       iifrom += astart;
       iito += astart;
 
@@ -692,22 +692,22 @@ void PairSWIntel::eval(const int offload, const int vflag,
       SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6;
       SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
       if (ONETYPE) {
-	cutsq = SIMD_set(p2[3].cutsq);
-	cut = SIMD_set(p2f[3].cut);
-	sigma = SIMD_set(p2f[3].sigma);
-	c1 = SIMD_set(p2f2[3].c1);
-	c2 = SIMD_set(p2f2[3].c2);
-	c3 = SIMD_set(p2f2[3].c3);
-	c4 = SIMD_set(p2f2[3].c4);
-	sigma_gamma = SIMD_set(p2[3].sigma_gamma);
-	costheta = SIMD_set(p3[7].costheta);
-	lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
-	lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
-	if (SPQ == 0) {
-	  powerp = SIMD_set(p2f[3].powerp);
-	  powerq = SIMD_set(p2f[3].powerq);
-	}
-	if (EFLAG) {
+        cutsq = SIMD_set(p2[3].cutsq);
+        cut = SIMD_set(p2f[3].cut);
+        sigma = SIMD_set(p2f[3].sigma);
+        c1 = SIMD_set(p2f2[3].c1);
+        c2 = SIMD_set(p2f2[3].c2);
+        c3 = SIMD_set(p2f2[3].c3);
+        c4 = SIMD_set(p2f2[3].c4);
+        sigma_gamma = SIMD_set(p2[3].sigma_gamma);
+        costheta = SIMD_set(p3[7].costheta);
+        lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
+        lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
+        if (SPQ == 0) {
+          powerp = SIMD_set(p2f[3].powerp);
+          powerq = SIMD_set(p2f[3].powerq);
+        }
+        if (EFLAG) {
           c5 = SIMD_set(p2e[3].c5);
           c6 = SIMD_set(p2e[3].c6);
         }
@@ -715,120 +715,120 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
       SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
       const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
-					144,160,176,192,208,224,240);
+                                        144,160,176,192,208,224,240);
       ilist = ilist + iifrom;
       acc_t * const dforce = &(f[0].x);
       for (int i = iifrom; i < iito; i += iip) {
-	SIMD_mask imask = ilist < iito;
-	SIMD_flt_t xtmp, ytmp, ztmp;
-	SIMD_int itype, itype_offset;
-
-	if (ONETYPE)
-	  SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp);
-	else {
-	  SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype);
-	  itype_offset = itype * ntypes;
-	}
-
-	#ifdef OUTER_CHUNK
-	const int* ng = firstneigh + cnumneigh[i] - swidth;
-	#else
+        SIMD_mask imask = ilist < iito;
+        SIMD_flt_t xtmp, ytmp, ztmp;
+        SIMD_int itype, itype_offset;
+
+        if (ONETYPE)
+          SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp);
+        else {
+          SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype);
+          itype_offset = itype * ntypes;
+        }
+
+        #ifdef OUTER_CHUNK
+        const int* ng = firstneigh + cnumneigh[i] - swidth;
+        #else
         SIMD_int ng = SIMD_load(cnumneigh + i);
-	ng = ng - 1;
-	#endif
-	const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
-	const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
-	const int jnum_max = SIMD_max(jnum);
-
-	SIMD_acc_t fxtmp = SIMD_set((acc_t)0);
-	SIMD_acc_t fytmp = SIMD_set((acc_t)0);
-	SIMD_acc_t fztmp = SIMD_set((acc_t)0);
-	SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2;
-	if (is_same<flt_t,acc_t>::value == 0) {
-	  fxtmp2 = SIMD_set((acc_t)0);
-	  fytmp2 = SIMD_set((acc_t)0);
-	  fztmp2 = SIMD_set((acc_t)0);
+        ng = ng - 1;
+        #endif
+        const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
+        const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
+        const int jnum_max = SIMD_max(jnum);
+
+        SIMD_acc_t fxtmp = SIMD_set((acc_t)0);
+        SIMD_acc_t fytmp = SIMD_set((acc_t)0);
+        SIMD_acc_t fztmp = SIMD_set((acc_t)0);
+        SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2;
+        if (is_same<flt_t,acc_t>::value == 0) {
+          fxtmp2 = SIMD_set((acc_t)0);
+          fytmp2 = SIMD_set((acc_t)0);
+          fztmp2 = SIMD_set((acc_t)0);
           if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
-	}
+        }
 
         SIMD_acc_t sevdwl;
-	if (EFLAG) {
+        if (EFLAG) {
           fwtmp = SIMD_set((acc_t)0);
-	  sevdwl = SIMD_set((acc_t)0);
+          sevdwl = SIMD_set((acc_t)0);
         }
 
-	SIMD_int ejnum = SIMD_set(0);
-	SIMD_int ejnumhalf = SIMD_set(0);
-	SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
-				    11, 12, 13, 14, 15);
+        SIMD_int ejnum = SIMD_set(0);
+        SIMD_int ejnumhalf = SIMD_set(0);
+        SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+                                    11, 12, 13, 14, 15);
         for (int jj = 0; jj < jnum_max; jj++) {
           SIMD_mask jmask = jj < jnum;
 
-	  #ifdef OUTER_CHUNK
-	  ng += swidth;
-	  SIMD_int j = SIMD_load(ng);
-	  #else
-	  ng = ng + 1;
-	  SIMD_int j = SIMD_gather(jmask, firstneigh, ng);
-	  #endif
+          #ifdef OUTER_CHUNK
+          ng += swidth;
+          SIMD_int j = SIMD_load(ng);
+          #else
+          ng = ng + 1;
+          SIMD_int j = SIMD_gather(jmask, firstneigh, ng);
+          #endif
           j = j & SIMD_set(NEIGHMASK);
-	  const SIMD_int joffset = j << 4;
-
-	  SIMD_flt_t delx, dely, delz;
-	  SIMD_int jtype, ijtype;
-	  if (ONETYPE)
-	    SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz);
-	  else {
-	    SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, 
-			     jtype);
-	    ijtype = (jtype + itype_offset) << 2;
-	    cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype);
-	  }
-
-	  delx = delx - xtmp;
-	  dely = dely - ytmp;
-	  delz = delz - ztmp;
+          const SIMD_int joffset = j << 4;
+
+          SIMD_flt_t delx, dely, delz;
+          SIMD_int jtype, ijtype;
+          if (ONETYPE)
+            SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz);
+          else {
+            SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz,
+                             jtype);
+            ijtype = (jtype + itype_offset) << 2;
+            cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype);
+          }
+
+          delx = delx - xtmp;
+          dely = dely - ytmp;
+          delz = delz - ztmp;
           SIMD_flt_t rsq1 = delx * delx;
-	  rsq1 = SIMD_fma(dely, dely, rsq1);
-	  rsq1 = SIMD_fma(delz, delz, rsq1);
-
-	  const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq);
-	  SIMD_scatter(rmask, tdelx, coffset, delx);
-	  SIMD_scatter(rmask, tdely, coffset, dely);
-	  SIMD_scatter(rmask, tdelz, coffset, delz);
-	  SIMD_scatter(rmask, trsq, coffset, rsq1);
-	  SIMD_scatter(rmask, tj, coffset, j);
-	  if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype);
-	  ejnum = SIMD_add(rmask, ejnum, 1);
-	  coffset = SIMD_add(rmask, coffset, swidth);
-	  const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf);
-	  ejnumhalf = SIMD_add(hmask, ejnumhalf, 1);
-	}
-
-	const int ejnum_max = SIMD_max(ejnum);
-	const int ejnumhalf_max = SIMD_max(ejnumhalf);
-	memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3);
+          rsq1 = SIMD_fma(dely, dely, rsq1);
+          rsq1 = SIMD_fma(delz, delz, rsq1);
+
+          const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq);
+          SIMD_scatter(rmask, tdelx, coffset, delx);
+          SIMD_scatter(rmask, tdely, coffset, dely);
+          SIMD_scatter(rmask, tdelz, coffset, delz);
+          SIMD_scatter(rmask, trsq, coffset, rsq1);
+          SIMD_scatter(rmask, tj, coffset, j);
+          if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype);
+          ejnum = SIMD_add(rmask, ejnum, 1);
+          coffset = SIMD_add(rmask, coffset, swidth);
+          const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf);
+          ejnumhalf = SIMD_add(hmask, ejnumhalf, 1);
+        }
+
+        const int ejnum_max = SIMD_max(ejnum);
+        const int ejnumhalf_max = SIMD_max(ejnumhalf);
+        memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3);
         for (int jj = 0; jj < ejnum_max; jj++) {
           SIMD_int ijtype;
-	  const int coffset = jj * swidth;
-	  if (!ONETYPE) {
-	    ijtype = SIMD_load(tjtype + coffset);
-	    ijtype = (ijtype + itype_offset) << 2;
-	    cut = SIMD_gather(&(p2f[0].cut), ijtype);
-	  }
-
-	  SIMD_acc_t fjxtmp = SIMD_set((acc_t)0);
-	  SIMD_acc_t fjytmp = SIMD_set((acc_t)0);
-	  SIMD_acc_t fjztmp = SIMD_set((acc_t)0);
-	  SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2;
+          const int coffset = jj * swidth;
+          if (!ONETYPE) {
+            ijtype = SIMD_load(tjtype + coffset);
+            ijtype = (ijtype + itype_offset) << 2;
+            cut = SIMD_gather(&(p2f[0].cut), ijtype);
+          }
+
+          SIMD_acc_t fjxtmp = SIMD_set((acc_t)0);
+          SIMD_acc_t fjytmp = SIMD_set((acc_t)0);
+          SIMD_acc_t fjztmp = SIMD_set((acc_t)0);
+          SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2;
           if (EFLAG) fjtmp = SIMD_set((acc_t)0.0);
 
-	  if (is_same<flt_t,acc_t>::value == 0) {
-	    fjxtmp2 = SIMD_set((acc_t)0);
-	    fjytmp2 = SIMD_set((acc_t)0);
-	    fjztmp2 = SIMD_set((acc_t)0);
-	    if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0);
-	  }
+          if (is_same<flt_t,acc_t>::value == 0) {
+            fjxtmp2 = SIMD_set((acc_t)0);
+            fjytmp2 = SIMD_set((acc_t)0);
+            fjztmp2 = SIMD_set((acc_t)0);
+            if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0);
+          }
 
           const SIMD_flt_t delx = SIMD_load(tdelx + coffset);
           const SIMD_flt_t dely = SIMD_load(tdely + coffset);
@@ -836,211 +836,211 @@ void PairSWIntel::eval(const int offload, const int vflag,
           const SIMD_flt_t rsq1 = SIMD_load(trsq + coffset);
 
           const SIMD_flt_t rinvsq1 = SIMD_rcp(rsq1);
-          const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1); 
+          const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1);
           const SIMD_flt_t rainv1 = SIMD_rcp(r1 - cut);
-	  
-	  // two-body interactions, skip half of them
-	  if (jj < ejnumhalf_max) {
+
+          // two-body interactions, skip half of them
+          if (jj < ejnumhalf_max) {
             SIMD_flt_t rp, rq;
-	    if (SPQ == 1) {
+            if (SPQ == 1) {
               rp = r1 * r1;
-	      rp = rp * rp;
-	      rp = SIMD_rcp(rp);
-	      rq = SIMD_set((flt_t)1.0);
+              rp = rp * rp;
+              rp = SIMD_rcp(rp);
+              rq = SIMD_set((flt_t)1.0);
             } else {
-	      if (!ONETYPE) {
-		powerp = SIMD_gather(&(p2f[0].powerp), ijtype);
-		powerq = SIMD_gather(&(p2f[0].powerq), ijtype);
-	      }
-	      rp = SIMD_pow(r1, powerp);
-	      rq = SIMD_pow(r1, powerq);
-	    }
-
-	    if (!ONETYPE) {
-	      sigma = SIMD_gather(&(p2f[0].sigma), ijtype);
-	      c1 = SIMD_gather(&(p2f2[0].c1), ijtype);
-	      c2 = SIMD_gather(&(p2f2[0].c2), ijtype);
-	      c3 = SIMD_gather(&(p2f2[0].c3), ijtype);
-	      c4 = SIMD_gather(&(p2f2[0].c4), ijtype);
-	    }
-
-	    const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1;
-	    const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1);
-	    const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
-				      rainvsq) * expsrainv * rinvsq1;
-
-	    const SIMD_flt_t fjx = delx * fpair;
-	    const SIMD_flt_t fjy = dely * fpair;
-	    const SIMD_flt_t fjz = delz * fpair;
-
-	    const SIMD_mask hmask = jj < ejnumhalf;
-	    SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp,
-			     fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
-			     fztmp2, fjxtmp2, fjytmp2, fjztmp2); 
-          
-	    if (EFLAG) {
-	      if (!ONETYPE) {
-		c5 = SIMD_gather(&(p2e[0].c5), ijtype);
-		c6 = SIMD_gather(&(p2e[0].c6), ijtype);
-	      }            
-	      SIMD_flt_t evdwl;
-	      evdwl = (c5 * rp - c6 * rq) * expsrainv;
-	      SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
-			       fwtmp2, fjtmp2);
-	    }
+              if (!ONETYPE) {
+                powerp = SIMD_gather(&(p2f[0].powerp), ijtype);
+                powerq = SIMD_gather(&(p2f[0].powerq), ijtype);
+              }
+              rp = SIMD_pow(r1, powerp);
+              rq = SIMD_pow(r1, powerq);
+            }
+
+            if (!ONETYPE) {
+              sigma = SIMD_gather(&(p2f[0].sigma), ijtype);
+              c1 = SIMD_gather(&(p2f2[0].c1), ijtype);
+              c2 = SIMD_gather(&(p2f2[0].c2), ijtype);
+              c3 = SIMD_gather(&(p2f2[0].c3), ijtype);
+              c4 = SIMD_gather(&(p2f2[0].c4), ijtype);
+            }
+
+            const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1;
+            const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1);
+            const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
+                                      rainvsq) * expsrainv * rinvsq1;
+
+            const SIMD_flt_t fjx = delx * fpair;
+            const SIMD_flt_t fjy = dely * fpair;
+            const SIMD_flt_t fjz = delz * fpair;
+
+            const SIMD_mask hmask = jj < ejnumhalf;
+            SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp,
+                             fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
+                             fztmp2, fjxtmp2, fjytmp2, fjztmp2);
+
+            if (EFLAG) {
+              if (!ONETYPE) {
+                c5 = SIMD_gather(&(p2e[0].c5), ijtype);
+                c6 = SIMD_gather(&(p2e[0].c6), ijtype);
+              }
+              SIMD_flt_t evdwl;
+              evdwl = (c5 * rp - c6 * rq) * expsrainv;
+              SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
+                               fwtmp2, fjtmp2);
+            }
           }
 
-	  /*---------------------------------------------*/
-	  SIMD_int ijkoff;
-	  if (!ONETYPE) {
-	    sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype);
-	    ijkoff = ijtype * ntypes;
-	  }
+          /*---------------------------------------------*/
+          SIMD_int ijkoff;
+          if (!ONETYPE) {
+            sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype);
+            ijkoff = ijtype * ntypes;
+          }
           const SIMD_flt_t gsrainv1 = sigma_gamma * rainv1;
           const SIMD_flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
           const SIMD_flt_t expgsrainv1 = SIMD_exp(gsrainv1);
 
-	  const SIMD_mask jmask = jj < ejnum;
+          const SIMD_mask jmask = jj < ejnum;
           for (int kk = jj+1; kk < ejnum_max; kk++) {
-	    SIMD_int iktype, ijktype;
-	    const int kcoffset = kk * swidth;
-	    if (!ONETYPE) {
-	      iktype = SIMD_load(tjtype + kcoffset);
-	      ijktype = ijkoff + (iktype << 2);
-	      iktype = (iktype + itype_offset) << 2;
-	      cut = SIMD_gather(&(p2[0].cut), iktype);
-	      sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype);
-	      costheta = SIMD_gather(&(p3[0].costheta), ijktype);
-	      lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype);
-	      lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype);
-	    }
-	    const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset);
-	    const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset);
-	    const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset);
-	    const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset);
-
-	    const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2);
-	    const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2);
-	    const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut);
-	    const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2;
-	    const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
-	    const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2);
-	    const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2);
-	    const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + 
+            SIMD_int iktype, ijktype;
+            const int kcoffset = kk * swidth;
+            if (!ONETYPE) {
+              iktype = SIMD_load(tjtype + kcoffset);
+              ijktype = ijkoff + (iktype << 2);
+              iktype = (iktype + itype_offset) << 2;
+              cut = SIMD_gather(&(p2[0].cut), iktype);
+              sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype);
+              costheta = SIMD_gather(&(p3[0].costheta), ijktype);
+              lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype);
+              lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype);
+            }
+            const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset);
+            const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset);
+            const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset);
+            const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset);
+
+            const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2);
+            const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2);
+            const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut);
+            const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2;
+            const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
+            const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2);
+            const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2);
+            const SIMD_flt_t cs = (delx * delr2x + dely * delr2y +
                               delz * delr2z) * rinv12;
-	    const SIMD_flt_t delcs = cs - costheta;
-	    const SIMD_flt_t delcssq = delcs*delcs;
-
-	    const SIMD_flt_t facexp = expgsrainv1*expgsrainv2;
-	    const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq;
-	    const SIMD_flt_t frad1 = facrad * gsrainvsq1;
-	    const SIMD_flt_t frad2 = facrad * gsrainvsq2;
-	    const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs;
-	    const SIMD_flt_t facang12 = rinv12 * facang;
-	    const SIMD_flt_t csfacang = cs * facang;
-
-	    const SIMD_flt_t csfac1 = rinvsq1 * csfacang;
-	    const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12;
-	    const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12;
-	    const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12;
-
-	    const SIMD_flt_t csfac2 = rinvsq2 * csfacang;
-	    SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2);
-	    SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2);
-	    SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2);
-
-	    const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum);
-
-	    SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp,
-			    fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
-			    fztmp2, fjxtmp2, fjytmp2, fjztmp2, 
-			    tf + kcoffset * 3, swidth); 
-
-	    if (EFLAG) {
-	      SIMD_int k;
-	      if (eatom) {
-		k = SIMD_load(tj + kcoffset);
-		k = k << 4;
-	      }
-	      SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
-			     fwtmp2, fjtmp2, k, dforce);
-	    }
-	  } // for kk
-	  if (is_same<flt_t,acc_t>::value == 1)
-	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
-	  else
-	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, 
-	                fjxtmp2, fjytmp2, fjztmp2);
-
-	  if (EFLAG) {
-	    if (eatom) { 
-	      SIMD_int j = SIMD_load(tj + coffset);
-	      j = j << 4;
-	      SIMD_jeng_update(jmask, dforce + 3, j, fjtmp);
-	      if (is_same<flt_t,acc_t>::value == 0)
-		SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2);
-	    }
-	  }
+            const SIMD_flt_t delcs = cs - costheta;
+            const SIMD_flt_t delcssq = delcs*delcs;
+
+            const SIMD_flt_t facexp = expgsrainv1*expgsrainv2;
+            const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq;
+            const SIMD_flt_t frad1 = facrad * gsrainvsq1;
+            const SIMD_flt_t frad2 = facrad * gsrainvsq2;
+            const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs;
+            const SIMD_flt_t facang12 = rinv12 * facang;
+            const SIMD_flt_t csfacang = cs * facang;
+
+            const SIMD_flt_t csfac1 = rinvsq1 * csfacang;
+            const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12;
+            const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12;
+            const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12;
+
+            const SIMD_flt_t csfac2 = rinvsq2 * csfacang;
+            SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2);
+            SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2);
+            SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2);
+
+            const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum);
+
+            SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp,
+                            fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
+                            fztmp2, fjxtmp2, fjytmp2, fjztmp2,
+                            tf + kcoffset * 3, swidth);
+
+            if (EFLAG) {
+              SIMD_int k;
+              if (eatom) {
+                k = SIMD_load(tj + kcoffset);
+                k = k << 4;
+              }
+              SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
+                             fwtmp2, fjtmp2, k, dforce);
+            }
+          } // for kk
+          if (is_same<flt_t,acc_t>::value == 1)
+            SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
+          else
+            SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp,
+                        fjxtmp2, fjytmp2, fjztmp2);
+
+          if (EFLAG) {
+            if (eatom) {
+              SIMD_int j = SIMD_load(tj + coffset);
+              j = j << 4;
+              SIMD_jeng_update(jmask, dforce + 3, j, fjtmp);
+              if (is_same<flt_t,acc_t>::value == 0)
+                SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2);
+            }
+          }
         } // for jj first loop
 
         for (int jj = 0; jj < ejnum_max; jj++) {
-	  const int coffset = jj * swidth;
-	  const SIMD_mask jmask = jj < ejnum;
+          const int coffset = jj * swidth;
+          const SIMD_mask jmask = jj < ejnum;
           const SIMD_int j = SIMD_load(tj + coffset);
-	  const SIMD_int joffset = j << 4;
-
-	  SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2;
-	  int foffset = swidth;
-	  if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1;
-	  acc_t *p = tf + coffset * 3;
-	  fjxtmp = SIMD_load(p);
-	  if (is_same<flt_t,acc_t>::value == 0) {
-	    p = p + foffset;
-	    fjxtmp2 = SIMD_load(p);
-	  }
-	  p = p + foffset;
-	  fjytmp = SIMD_load(p);
-	  if (is_same<flt_t,acc_t>::value == 0) {
-	    p = p + foffset;
-	    fjytmp2 = SIMD_load(p);
-	  }
-	  p = p + foffset;
-	  fjztmp = SIMD_load(p);
-	  if (is_same<flt_t,acc_t>::value == 0) {
-	    p = p + foffset;
-	    fjztmp2 = SIMD_load(p);
-	  }
-	  
-	  SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp);
-	  SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp, 
-			     fjztmp);
+          const SIMD_int joffset = j << 4;
+
+          SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2;
+          int foffset = swidth;
+          if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1;
+          acc_t *p = tf + coffset * 3;
+          fjxtmp = SIMD_load(p);
+          if (is_same<flt_t,acc_t>::value == 0) {
+            p = p + foffset;
+            fjxtmp2 = SIMD_load(p);
+          }
+          p = p + foffset;
+          fjytmp = SIMD_load(p);
+          if (is_same<flt_t,acc_t>::value == 0) {
+            p = p + foffset;
+            fjytmp2 = SIMD_load(p);
+          }
+          p = p + foffset;
+          fjztmp = SIMD_load(p);
+          if (is_same<flt_t,acc_t>::value == 0) {
+            p = p + foffset;
+            fjztmp2 = SIMD_load(p);
+          }
+
+          SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp);
+          SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp,
+                             fjztmp);
           if (is_same<flt_t,acc_t>::value == 0) {
-	    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
-	    SIMD_mask jmask2 = jmask >> 8;
-	    SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, 
-				     fjztmp2);
-	    SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2, 
-			       fjztmp2);
-	  }
-	} // for jj second loop
-
-	SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
-			   EFLAG, eatom, fwtmp);
-	if (is_same<flt_t,acc_t>::value == 0) {
-	  imask = imask >> 8;
-	  SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, 
-			     fztmp2, EFLAG, eatom, fwtmp2);
-	}
-	if (EFLAG) oevdwl += SIMD_sum(sevdwl);
-	ilist = ilist + iip;
+            SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
+            SIMD_mask jmask2 = jmask >> 8;
+            SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2,
+                                     fjztmp2);
+            SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2,
+                               fjztmp2);
+          }
+        } // for jj second loop
+
+        SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
+                           EFLAG, eatom, fwtmp);
+        if (is_same<flt_t,acc_t>::value == 0) {
+          imask = imask >> 8;
+          SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2,
+                             fztmp2, EFLAG, eatom, fwtmp2);
+        }
+        if (EFLAG) oevdwl += SIMD_sum(sevdwl);
+        ilist = ilist + iip;
       } // for ii
 
-      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, 
-			      x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
+                              x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
-  
+
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       ev_global[0] = oevdwl;
@@ -1119,7 +1119,7 @@ void PairSWIntel::init_style()
   #if defined(__INTEL_COMPILER)
   if (__INTEL_COMPILER_BUILD_DATE < 20141023)
     error->all(FLERR, "Intel compiler versions before "
-	       "15 Update 1 not supported for sw/intel");
+               "15 Update 1 not supported for sw/intel");
   #endif
 }
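
The hunk above only re-wraps the continuation line of an existing version check. As an aside for anyone porting this guard: the same constraint can also be expressed at compile time. A minimal, self-contained sketch under that assumption (the real pair_sw_intel.cpp raises a runtime error via error->all instead; the #error form below is illustrative, not what the file does):

#if defined(__INTEL_COMPILER)
#if __INTEL_COMPILER_BUILD_DATE < 20141023
#error "Intel compiler 15 Update 1 or newer is required for sw/intel"
#endif
#endif

int main() { return 0; }  // compiles only with a supported toolchain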
 
@@ -1168,7 +1168,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
       }
     }
   }
-  
+
   _onetype = 0;
   if (atom->ntypes == 1) _onetype = 1;
 
@@ -1178,55 +1178,55 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
     for (int jj = 0; jj < tp1; jj++) {
       int j = map[jj];
       if (i < 0 || j < 0 || ii == 0 || jj == 0) {
-	fc.p2[ii][jj].cutsq = 0;
-	fc.p2[ii][jj].cut = 0;
-	fc.p2[ii][jj].sigma_gamma = 0;
-	fc.p2f[ii][jj].cut = 0;
-	fc.p2f[ii][jj].powerp = 0;
-	fc.p2f[ii][jj].powerq = 0;
-	fc.p2f[ii][jj].sigma = 0;
-	fc.p2f2[ii][jj].c1 = 0;
-	fc.p2f2[ii][jj].c2 = 0;
-	fc.p2f2[ii][jj].c3 = 0;
-	fc.p2f2[ii][jj].c4 = 0;
-	fc.p2e[ii][jj].c5 = 0;
-	fc.p2e[ii][jj].c6 = 0;
+        fc.p2[ii][jj].cutsq = 0;
+        fc.p2[ii][jj].cut = 0;
+        fc.p2[ii][jj].sigma_gamma = 0;
+        fc.p2f[ii][jj].cut = 0;
+        fc.p2f[ii][jj].powerp = 0;
+        fc.p2f[ii][jj].powerq = 0;
+        fc.p2f[ii][jj].sigma = 0;
+        fc.p2f2[ii][jj].c1 = 0;
+        fc.p2f2[ii][jj].c2 = 0;
+        fc.p2f2[ii][jj].c3 = 0;
+        fc.p2f2[ii][jj].c4 = 0;
+        fc.p2e[ii][jj].c5 = 0;
+        fc.p2e[ii][jj].c6 = 0;
       } else {
-	int ijparam = elem2param[i][j][j];
-	fc.p2[ii][jj].cutsq = params[ijparam].cutsq;
-	fc.p2[ii][jj].cut = params[ijparam].cut;
-	fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma;
-	fc.p2f[ii][jj].cut = params[ijparam].cut;
-	fc.p2f[ii][jj].powerp = -params[ijparam].powerp;
-	fc.p2f[ii][jj].powerq = -params[ijparam].powerq;
-	fc.p2f[ii][jj].sigma = params[ijparam].sigma;
-	fc.p2f2[ii][jj].c1 = params[ijparam].c1;
-	fc.p2f2[ii][jj].c2 = params[ijparam].c2;
-	fc.p2f2[ii][jj].c3 = params[ijparam].c3;
-	fc.p2f2[ii][jj].c4 = params[ijparam].c4;
-	fc.p2e[ii][jj].c5 = params[ijparam].c5;
-	fc.p2e[ii][jj].c6 = params[ijparam].c6;
-
-	double cutcut = params[ijparam].cut * params[ijparam].cut;
-	if (params[ijparam].cutsq >= cutcut)
-	  fc.p2[ii][jj].cutsq *= 0.98;
-
-	if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0)
-	  _spq = 0;
+        int ijparam = elem2param[i][j][j];
+        fc.p2[ii][jj].cutsq = params[ijparam].cutsq;
+        fc.p2[ii][jj].cut = params[ijparam].cut;
+        fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma;
+        fc.p2f[ii][jj].cut = params[ijparam].cut;
+        fc.p2f[ii][jj].powerp = -params[ijparam].powerp;
+        fc.p2f[ii][jj].powerq = -params[ijparam].powerq;
+        fc.p2f[ii][jj].sigma = params[ijparam].sigma;
+        fc.p2f2[ii][jj].c1 = params[ijparam].c1;
+        fc.p2f2[ii][jj].c2 = params[ijparam].c2;
+        fc.p2f2[ii][jj].c3 = params[ijparam].c3;
+        fc.p2f2[ii][jj].c4 = params[ijparam].c4;
+        fc.p2e[ii][jj].c5 = params[ijparam].c5;
+        fc.p2e[ii][jj].c6 = params[ijparam].c6;
+
+        double cutcut = params[ijparam].cut * params[ijparam].cut;
+        if (params[ijparam].cutsq >= cutcut)
+          fc.p2[ii][jj].cutsq *= 0.98;
+
+        if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0)
+          _spq = 0;
       }
 
       for (int kk = 0; kk < tp1; kk++) {
         int k = map[kk];
-	if (i < 0 || j < 0 || k < 0  || ii == 0 || jj == 0 || kk == 0) {
-	  fc.p3[ii][jj][kk].costheta = 0;
-	  fc.p3[ii][jj][kk].lambda_epsilon = 0;
-	  fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
-	} else {
-	  int ijkparam = elem2param[i][j][k];
-	  fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
-	  fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
-	  fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2;
-	}
+        if (i < 0 || j < 0 || k < 0 || ii == 0 || jj == 0 || kk == 0) {
+          fc.p3[ii][jj][kk].costheta = 0;
+          fc.p3[ii][jj][kk].lambda_epsilon = 0;
+          fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
+        } else {
+          int ijkparam = elem2param[i][j][k];
+          fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
+          fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
+          fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2;
+        }
       }
     }
   }
@@ -1247,10 +1247,10 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   int tp1cu = tp1sq * tp1;
-  if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
+  if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
       op3 != NULL && ocutneighsq != NULL) {
     #pragma offload_transfer target(mic:_cop) \
-      in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))	\
+      in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))     \
       in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
       in(ocutneighsq: length(tp1sq))
   }
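
This hunk is a reminder that the cleanup is not purely cosmetic: in a multi-line #pragma offload_transfer, each backslash must be the last byte on its line, so stray blanks or tabs around the continuation can break the directive with some compilers. A self-contained illustration of the same continuation rule, using an assumed ACCUMULATE3 macro rather than code from this patch:

#include <cstdio>

// Each trailing '\' must be the final character of its line; a single
// stray space after one of them would end the macro definition early.
#define ACCUMULATE3(dst, x, y, z) \
  do {                            \
    (dst)[0] += (x);              \
    (dst)[1] += (y);              \
    (dst)[2] += (z);              \
  } while (0)

int main() {
  double f[3] = {0.0, 0.0, 0.0};
  ACCUMULATE3(f, 1.0, 2.0, 3.0);
  std::printf("%g %g %g\n", f[0], f[1], f[2]);  // prints: 1 2 3
  return 0;
}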
@@ -1272,8 +1272,8 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       fc_packed3 *op3 = p3[0][0];
 
       #ifdef _LMP_INTEL_OFFLOAD
-      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
-	  op3 != NULL && _cop >= 0) {
+      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
+          op3 != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(op2, op2f, op2f2, op2e, op3: alloc_if(0) free_if(1))
       }
@@ -1301,8 +1301,8 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       fc_packed3 *op3 = p3[0][0];
       int tp1sq = ntypes * ntypes;
       int tp1cu = tp1sq * ntypes;
-      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
-	  op3 != NULL && cop >= 0) {
+      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
+          op3 != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(op3: length(tp1cu) alloc_if(1) free_if(0))
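
That closes out pair_sw_intel.cpp; every hunk in it is whitespace-only. For anyone who wants to apply the same normalization to other files, here is a minimal sketch of the transformation this patch performs (the commit does not say what tool produced the edits, so this is illustrative only; it assumes the conventional 8-column tab stop):

#include <fstream>
#include <iostream>
#include <string>

// Drop '\r', expand tabs to the next 8-column stop, trim trailing blanks.
static std::string clean_line(const std::string &in, std::size_t tabstop = 8) {
  std::string out;
  for (char c : in) {
    if (c == '\r') continue;  // DOS carriage return
    if (c == '\t') out.append(tabstop - out.size() % tabstop, ' ');
    else out.push_back(c);
  }
  const std::size_t end = out.find_last_not_of(' ');
  out.erase(end == std::string::npos ? 0 : end + 1);
  return out;
}

int main(int argc, char **argv) {
  if (argc != 2) { std::cerr << "usage: cleanup <file>\n"; return 1; }
  std::ifstream f(argv[1]);
  for (std::string line; std::getline(f, line); )
    std::cout << clean_line(line) << '\n';
  return 0;
}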
diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h
index b55022328f..ffcf9a6fb6 100644
--- a/src/USER-INTEL/pair_sw_intel.h
+++ b/src/USER-INTEL/pair_sw_intel.h
@@ -49,7 +49,7 @@ class PairSWIntel : public PairSW {
   template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
-	    const int astart, const int aend, const int pad_width);
+            const int astart, const int aend, const int pad_width);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
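
pair_sw_intel.h is likewise touched only for indentation. A sweep like this is easy to audit: git diff -w (ignore all whitespace) should report nothing for any file in the patch. As an out-of-tree cross-check, a sketch of a hypothetical wsdiff helper that compares two files after discarding every whitespace byte:

#include <cctype>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>

// Read a whole file and drop all whitespace (spaces, tabs, CR, LF).
static std::string squeeze(const char *path) {
  std::ifstream f(path, std::ios::binary);
  std::string raw((std::istreambuf_iterator<char>(f)),
                  std::istreambuf_iterator<char>());
  std::string out;
  for (char c : raw)
    if (!std::isspace(static_cast<unsigned char>(c))) out.push_back(c);
  return out;
}

int main(int argc, char **argv) {
  if (argc != 3) { std::cerr << "usage: wsdiff <old> <new>\n"; return 1; }
  const bool same = squeeze(argv[1]) == squeeze(argv[2]);
  std::cout << (same ? "whitespace-only change\n" : "content differs\n");
  return same ? 0 : 1;
}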
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index f59a6b7c96..9e0a888638 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -47,7 +47,7 @@ void PairTersoffIntel::init_style()
 {
   if (comm->me == 0) {
     error->warning(FLERR, "Tersoff/intel currently requires intel compiler. "
-		   "Using MANYBODY version.");
+                   "Using MANYBODY version.");
   }
   PairTersoff::init_style();
 }
@@ -87,7 +87,7 @@ PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp)
 void PairTersoffIntel::compute(int eflag, int vflag)
 {
   if (fix->precision()==FixIntel::PREC_MODE_MIXED) {
-    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), 
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
                           force_const_single);
   } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) {
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
@@ -104,8 +104,8 @@ void PairTersoffIntel::compute(int eflag, int vflag)
 //  do we need to calculate energy/virial
 template <class flt_t, class acc_t>
 void PairTersoffIntel::compute(int eflag, int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc)
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc)
 {
   if (eflag || vflag) {
     ev_setup(eflag,vflag);
@@ -127,13 +127,13 @@ void PairTersoffIntel::compute(int eflag, int vflag,
     #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				packthreads, sizeof(ATOM_T));
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
-  
+
   int ovflag = 0;
   if (vflag_fdotr) ovflag = 2;
   else if (vflag) ovflag = 1;
@@ -170,14 +170,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   // here they are inlined and vectorized
   // attractive() also provides an option to compute zeta
   static fvec zeta_vector(
-      const c_inner_t * param, 
-      ivec xjw, bvec mask, 
-      fvec vrij, fvec rsq2, 
-      fvec vdijx, fvec vdijy, fvec vdijz, 
+      const c_inner_t * param,
+      ivec xjw, bvec mask,
+      fvec vrij, fvec rsq2,
+      fvec vdijx, fvec vdijy, fvec vdijz,
       fvec dikx, fvec diky, fvec dikz
   );
   static void force_zeta_vector(
-      const c_outer_t * param, 
+      const c_outer_t * param,
       ivec xjw,
       bvec mask,
       fvec vrijsq, fvec vzeta_ij,
@@ -202,14 +202,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   // perform the actual computation
   template<bool EFLAG>
   static void kernel(
-      int iito, int iifrom, int eatom, int vflag, 
+      int iito, int iifrom, int eatom, int vflag,
       const int * _noalias const numneigh,
       const int * _noalias const numneighhalf,
-      const int * _noalias const cnumneigh, 
-      const int * _noalias const firstneigh, int ntypes, 
+      const int * _noalias const cnumneigh,
+      const int * _noalias const firstneigh, int ntypes,
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-      const c_inner_t * _noalias const c_inner, 
-      const c_outer_t * _noalias const c_outer, 
+      const c_inner_t * _noalias const c_inner,
+      const c_outer_t * _noalias const c_outer,
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
       acc_t *evdwl
   );
@@ -217,14 +217,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   // perform one step of calculation, pass in i-j pairs of atoms (is, js)
   template<int EFLAG>
   static void kernel_step(
-      int eatom, int vflag, 
+      int eatom, int vflag,
       const int * _noalias const numneigh,
-      const int * _noalias const cnumneigh, 
-      const int * _noalias const firstneigh, 
+      const int * _noalias const cnumneigh,
+      const int * _noalias const firstneigh,
       int ntypes,
       typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-      const c_inner_t * _noalias const c_inner, 
-      const c_outer_t * _noalias const c_outer, 
+      const c_inner_t * _noalias const c_inner,
+      const c_outer_t * _noalias const c_outer,
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
       avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
   );
@@ -233,12 +233,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   //  with fixed i and a number of js
   template<int EFLAG>
   static void kernel_step_const_i(
-    int eatom, int vflag, 
-    const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
-    const int * _noalias const firstneigh, int ntypes, 
+    int eatom, int vflag,
+    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
+    const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-    const c_inner_t * _noalias const c_inner, 
-    const c_outer_t * _noalias const c_outer, 
+    const c_inner_t * _noalias const c_inner,
+    const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
   );
@@ -255,9 +255,9 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
 // This method is nearly identical to what happens in the other /intel styles
 template <int EFLAG, class flt_t, class acc_t>
 void PairTersoffIntel::eval(const int offload, const int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc,
-				     const int astart, const int aend)
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc,
+                                     const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -289,8 +289,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
   IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
-		       buffers, offload, fix, separate_flag,
-		       x_size, q_size, ev_size, f_stride);
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
 
   int tc;
   FORCE_T * _noalias f_start;
@@ -326,8 +326,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
     #endif
     #endif
 
-    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, 
-			      f_stride, x, 0);
+    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EFLAG) oevdwl = oecoul = (acc_t)0;
@@ -354,7 +354,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
         // Pick the variable-i algorithm under specific conditions:
         // use the scalar algorithm when vectors are very short
         int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
-        bool pack_i = VL >= 8 && 
+        bool pack_i = VL >= 8 &&
           lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
         bool use_scalar = VL < 4;
         if (use_scalar) {
@@ -364,16 +364,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
         } else {
           IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
         }
-	if (EFLAG) oevdwl += sevdwl;
+        if (EFLAG) oevdwl += sevdwl;
       }
 
       IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
-			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
-			      ov4, ov5);
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
     } // end of omp parallel region
 
     IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
-			ov0, ov1, ov2, ov3, ov4, ov5);
+                        ov0, ov1, ov2, ov3, ov4, ov5);
 
     if (EFLAG) {
       ev_global[0] = oevdwl;
@@ -431,7 +431,7 @@ void PairTersoffIntel::init_style()
     error->all(FLERR,
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
-  
+
   fix->pair_init_check();
   fix->three_body_neighbor(1);
   #ifdef _LMP_INTEL_OFFLOAD
@@ -481,25 +481,25 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
       for (int k = 1; k < tp1; k++) {
         Param * param = &params[elem2param[map[i]][map[j]][map[k]]];
         fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq);
-	fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
+        fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
         fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr);
         fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd);
         fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
         fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
         fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h);
         fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma);
-        fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);  
+        fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint);
 
         fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq);
-	fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
+        fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3);
         fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr);
         fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd);
         fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c);
         fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d);
         fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h);
         fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma);
-        fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);  
- 
+        fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint);
+
       }
       Param * param = &params[elem2param[map[i]][map[j]][map[j]]];
       fc.c_cutoff_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
@@ -515,7 +515,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
       fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2);
       fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3);
       fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4);
-     
+
       fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq);
       fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr);
       fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd);
@@ -563,8 +563,8 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
 // As in any other /intel pair style
 template <class flt_t>
 void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
-							   Memory *memory,
-							   const int cop) {
+                                                           Memory *memory,
+                                                           const int cop) {
   if ( (ntypes != _ntypes) ) {
     if (_ntypes > 0) {
       #ifdef _LMP_INTEL_OFFLOAD
@@ -575,12 +575,12 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0];
       c_inner_t * oc_inner = c_inner[0][0];
       c_outer_t * oc_outer = c_outer[0];
-      if (c_first_loop != NULL && c_second_loop != NULL && 
+      if (c_first_loop != NULL && c_second_loop != NULL &&
           c_inner_loop != NULL &&  _cop >= 0) {
 
         #pragma offload_transfer target(mic:cop) \
-	  nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
-	  nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
+          nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \
+          nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \
           nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0))
       }
       #endif
@@ -614,7 +614,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       int tp1sq = ntypes * ntypes;
       int tp1cb = ntypes * ntypes * ntypes;
       int tp1cb_pad = ntypes * ntypes * ntypes_pad;
-      if (oc_first_loop != NULL && oc_second_loop != NULL && 
+      if (oc_first_loop != NULL && oc_second_loop != NULL &&
           oc_inner_loop != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
           nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \
@@ -642,15 +642,15 @@ static const int N_CACHE = 8;
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<int EFLAG>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
-    int eatom, int vflag, 
-    const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
-    const int * _noalias const firstneigh, int ntypes, 
+    int eatom, int vflag,
+    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
+    const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, 
-    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
+    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
+    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, 
-    int compress_idx, 
+    avec *vsevdwl,
+    int compress_idx,
     iarr is,
     iarr js,
     bvec vmask_repulsive
@@ -662,7 +662,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   ivec v_i0(0);
   ivec v_i_ntypes(ntypes);
   ivec v_i_NEIGHMASK(NEIGHMASK);
-  
+
   farr fx, fy, fz, fw;
   int cache_idx = 0;
   fvec vfkx_cache[N_CACHE];
@@ -672,7 +672,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   bvec vmask_cache[N_CACHE];
   ivec vkks_final_cache;
   bvec vmask_final_cache;
-  iarr ts; 
+  iarr ts;
   // compute all the stuff we know from i and j
   // TODO: We could extract this from the driver routine
   ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
@@ -738,7 +738,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
             &vfix,&vfiy,&vfiz,
             &vfjx,&vfjy,&vfjz,
             &vfkx,&vfky,&vfkz,
-	    &vzeta_contrib);
+            &vzeta_contrib);
         vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
         vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
         vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@@ -749,9 +749,9 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
         vfkx_cache[cache_idx] = vfkx;
         vfky_cache[cache_idx] = vfky;
         vfkz_cache[cache_idx] = vfkz;
-	vks_cache[cache_idx] = vks;
-	vmask_cache[cache_idx] = veff_mask;
-	cache_idx += 1;
+        vks_cache[cache_idx] = vks;
+        vmask_cache[cache_idx] = veff_mask;
+        cache_idx += 1;
 
         vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
         vkks = vkks + v_i1;
@@ -799,7 +799,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair;
   vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
   vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
- 
+
   if (EFLAG) {
     *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
     if (eatom) {
@@ -833,7 +833,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     fvec vx_k, vy_k, vz_k, vcutsq;
     while (! v::mask_testz(vactive_mask)) {
       bvec vnew_mask = vactive_mask & ~ veff_old_mask;
-      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & 
+      vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK &
           v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh));
       v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k);
       fvec vdx_ik = vx_k - vx_i;
@@ -855,7 +855,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
             &vfix,&vfiy,&vfiz,
             &vfjx,&vfjy,&vfjz,
             &vfkx,&vfky,&vfkz,
-	    0);
+            0);
         vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
         vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy);
         vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@@ -917,15 +917,15 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<int EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
-    int eatom, int vflag, 
-    const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
-    const int * _noalias const firstneigh, int ntypes, 
+    int eatom, int vflag,
+    const int * _noalias const numneigh, const int * _noalias const cnumneigh,
+    const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, 
-    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
+    const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
+    const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, 
-    int compress_idx, 
+    avec *vsevdwl,
+    int compress_idx,
     int i,
     iarr js,
     bvec vmask_repulsive
@@ -951,7 +951,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   int kk_final_cache;
 
   aarr fx, fy, fz, fw;
-  iarr ts; 
+  iarr ts;
 
   bvec vmask = v::mask_enable_lower(compress_idx);
   fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z);
@@ -997,7 +997,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
       fvec vfix, vfiy, vfiz;
       fvec vfjx, vfjy, vfjz;
       fvec vfkx, vfky, vfkz;
-      
+
       attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.),
           vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik,
           &vfix,&vfiy,&vfiz,
@@ -1010,7 +1010,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
       vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx);
       vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy);
       vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz);
-      
+
       vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero());
       vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero());
       vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero());
@@ -1037,7 +1037,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k)));
     bvec veff_mask = vcutoff_mask & vsame_mask & vmask;
     if (! v::mask_testz(veff_mask)) {
-      fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, 
+      fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq,
           vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik);
       vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib);
     }
@@ -1051,7 +1051,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair);
   vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
   vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
- 
+
   if (EFLAG) {
     *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
     if (eatom) {
@@ -1093,7 +1093,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
            &vfix,&vfiy,&vfiz,
            &vfjx,&vfjy,&vfjz,
            &vfkx,&vfky,&vfkz,
-	   0);
+           0);
        vfxtmp  = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix);
        vfytmp  = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy);
        vfztmp  = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz);
@@ -1129,14 +1129,14 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 template<bool EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
-    int iito, int iifrom, int eatom, int vflag, 
-    const int * _noalias const numneigh, 
-    const int * _noalias const numneighhalf, 
-    const int * _noalias const cnumneigh, 
-    const int * _noalias const firstneigh, int ntypes, 
+    int iito, int iifrom, int eatom, int vflag,
+    const int * _noalias const numneigh,
+    const int * _noalias const numneighhalf,
+    const int * _noalias const cnumneigh,
+    const int * _noalias const firstneigh, int ntypes,
     typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
-    const c_inner_t * _noalias const c_inner, 
-    const c_outer_t * _noalias const c_outer, 
+    const c_inner_t * _noalias const c_inner,
+    const c_outer_t * _noalias const c_outer,
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     acc_t *evdwl
 ) {
@@ -1181,10 +1181,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
         if (compress_idx == v::VL) {
           vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
           kernel_step<EFLAG>(
-              eatom, vflag, 
+              eatom, vflag,
               numneigh, cnumneigh, firstneigh, ntypes,
               x, c_inner, c_outer, f,
-              &vsevdwl, compress_idx, 
+              &vsevdwl, compress_idx,
               is, js, vmask_repulsive
           );
           compress_idx = 0;
@@ -1194,10 +1194,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
         if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
           vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
           kernel_step_const_i<EFLAG>(
-              eatom, vflag, 
+              eatom, vflag,
               numneigh, cnumneigh, firstneigh, ntypes,
               x, c_inner, c_outer, f,
-              &vsevdwl, compress_idx, 
+              &vsevdwl, compress_idx,
               i, js, vmask_repulsive
           );
           compress_idx = 0;
@@ -1209,10 +1209,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   if (compress_idx > 0) {
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
         IntelKernelTersoff::kernel_step<EFLAG>(
-            eatom, vflag, 
+            eatom, vflag,
             numneigh, cnumneigh, firstneigh, ntypes,
             x, c_inner, c_outer, f,
-            &vsevdwl, compress_idx, 
+            &vsevdwl, compress_idx,
             is, js, vmask_repulsive
         );
   }
@@ -1224,10 +1224,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
 
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector(
-    const c_inner_t * param, 
-    ivec xjw, bvec mask, 
-    fvec vrij, fvec rsq2, 
-    fvec vdijx, fvec vdijy, fvec vdijz, 
+    const c_inner_t * param,
+    ivec xjw, bvec mask,
+    fvec vrij, fvec rsq2,
+    fvec vdijx, fvec vdijy, fvec vdijz,
     fvec dikx, fvec diky, fvec dikz
 ) {
   fvec v_1_0(1.0);
@@ -1250,7 +1250,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
   // It's important to check the mask:
   // some simulations rarely or never invoke this branch.
   if (! v::mask_testz(vmask_need_sine)) {
-    vfc = v::blend(vmask_need_sine, vfc, 
+    vfc = v::blend(vmask_need_sine, vfc,
         v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd))));
   }
   return vgijk * vex_delr * vfc;
@@ -1258,7 +1258,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t
 
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector(
-    const c_outer_t * param, 
+    const c_outer_t * param,
     ivec xjw,
     bvec mask,
     fvec vrij, fvec vzeta_ij,
@@ -1402,9 +1402,9 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
     vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos);
   }
 
-  fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; 
-  fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; 
-  fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; 
+  fvec vzeta_d_fc = vfc_d * vgijk * vex_delr;
+  fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr;
+  fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d;
   if (ZETA) *zeta = vfc * vgijk * vex_delr;
 
   fvec vminus_costheta = - vcostheta;
@@ -1417,7 +1417,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector(
   fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx);
   fvec vdcosdriy = -(vdcosdrjy + vdcosdrky);
   fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz);
-  
+
   *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx);
   *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty);
   *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz);
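
The kernels above lean on one idiom throughout: v::mask_add(old, mask, a, b) adds a + b in the lanes where mask is set and keeps old everywhere else. A scalar model of those semantics, with plain arrays standing in for the real vector_routines types so the behavior is easy to see (a teaching sketch, not the actual implementation):

#include <array>
#include <cstdio>

constexpr int VL = 8;  // model vector length
using fvec = std::array<float, VL>;
using bvec = std::array<bool, VL>;

// Lanes with m[i] == false keep their previous value.
static fvec mask_add(const fvec &old, const bvec &m,
                     const fvec &a, const fvec &b) {
  fvec r;
  for (int i = 0; i < VL; i++) r[i] = m[i] ? a[i] + b[i] : old[i];
  return r;
}

int main() {
  fvec acc{}, contrib;
  contrib.fill(1.0f);
  bvec active{true, false, true, true, false, true, false, true};
  acc = mask_add(acc, active, acc, contrib);    // accumulate active lanes only
  for (float v : acc) std::printf("%.1f ", v);  // 1.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0
  std::printf("\n");
  return 0;
}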
diff --git a/src/USER-INTEL/pair_tersoff_intel.h b/src/USER-INTEL/pair_tersoff_intel.h
index c725487ae7..6da478c10f 100644
--- a/src/USER-INTEL/pair_tersoff_intel.h
+++ b/src/USER-INTEL/pair_tersoff_intel.h
@@ -75,14 +75,14 @@ class PairTersoffIntel : public PairTersoff {
   };
   ForceConst<float> force_const_single;
   ForceConst<double> force_const_double;
-  
+
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
   template <int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
-	    IntelBuffers<flt_t,acc_t> * buffers,
-	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
 
   template <class flt_t, class acc_t>
   void pack_force_const(ForceConst<flt_t> &fc,
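
The largest piece of the cleanup follows: pppm_disp_intel.cpp is rewritten wholesale (all 3034 lines) because every line ended in a DOS CR/LF pair, and diff compares lines byte-for-byte, so stripping the '\r' characters touches each one. Below is a sketch of a hypothetical crlfcheck helper for catching such files before they land; it reads in binary mode so the runtime does not translate the line endings away:

#include <fstream>
#include <iostream>

int main(int argc, char **argv) {
  if (argc != 2) { std::cerr << "usage: crlfcheck <file>\n"; return 1; }
  std::ifstream f(argv[1], std::ios::binary);
  long crlf = 0;
  char prev = '\0';
  for (char c; f.get(c); prev = c)
    if (prev == '\r' && c == '\n') ++crlf;
  std::cout << argv[1] << ": " << crlf << " CRLF line endings\n";
  return crlf ? 1 : 0;
}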
diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp
index 110649f8ee..ec5f5150c2 100644
--- a/src/USER-INTEL/pppm_disp_intel.cpp
+++ b/src/USER-INTEL/pppm_disp_intel.cpp
@@ -1,3034 +1,3034 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: William McDoniel (RWTH Aachen University)
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <stdlib.h>
-#include <math.h>
-#include "pppm_disp_intel.h"
-#include "atom.h"
-#include "error.h"
-#include "fft3d_wrap.h"
-#include "gridcomm.h"
-#include "math_const.h"
-#include "math_special.h"
-#include "memory.h"
-#include "suffix.h"
-
-using namespace LAMMPS_NS;
-using namespace MathConst;
-using namespace MathSpecial;
-
-#define MAXORDER   7
-#define OFFSET 16384
-#define SMALL 0.00001
-#define LARGE 10000.0
-#define EPS_HOC 1.0e-7
-
-enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
-enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE};
-enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM,
-     FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G,
-     FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A,
-     FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, 
-     FORWARD_AD_PERATOM_NONE};
-
-#ifdef FFT_SINGLE
-#define ZEROF 0.0f
-#define ONEF  1.0f
-#else
-#define ZEROF 0.0
-#define ONEF  1.0
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : 
-  PPPMDisp(lmp, narg, arg)
-{
-  suffix_flag |= Suffix::INTEL;
-
-  order = 7;
-  order_6 = 7; //sets default stencil sizes to 7
-
-  perthread_density = NULL;
-  particle_ekx = particle_eky = particle_ekz = NULL;
-  particle_ekx0 = particle_eky0 = particle_ekz0 = NULL;
-  particle_ekx1 = particle_eky1 = particle_ekz1 = NULL;
-  particle_ekx2 = particle_eky2 = particle_ekz2 = NULL;
-  particle_ekx3 = particle_eky3 = particle_ekz3 = NULL;
-  particle_ekx4 = particle_eky4 = particle_ekz4 = NULL;
-  particle_ekx5 = particle_eky5 = particle_ekz5 = NULL;
-  particle_ekx6 = particle_eky6 = particle_ekz6 = NULL;
-  
-  rho_lookup = drho_lookup = NULL;
-  rho6_lookup = drho6_lookup = NULL;
-  rho_points = 0;
-
-  _use_table = _use_packing = _use_lrt = 0;
-}
-
-PPPMDispIntel::~PPPMDispIntel()
-{
-  memory->destroy(perthread_density);
-  memory->destroy(particle_ekx);
-  memory->destroy(particle_eky);
-  memory->destroy(particle_ekz);
-
-  memory->destroy(rho_lookup);
-  memory->destroy(drho_lookup);
-  memory->destroy(rho6_lookup);
-  memory->destroy(drho6_lookup);  
-}
-
-
-
-/* ----------------------------------------------------------------------
-   called once before run
-------------------------------------------------------------------------- */
-
-
-void PPPMDispIntel::init()
-{
-
-  PPPMDisp::init();
-  int ifix = modify->find_fix("package_intel");
-  if (ifix < 0)
-    error->all(FLERR,
-               "The 'package intel' command is required for /intel styles");
-  fix = static_cast<FixIntel *>(modify->fix[ifix]);
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  _use_base = 0;
-  if (fix->offload_balance() != 0.0) {
-    _use_base = 1;
-    return;
-  }
-  #endif
-
-  fix->kspace_init_check();
-
-  _use_lrt = fix->lrt();
-  if (_use_lrt)
-    error->all(FLERR,
-               "LRT mode is currently not supported for pppm/disp/intel");
-
-  
-  // For vectorization, we need some padding at the end
-  // The first thread accumulates directly into the global density
-  if ((comm->nthreads > 1) && !_use_lrt) {
-    memory->destroy(perthread_density);
-    memory->create(perthread_density, comm->nthreads-1, 
-		   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
-                   "pppmdispintel:perthread_density");
-  }
-
-  _use_table = fix->pppm_table();
-  if (_use_table) {
-    rho_points = 5000;
-    memory->destroy(rho_lookup);
-    memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
-		   "pppmdispintel:rho_lookup");
-    memory->destroy(rho6_lookup);
-    memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
-		   "pppmdispintel:rho6_lookup");
-
-    if(differentiation_flag == 1) {
-      memory->destroy(drho_lookup);
-      memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
-		     "pppmdispintel:drho_lookup");
-      memory->destroy(drho6_lookup);
-      memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
-		     "pppmdispintel:drho6_lookup");
-    }
-    precompute_rho();
-  }
-  if (order > INTEL_P3M_MAXORDER)
-    error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
-}
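-
-/* ----------------------------------------------------------------------
-   illustrative sketch: how a stencil-weight lookup table like rho_lookup
-   can be filled from the spline coefficients via Horner's rule; the
-   names (fill_rho_table, tbl, coeff, npoints, ord, low, up) are
-   hypothetical -- the actual work happens in precompute_rho()
-------------------------------------------------------------------------- */
-
-static void fill_rho_table(float **tbl, double **coeff, int npoints,
-                           int ord, int low, int up)
-{
-  for (int r = 0; r < npoints; r++) {
-    // map table row to a fractional offset from the nearest grid point
-    double d = -0.5 + (double) r / (double) npoints;
-    for (int k = low; k <= up; k++) {
-      double w = 0.0;
-      for (int l = ord - 1; l >= 0; l--) w = coeff[l][k] + w * d;
-      tbl[r][k - low] = (float) w;    // one of 'ord' weights for row r
-    }
-  }
-}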
-
-/* ----------------------------------------------------------------------
-   compute the PPPMDispIntel long-range force, energy, virial
-------------------------------------------------------------------------- */
-
-void PPPMDispIntel::compute(int eflag, int vflag)
-{
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (_use_base) {
-    PPPMDisp::compute(eflag, vflag);
-    return;
-  }
-  #endif
-  int i;
-  // convert atoms from box to lamda coords
-
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  else evflag = evflag_atom = eflag_global = vflag_global =
-	 eflag_atom = vflag_atom = 0;
-
-  if (evflag_atom && !peratom_allocate_flag) {
-    allocate_peratom();
-    if (function[0]) {
-      cg_peratom->ghost_notify();
-      cg_peratom->setup();
-    }
-    if (function[1] + function[2] + function[3]) {
-      cg_peratom_6->ghost_notify();
-      cg_peratom_6->setup();
-    }
-    peratom_allocate_flag = 1;
-  }
-  if (triclinic == 0) boxlo = domain->boxlo;
-  else {
-    boxlo = domain->boxlo_lamda;
-    domain->x2lamda(atom->nlocal);
-  }
-  // extend size of per-atom arrays if necessary
-
-  if (atom->nmax > nmax) {
-
-    if (function[0]) memory->destroy(part2grid);
-    if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6);
-    if (differentiation_flag == 1) {
-      memory->destroy(particle_ekx);
-      memory->destroy(particle_eky);
-      memory->destroy(particle_ekz);
-      if (function[2] == 1){
-	memory->destroy(particle_ekx0);
-	memory->destroy(particle_eky0);
-	memory->destroy(particle_ekz0);
-	memory->destroy(particle_ekx1);
-	memory->destroy(particle_eky1);
-	memory->destroy(particle_ekz1);
-	memory->destroy(particle_ekx2);
-	memory->destroy(particle_eky2);
-	memory->destroy(particle_ekz2);
-	memory->destroy(particle_ekx3);
-	memory->destroy(particle_eky3);
-	memory->destroy(particle_ekz3);
-	memory->destroy(particle_ekx4);
-	memory->destroy(particle_eky4);
-	memory->destroy(particle_ekz4);
-	memory->destroy(particle_ekx5);
-	memory->destroy(particle_eky5);
-	memory->destroy(particle_ekz5);
-	memory->destroy(particle_ekx6);
-	memory->destroy(particle_eky6);
-	memory->destroy(particle_ekz6);	
-      }
-      
-    }    
-    nmax = atom->nmax;
-    if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid");
-    if (function[1] + function[2] + function[3])
-      memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6");
-    if (differentiation_flag == 1) {
-      memory->create(particle_ekx, nmax, "pppmdispintel:pekx");
-      memory->create(particle_eky, nmax, "pppmdispintel:peky");
-      memory->create(particle_ekz, nmax, "pppmdispintel:pekz");
-      if (function[2] == 1){
-	memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0");
-	memory->create(particle_eky0, nmax, "pppmdispintel:peky0");
-	memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0");
-	memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1");
-	memory->create(particle_eky1, nmax, "pppmdispintel:peky1");
-	memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1");
-	memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2");
-	memory->create(particle_eky2, nmax, "pppmdispintel:peky2");
-	memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2");
-	memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3");
-	memory->create(particle_eky3, nmax, "pppmdispintel:peky3");
-	memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3");
-	memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4");
-	memory->create(particle_eky4, nmax, "pppmdispintel:peky4");
-	memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4");
-	memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5");
-	memory->create(particle_eky5, nmax, "pppmdispintel:peky5");
-	memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5");
-	memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6");
-	memory->create(particle_eky6, nmax, "pppmdispintel:peky6");
-	memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6");	
-      }
-    }    
-  }
-  energy = 0.0;
-  energy_1 = 0.0;
-  energy_6 = 0.0;
-  if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0;
-
-  // find grid points for all my particles
-  // distribute particles' charges/dispersion coefficients on the grid
-  // communication between processors and remapping to the fft decomposition
-  // solution of Poisson's equation in k-space and back-transformation
-  // communication between processors
-  // calculation of forces
-  
-  if (function[0]) {
-
-    //perform calculations for coulomb interactions only
-
-    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid, 
-				 nupper, nlower, nxlo_out, nylo_out, nzlo_out, 
-				 nxhi_out, nyhi_out, nzhi_out, 
-				 fix->get_mixed_buffers());
-      make_rho_c<float,double>(fix->get_mixed_buffers());
-    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid, 
-				  nupper, nlower, nxlo_out, nylo_out, 
-				  nzlo_out, nxhi_out, nyhi_out, nzhi_out, 
-				  fix->get_double_buffers());
-      make_rho_c<double,double>(fix->get_double_buffers());
-    } else {
-      particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid, 
-				nupper, nlower, nxlo_out, nylo_out, nzlo_out, 
-				nxhi_out, nyhi_out, nzhi_out, 
-				fix->get_single_buffers());
-      make_rho_c<float,float>(fix->get_single_buffers());
-    }
-  
-    cg->reverse_comm(this,REVERSE_RHO);
-
-    brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
-	      density_brick, density_fft, work1,remap);
-    
-    if (differentiation_flag == 1) {
-      poisson_ad(work1, work2, density_fft, fft1, fft2,
-                 nx_pppm, ny_pppm, nz_pppm, nfft,
-                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
-                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
-                 energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, 
-		 v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);
-
-      cg->forward_comm(this,FORWARD_AD);
-      
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-	fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-	fieldforce_c_ad<double,double>(fix->get_double_buffers());
-      } else {
-	fieldforce_c_ad<float,float>(fix->get_single_buffers());
-      }      
-
-      if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);
-
-    } else {
-      poisson_ik(work1, work2, density_fft, fft1, fft2,
-                 nx_pppm, ny_pppm, nz_pppm, nfft,
-                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
-                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
-                 energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2,
-                 vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2,
-                 u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, 
-		 v5_brick);
-
-      cg->forward_comm(this, FORWARD_IK);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-	fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-	fieldforce_c_ik<double,double>(fix->get_double_buffers());
-      } else {
-	fieldforce_c_ik<float,float>(fix->get_single_buffers());
-      }       
-
-      if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
-    }
-    if (evflag_atom) fieldforce_c_peratom();
-  }
-
-  if (function[1]) {
-    //perform calculations for geometric mixing
-
-    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				 part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				 nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				 nyhi_out_6, nzhi_out_6, 
-				 fix->get_mixed_buffers());
-      make_rho_g<float,double>(fix->get_mixed_buffers());
-    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				  part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				  nyhi_out_6, nzhi_out_6,
-				  fix->get_double_buffers());
-      make_rho_g<double,double>(fix->get_double_buffers());      
-    } else {
-      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				nyhi_out_6, nzhi_out_6, 
-				fix->get_single_buffers());
-      make_rho_g<float,float>(fix->get_single_buffers());      
-    }    
-
-
-    cg_6->reverse_comm(this, REVERSE_RHO_G);
-
-    brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
-	      density_brick_g, density_fft_g, work1_6,remap_6);
-
-    if (differentiation_flag == 1) {
-
-      poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
-                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
-                 nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, 
-		 nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, 
-		 nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6,
-                 virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, 
-		 v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
-
-      cg_6->forward_comm(this,FORWARD_AD_G);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-        fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-        fieldforce_g_ad<double,double>(fix->get_double_buffers());
-      } else {
-        fieldforce_g_ad<float,float>(fix->get_single_buffers());
-      }
-
-      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G);
-
-    } else {
-      poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
-                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
-		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
-                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
-		 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,
-		 fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, 
-		 vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, 
-		 v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
-
-      cg_6->forward_comm(this,FORWARD_IK_G);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-        fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-        fieldforce_g_ik<double,double>(fix->get_double_buffers());
-      } else {
-        fieldforce_g_ik<float,float>(fix->get_single_buffers());
-      }
-
-
-      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G);
-    }
-    if (evflag_atom) fieldforce_g_peratom();
-  }
-
-  if (function[2]) {
-    //perform calculations for arithmetic mixing
-
-    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				 part2grid_6, nupper_6, nlower_6,
-				 nxlo_out_6, nylo_out_6, nzlo_out_6, 
-				 nxhi_out_6, nyhi_out_6, nzhi_out_6,
-				 fix->get_mixed_buffers());
-      make_rho_a<float,double>(fix->get_mixed_buffers());
-    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
-				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				  nyhi_out_6, nzhi_out_6,
-				  fix->get_double_buffers());
-      make_rho_a<double,double>(fix->get_double_buffers());      
-    } else {
-      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				nyhi_out_6, nzhi_out_6,
-				fix->get_single_buffers());
-      make_rho_a<float,float>(fix->get_single_buffers());      
-    }        
-
-    cg_6->reverse_comm(this, REVERSE_RHO_A);
-
-    brick2fft_a();
-
-    if ( differentiation_flag == 1) {
-
-      poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
-                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
-		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
-                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
-		 nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6,
-                 u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, 
-		 v3_brick_a3, v4_brick_a3, v5_brick_a3);
-      poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, 
-		    v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, 
-		    v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, 
-		    v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6);
-      poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, 
-		    v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, 
-		    v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, 
-		    v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5);
-      poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, 
-		    v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, 
-		    v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, 
-		    v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);
-
-      cg_6->forward_comm(this, FORWARD_AD_A);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-        fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-        fieldforce_a_ad<double,double>(fix->get_double_buffers());
-      } else {
-        fieldforce_a_ad<float,float>(fix->get_single_buffers());
-      }
-
-      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A);
-
-    }  else {
-
-      poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
-                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
-		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
-                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
-		 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6,
-		 fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, 
-		 virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, 
-		 v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3);
-      poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, 
-		    vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, 
-		    vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, 
-		    v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0,
-                    u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, 
-		    v3_brick_a6, v4_brick_a6, v5_brick_a6);
-      poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, 
-		    vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, 
-		    vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, 
-		    v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1,
-                    u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, 
-		    v3_brick_a5, v4_brick_a5, v5_brick_a5);
-      poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, 
-		    vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, 
-		    vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, 
-		    v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2,
-                    u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, 
-		    v3_brick_a4, v4_brick_a4, v5_brick_a4);
-
-      cg_6->forward_comm(this, FORWARD_IK_A);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-	fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-	fieldforce_a_ik<double,double>(fix->get_double_buffers());
-      } else {
-	fieldforce_a_ik<float,float>(fix->get_single_buffers());
-      }             
-
-      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A);
-    }
-    if (evflag_atom) fieldforce_a_peratom();
-  }
-  
-  if (function[3]) {
-    //perform calculations if no mixing rule applies
-    
-    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				 part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				 nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				 nyhi_out_6, nzhi_out_6, 
-				 fix->get_mixed_buffers());
-      make_rho_none<float,double>(fix->get_mixed_buffers());
-    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
-				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				  nyhi_out_6, nzhi_out_6,
-				  fix->get_double_buffers());
-      make_rho_none<double,double>(fix->get_double_buffers());      
-    } else {
-      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
-				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
-				nylo_out_6, nzlo_out_6, nxhi_out_6, 
-				nyhi_out_6, nzhi_out_6,
-				fix->get_single_buffers());
-      make_rho_none<float,float>(fix->get_single_buffers());      
-    }         
-
-    cg_6->reverse_comm(this, REVERSE_RHO_NONE);
-
-    brick2fft_none();
-
-    if (differentiation_flag == 1) {
-
-      int n = 0;
-      for (int k = 0; k<nsplit_alloc/2; k++) {
-        poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1],
-                        u_brick_none[n],u_brick_none[n+1],
-                        v0_brick_none, v1_brick_none, v2_brick_none,
-                        v3_brick_none, v4_brick_none, v5_brick_none);
-        n += 2;
-      }
-
-      cg_6->forward_comm(this,FORWARD_AD_NONE);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-        fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-        fieldforce_none_ad<double,double>(fix->get_double_buffers());
-      } else {
-        fieldforce_none_ad<float,float>(fix->get_single_buffers());
-      }
-
-      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE);
-
-    } else {
-      int n = 0;
-      for (int k = 0; k<nsplit_alloc/2; k++) {
-
-        poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1],
-                        vdx_brick_none[n], vdy_brick_none[n], 
-			vdz_brick_none[n], vdx_brick_none[n+1], 
-			vdy_brick_none[n+1], vdz_brick_none[n+1],
-                        u_brick_none, v0_brick_none, v1_brick_none, 
-			v2_brick_none, v3_brick_none, v4_brick_none, 
-			v5_brick_none);
-        n += 2;
-      }
-
-      cg_6->forward_comm(this,FORWARD_IK_NONE);
-
-      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
-        fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
-      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
-        fieldforce_none_ik<double,double>(fix->get_double_buffers());
-      } else {
-        fieldforce_none_ik<float,float>(fix->get_single_buffers());
-      }
-
-      if (evflag_atom)
-        cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE);
-    }
-    if (evflag_atom) fieldforce_none_peratom();
-  }
-
-  // update qsum and qsqsum, if atom count has changed and energy needed
-
-  if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) {
-    qsum_qsq();
-    natoms_original = atom->natoms;
-  }
-
-  // sum energy across procs and add in volume-dependent term
-
-  const double qscale = force->qqrd2e * scale;
-  if (eflag_global) {
-    double energy_all;
-    MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
-    energy_1 = energy_all;
-    MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
-    energy_6 = energy_all;
-
-    energy_1 *= 0.5*volume;
-    energy_6 *= 0.5*volume;
-
-    energy_1 -= g_ewald*qsqsum/MY_PIS +
-      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
-    energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij +
-      1.0/12.0*pow(g_ewald_6,6)*csum;
-    energy_1 *= qscale;
-  }
-
-  // sum virial across procs
-
-  if (vflag_global) {
-    double virial_all[6];
-    MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
-    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
-    MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
-    for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
-    if (function[1]+function[2]+function[3]){
-      double a =  MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
-      virial[0] -= a;
-      virial[1] -= a;
-      virial[2] -= a;
-    }
-  }
-
-  if (eflag_atom) {
-    if (function[0]) {
-      double *q = atom->q;
-      for (i = 0; i < atom->nlocal; i++) {
-        eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]*
-	  qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction
-      }
-    }
-    if (function[1] + function[2] + function[3]) {
-      int tmp;
-      for (i = 0; i < atom->nlocal; i++) {
-        tmp = atom->type[i];
-        eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] +
-                      1.0/12.0*pow(g_ewald_6,6)*cii[tmp];
-      }
-    }
-  }
-
-  if (vflag_atom) {
-    if (function[1] + function[2] + function[3]) {
-      int tmp;
-      for (i = 0; i < atom->nlocal; i++) {
-        tmp = atom->type[i];
-	//dispersion self virial correction
-        for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*
-				      pow(g_ewald_6,3)*csumi[tmp]; 
-      }
-    }
-  }
-
-
-  // 2d slab correction
-
-  if (slabflag) slabcorr(eflag);
-  if (function[0]) energy += energy_1;
-  if (function[1] + function[2] + function[3]) energy += energy_6;
-
-  // convert atoms back from lamda to box coords
-
-  if (triclinic) domain->lamda2x(atom->nlocal);
-}
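-
-/* ----------------------------------------------------------------------
-   illustrative sketch of the precision dispatch used throughout
-   compute() above; run_kernel and dispatch_precision are hypothetical
-   stand-ins for the particle_map/make_rho/fieldforce calls
-------------------------------------------------------------------------- */
-
-template <class flt_t, class acc_t>
-static void run_kernel(IntelBuffers<flt_t,acc_t> * /* buffers */) {}
-
-static void dispatch_precision(FixIntel *fix)
-{
-  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
-    run_kernel<float,double>(fix->get_mixed_buffers());    // mixed
-  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-    run_kernel<double,double>(fix->get_double_buffers());  // all double
-  else
-    run_kernel<float,float>(fix->get_single_buffers());    // all single
-}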
-
-
-/* ---------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   find center grid pt for each of my particles
-   check that full stencil for the particle will fit in my 3d brick
-   store central grid pt indices in part2grid array
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t>
-void PPPMDispIntel::particle_map(double delx, double dely, double delz,
-				 double sft, int** p2g, int nup, int nlow,
-				 int nxlo, int nylo, int nzlo,
-				 int nxhi, int nyhi, int nzhi,
-				 IntelBuffers<flt_t,acc_t> *buffers)
-{
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
-    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
-
-  int flag = 0;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\
-	   nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt)
-  #endif
-  {
-    double **x = atom->x;
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delx;
-    const flt_t yi = dely;
-    const flt_t zi = delz;
-    const flt_t fshift = sft;
-
-
-    int iifrom, iito, tid;
-    IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
-
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma vector aligned
-    #pragma simd reduction(+:flag)
-    #endif    
-    for (int i = iifrom; i < iito; i++) {
-
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // current particle coord can be outside global and local box
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET;
-    int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET;
-    int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET;
-
-    p2g[i][0] = nx;
-    p2g[i][1] = ny;
-    p2g[i][2] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-
-    if (nx+nlow < nxlo || nx+nup > nxhi ||
-	ny+nlow < nylo || ny+nup > nyhi ||
-	nz+nlow < nzlo || nz+nup > nzhi)
-      flag = 1;
-  }
-  }
-
-  if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp");
-}
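-
-// Why the OFFSET add/subtract above works: integer conversion in C++
-// truncates toward zero, so (int)(-0.75) == 0, but stencil assignment
-// needs floor(). Shifting by a large positive OFFSET before truncating
-// turns truncation into floor() for any in-range coordinate
-// (floor_via_offset is a hypothetical helper, shown for illustration):
-
-static inline int floor_via_offset(double u)
-{
-  return static_cast<int>(u + OFFSET) - OFFSET;   // == (int) floor(u)
-}
-// e.g. floor_via_offset(-0.75) == -1, while (int)(-0.75) == 0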
-
-/* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
-{
-  // clear 3d density array
-
-  FFT_SCALAR * _noalias global_density = 
-    &(density_brick[nzlo_out][nylo_out][nxlo_out]);
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-
-  //double *q = atom->q;
-  //double **x = atom->x;
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, nlocal, global_density) if(!_use_lrt)
-  #endif
-  {
-    double *q = atom->q;
-    double **x = atom->x;
-
-    const int nix = nxhi_out - nxlo_out + 1;
-    const int niy = nyhi_out - nylo_out + 1;
-    
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv;
-    const flt_t yi = delyinv;
-    const flt_t zi = delzinv;
-    const flt_t fshift = shift;
-    const flt_t fshiftone = shiftone;
-    const flt_t fdelvolinv = delvolinv;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
-      perthread_density[tid - 1];
-    // clear 3d density array
-    memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
-
-    for (int i = ifrom; i < ito; i++) {
-  
-      int nx = part2grid[i][0];
-      int ny = part2grid[i][1];
-      int nz = part2grid[i][2];
-
-      int nysum = nlower + ny - nylo_out;
-      int nxsum = nlower + nx - nxlo_out;
-      int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho_lookup[idx][k];
-          rho[1][k] = rho_lookup[idy][k];
-          rho[2][k] = rho_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower; k <= nupper; k++) {
-          FFT_SCALAR r1,r2,r3;
-          r1 = r2 = r3 = ZEROF;
-  
-          for (int l = order-1; l >= 0; l--) {
-            r1 = rho_coeff[l][k] + r1*dx;
-            r2 = rho_coeff[l][k] + r2*dy;
-            r3 = rho_coeff[l][k] + r3*dz;
-          }
-          rho[0][k-nlower] = r1;
-          rho[1][k-nlower] = r2;
-          rho[2][k-nlower] = r3;
-        }
-      }
-  
-      FFT_SCALAR z0 = fdelvolinv * q[i];
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order; n++) {
-        int mz = n*nix*niy + nzsum;
-        FFT_SCALAR y0 = z0*rho[2][n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order; m++) {
-          int mzy = m*nix + mz;
-          FFT_SCALAR x0 = y0*rho[1][m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mzyx = l + mzy;
-            my_density[mzyx] += x0*rho[0][l];
-          }
-        }
-      }
-    }
-  }
-
-  // reduce all the perthread_densities into global_density
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, global_density) if(!_use_lrt)
-  #endif
-  {
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
-
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      for(int j = 1; j < nthr; j++) {
-        global_density[i] += perthread_density[j-1][i];
-      }
-    }
-  }
-}
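-
-// The spreading loops above walk the ghost-extended brick through a
-// flattened 1-d index (x fastest, then y, then z). Equivalent helper,
-// hypothetical, with nix/niy the x/y extents including ghosts:
-
-static inline int flat_index(int ix, int iy, int iz, int nix, int niy)
-{
-  return iz*nix*niy + iy*nix + ix;
-}
-// my_density[flat_index(ix,iy,iz,nix,niy)] addresses the same cell as
-// density_brick[iz+nzlo_out][iy+nylo_out][ix+nxlo_out]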
-
-/* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = dispersion "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid --- geometric mixing
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
-{
-  // clear 3d density array
-
-  FFT_SCALAR * _noalias global_density = 
-    &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, nlocal, global_density) if(!_use_lrt)
-  #endif
-  {
-    int type;
-    double **x = atom->x;
-    
-    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
-    const int niy = nyhi_out_6 - nylo_out_6 + 1;
-    
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshift = shift_6;
-    const flt_t fshiftone = shiftone_6;
-    const flt_t fdelvolinv = delvolinv_6;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
-      perthread_density[tid - 1];
-
-    // clear 3d density array
-    memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
-
-    for (int i = ifrom; i < ito; i++) {
-  
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nysum = nlower_6 + ny - nylo_out_6;
-      int nxsum = nlower_6 + nx - nxlo_out_6;
-      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3;
-          r1 = r2 = r3 = ZEROF;
-  
-          for (int l = order_6-1; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-        }
-      }
-
-      type = atom->type[i];
-      FFT_SCALAR z0 = fdelvolinv * B[type];
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n*nix*niy + nzsum;
-        FFT_SCALAR y0 = z0*rho[2][n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int mzy = m*nix + mz;
-          FFT_SCALAR x0 = y0*rho[1][m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mzyx = l + mzy;
-            my_density[mzyx] += x0*rho[0][l];
-          }
-        }
-      }
-    }
-  }
-
-  // reduce all the perthread_densities into global_density
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, global_density) if(!_use_lrt)
-  #endif
-  {
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
-
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      for(int j = 1; j < nthr; j++) {
-        global_density[i] += perthread_density[j-1][i];
-      }
-    }
-  }
-  
-}
-
-/* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = dispersion "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid --- arithmetic mixing
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
-{
-  // clear 3d density array
-
-  memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-  memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
-	 ngrid_6*sizeof(FFT_SCALAR));
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-
-  int nlocal = atom->nlocal;
-
-    double **x = atom->x;
-    
-    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
-    const int niy = nyhi_out_6 - nylo_out_6 + 1;
-    
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshift = shift_6;
-    const flt_t fshiftone = shiftone_6;
-    const flt_t fdelvolinv = delvolinv_6;
-
-    for (int i = 0; i < nlocal; i++) {
-  
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3;
-          r1 = r2 = r3 = ZEROF;
-  
-          for (int l = order_6-1; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-        }
-      }
-
-      const int type = atom->type[i];
-      FFT_SCALAR z0 = fdelvolinv;
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n + nzsum;
-        FFT_SCALAR y0 = z0*rho[2][n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int my = m + nysum;
-          FFT_SCALAR x0 = y0*rho[1][m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l + nxsum;
-	    FFT_SCALAR w = x0*rho[0][l];
-            density_brick_a0[mz][my][mx] += w*B[7*type];
-	    density_brick_a1[mz][my][mx] += w*B[7*type+1];
-	    density_brick_a2[mz][my][mx] += w*B[7*type+2];
-	    density_brick_a3[mz][my][mx] += w*B[7*type+3];
-	    density_brick_a4[mz][my][mx] += w*B[7*type+4];
-	    density_brick_a5[mz][my][mx] += w*B[7*type+5];
-	    density_brick_a6[mz][my][mx] += w*B[7*type+6];
-          }
-        }
-      }
-    }
-}
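-
-// Arithmetic mixing splits the r^-6 interaction into seven per-type
-// channels: each stencil weight w is scattered into density_brick_a0..a6
-// scaled by B[7*type+0..6], and the Poisson stage recombines the bricks
-// pairwise (a0 with a6, a1 with a5, a2 with a4, a3 on its own), as in
-// the poisson_2s_* calls in compute(). Equivalent scatter for one grid
-// point, written against a hypothetical brick[7] array of pointers:
-//
-//   for (int c = 0; c < 7; c++)
-//     brick[c][mz][my][mx] += w * B[7*type + c];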
-
-/* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = dispersion "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid --- case when mixing rules don't apply
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  FFT_SCALAR * _noalias global_density =
-    &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]);
-
-  // loop over my charges, add their contribution to nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, nlocal, global_density) if(!_use_lrt)
-  #endif
-  {
-    int type;
-    double **x = atom->x;
-    
-    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
-    const int niy = nyhi_out_6 - nylo_out_6 + 1;
-    
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshift = shift_6;
-    const flt_t fshiftone = shiftone_6;
-    const flt_t fdelvolinv = delvolinv_6;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
-      perthread_density[tid - 1];
-    // clear 3d density array
-    memset(my_density, 0, ngrid_6 * nsplit * sizeof(FFT_SCALAR));
-
-    for (int i = ifrom; i < ito; i++) {
-  
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nysum = nlower_6 + ny - nylo_out_6;
-      int nxsum = nlower_6 + nx - nxlo_out_6;
-      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3;
-          r1 = r2 = r3 = ZEROF;
-  
-          for (int l = order_6-1; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-        }
-      }
-
-      type = atom->type[i];
-      FFT_SCALAR z0 = fdelvolinv;
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n*nix*niy + nzsum;
-        FFT_SCALAR y0 = z0*rho[2][n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int mzy = m*nix + mz;
-          FFT_SCALAR x0 = y0*rho[1][m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mzyx = l + mzy;
-            FFT_SCALAR w0 = x0*rho[0][l];
-            for(int k = 0; k < nsplit; k++)
-              my_density[mzyx + k*ngrid_6] += w0*B[nsplit*type + k];
-          }
-        }
-      }
-    }
-  }
-
-  // reduce all the perthread_densities into global_density
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nthr, global_density) if(!_use_lrt)
-  #endif
-  {
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
-
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      for(int j = 1; j < nthr; j++) {
-        global_density[i] += perthread_density[j-1][i];
-      }
-    }
-  }
-  
-}
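-
-/* ----------------------------------------------------------------------
-   illustrative sketch of the scratch-array pattern shared by the
-   make_rho_* variants above: thread 0 accumulates straight into the
-   global brick, thread t > 0 into perthread_density[t-1], and a second
-   parallel pass folds the scratch rows back in; reduce_density and its
-   arguments are hypothetical names
-------------------------------------------------------------------------- */
-
-static void reduce_density(FFT_SCALAR * _noalias global,
-                           FFT_SCALAR ** _noalias scratch,
-                           int npts, int nthreads)
-{
-  #if defined(_OPENMP)
-  #pragma omp parallel for
-  #endif
-  for (int i = 0; i < npts; i++)
-    for (int t = 1; t < nthreads; t++)
-      global[i] += scratch[t-1][i];   // scratch[t-1] belongs to thread t
-}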
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-   for ik scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of E-field on particle
-
-  //double *q = atom->q;
-  //double **x = atom->x;
-  //double **f = atom->f;
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double *q = atom->q;
-    double **x = atom->x;
-    double **f = atom->f;
-  
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv;
-    const flt_t yi = delyinv;
-    const flt_t zi = delzinv;
-    const flt_t fshiftone = shiftone;
-    const flt_t fqqrd2es = qqrd2e * scale;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid[i][0];
-      int ny = part2grid[i][1];
-      int nz = part2grid[i][2];
-
-      int nxsum = nx + nlower;
-      int nysum = ny + nlower;
-      int nzsum = nz + nlower;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho0[k] = rho_lookup[idx][k];
-          rho1[k] = rho_lookup[idy][k];
-          rho2[k] = rho_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = nlower; k <= nupper; k++) {
-          FFT_SCALAR r1 = rho_coeff[order-1][k];
-          FFT_SCALAR r2 = rho_coeff[order-1][k];
-          FFT_SCALAR r3 = rho_coeff[order-1][k];
-          for (int l = order-2; l >= 0; l--) {
-            r1 = rho_coeff[l][k] + r1*dx;
-            r2 = rho_coeff[l][k] + r2*dy;
-            r3 = rho_coeff[l][k] + r3*dz;
-          }
-
-          rho0[k-nlower] = r1;
-          rho1[k-nlower] = r2;
-          rho2[k-nlower] = r3;
-        }
-      }
-
-      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order; n++) {
-        int mz = n+nzsum;
-        FFT_SCALAR z0 = rho2[n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order; m++) {
-          int my = m+nysum;
-          FFT_SCALAR y0 = z0*rho1[m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l+nxsum;
-            FFT_SCALAR x0 = y0*rho0[l];
-            ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
-            eky_arr[l] -= x0*vdy_brick[mz][my][mx];
-            ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
-          }
-        }
-      }
-
-      FFT_SCALAR ekx, eky, ekz;
-      ekx = eky = ekz = ZEROF;
-
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        ekx += ekx_arr[l];
-        eky += eky_arr[l];
-        ekz += ekz_arr[l];
-      }
-
-      // convert E-field to force
-
-      const flt_t qfactor = fqqrd2es * q[i];
-      f[i][0] += qfactor*ekx;
-      f[i][1] += qfactor*eky;
-      if (slabflag != 2) f[i][2] += qfactor*ekz;
-    }
-  }
-}
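-
-/* ----------------------------------------------------------------------
-   illustrative sketch of the ik-scheme gather above: the same separable
-   stencil weights used to spread the charge pull the three field bricks
-   back to the particle, and the force is F = qE; gather_field and its
-   arguments are hypothetical names
-------------------------------------------------------------------------- */
-
-static void gather_field(FFT_SCALAR ***vdx, FFT_SCALAR ***vdy,
-                         FFT_SCALAR ***vdz, const FFT_SCALAR *r0,
-                         const FFT_SCALAR *r1, const FFT_SCALAR *r2,
-                         int ord, int nxs, int nys, int nzs,
-                         FFT_SCALAR *ek)
-{
-  ek[0] = ek[1] = ek[2] = ZEROF;
-  for (int n = 0; n < ord; n++)
-    for (int m = 0; m < ord; m++)
-      for (int l = 0; l < ord; l++) {
-        const FFT_SCALAR w = r2[n]*r1[m]*r0[l];
-        ek[0] -= w * vdx[n+nzs][m+nys][l+nxs];
-        ek[1] -= w * vdy[n+nzs][m+nys][l+nxs];
-        ek[2] -= w * vdz[n+nzs][m+nys][l+nxs];
-      }
-}
-// then f[i][0] += qqrd2e*scale*q[i]*ek[0] etc., with the z component
-// skipped when slabflag == 2 (see the loop above)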
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-   for ad scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of E-field on particle
-
-  //double *q = atom->q;
-  //double **x = atom->x;
-  //double **f = atom->f;
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
-  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
-  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double *prd;
-    if (triclinic == 0) prd = domain->prd;
-    else prd = domain->prd_lamda;
-    
-    double *q = atom->q;
-    double **x = atom->x;
-    double **f = atom->f;    
-    const flt_t ftwo_pi = MY_PI * 2.0;
-    const flt_t ffour_pi = MY_PI * 4.0;
-
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv;
-    const flt_t yi = delyinv;
-    const flt_t zi = delzinv;
-    const flt_t fshiftone = shiftone;
-    const flt_t fqqrd2es = qqrd2e * scale;
-
-    const double xprd = prd[0];
-    const double yprd = prd[1];
-    const double zprd = prd[2]*slab_volfactor;
-
-    const flt_t hx_inv = nx_pppm/xprd;
-    const flt_t hy_inv = ny_pppm/yprd;
-    const flt_t hz_inv = nz_pppm/zprd;
-
-    const flt_t fsf_coeff0 = sf_coeff[0];
-    const flt_t fsf_coeff1 = sf_coeff[1];
-    const flt_t fsf_coeff2 = sf_coeff[2];
-    const flt_t fsf_coeff3 = sf_coeff[3];
-    const flt_t fsf_coeff4 = sf_coeff[4];
-    const flt_t fsf_coeff5 = sf_coeff[5];
-  
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid[i][0];
-      int ny = part2grid[i][1];
-      int nz = part2grid[i][2];
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      int nxsum = nx + nlower;
-      int nysum = ny + nlower;
-      int nzsum = nz + nlower;
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho_lookup[idx][k];
-          rho[1][k] = rho_lookup[idy][k];
-          rho[2][k] = rho_lookup[idz][k];
-          drho[0][k] = drho_lookup[idx][k];
-          drho[1][k] = drho_lookup[idy][k];
-          drho[2][k] = drho_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower; k <= nupper; k++) {
-          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
-          dr1 = dr2 = dr3 = ZEROF;
-  
-          r1 = rho_coeff[order-1][k];
-          r2 = rho_coeff[order-1][k];
-          r3 = rho_coeff[order-1][k];
-          for (int l = order-2; l >= 0; l--) {
-            r1 = rho_coeff[l][k] + r1 * dx;
-            r2 = rho_coeff[l][k] + r2 * dy;
-            r3 = rho_coeff[l][k] + r3 * dz;
-            dr1 = drho_coeff[l][k] + dr1 * dx;
-            dr2 = drho_coeff[l][k] + dr2 * dy;
-            dr3 = drho_coeff[l][k] + dr3 * dz;
-          }
-          rho[0][k-nlower] = r1;
-          rho[1][k-nlower] = r2;
-          rho[2][k-nlower] = r3;
-          drho[0][k-nlower] = dr1;
-          drho[1][k-nlower] = dr2;
-          drho[2][k-nlower] = dr3;
-        }
-      }
-      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order; n++) {
-        int mz = n + nzsum;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order; m++) {
-          int my = m + nysum;
-          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
-          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
-          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l + nxsum;
-            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
-            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
-            ekz[l] +=  rho[0][l] * ekz_p * u_brick[mz][my][mx];
-          }
-        }
-      }
-  
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        particle_ekx[i] += ekx[l];
-        particle_eky[i] += eky[l];
-        particle_ekz[i] += ekz[l];
-      }
-    }
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      particle_ekx[i] *= hx_inv;
-      particle_eky[i] *= hy_inv;
-      particle_ekz[i] *= hz_inv;
-  
-      // convert E-field to force
-  
-      const flt_t qfactor = fqqrd2es * q[i];
-      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
-  
-      const flt_t s1 = x[i][0] * hx_inv;
-      const flt_t s2 = x[i][1] * hy_inv;
-      const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
-      sf *= twoqsq;
-      f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;
-  
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
-      sf *= twoqsq;
-      f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;
-  
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
-      sf *= twoqsq;
-  
-      if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
-    }
-  }
-}
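
Each fieldforce variant in this file builds its per-axis stencil weights with the same Horner recurrence over the assignment-polynomial coefficients. A minimal standalone sketch of that recurrence follows; the coefficient values are placeholders, not the actual rho_coeff data.

```cpp
#include <cstdio>

int main() {
  const int order = 3;               // stencil size (the code supports up to 7)
  // rho_coeff[l][k]: coefficient of x^l for stencil point k (placeholders)
  double rho_coeff[3][3] = {{0.125,  0.75, 0.125},
                            {-0.5,   0.0,  0.5  },
                            { 0.5,  -1.0,  0.5  }};
  double dx = 0.3;                   // fractional offset to the grid point
  for (int k = 0; k < order; k++) {
    double r = rho_coeff[order-1][k];
    for (int l = order-2; l >= 0; l--)
      r = rho_coeff[l][k] + r*dx;    // Horner step, as in the loops above
    printf("weight[%d] = %g\n", k, r);
  }
  return 0;
}
```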
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for geometric mixing rule
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double lj;
-    int type;
-    double **x = atom->x;
-    double **f = atom->f;
-  
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho0[k] = rho6_lookup[idx][k];
-          rho1[k] = rho6_lookup[idy][k];
-          rho2[k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-
-          rho0[k-nlower_6] = r1;
-          rho1[k-nlower_6] = r2;
-          rho2[k-nlower_6] = r3;
-        }
-      }
-
-      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n+nzsum;
-        FFT_SCALAR z0 = rho2[n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int my = m+nysum;
-          FFT_SCALAR y0 = z0*rho1[m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l+nxsum;
-            FFT_SCALAR x0 = y0*rho0[l];
-            ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
-            eky_arr[l] -= x0*vdy_brick_g[mz][my][mx];
-            ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx];
-          }
-        }
-      }
-
-      FFT_SCALAR ekx, eky, ekz;
-      ekx = eky = ekz = ZEROF;
-
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        ekx += ekx_arr[l];
-        eky += eky_arr[l];
-        ekz += ekz_arr[l];
-      }
-
-      // convert E-field to force
-
-      type = atom->type[i];
-      lj = B[type];
-      f[i][0] += lj*ekx;
-      f[i][1] += lj*eky;
-      if (slabflag != 2) f[i][2] += lj*ekz;
-    }
-  }
-}
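
For the ik scheme the field components are already differentiated on the grid (the vdx/vdy/vdz bricks), so interpolation reduces to a weighted sum with the undifferentiated stencil weights. A runnable 1-D toy analogue of that kernel, with made-up grid values:

```cpp
#include <cstdio>

int main() {
  double vd_brick[5] = {0.0, 0.1, 0.4, 0.1, 0.0};  // pre-differentiated field
  double rho[3] = {0.125, 0.75, 0.125};            // stencil weights
  int nxsum = 1;                                   // lower stencil corner
  double ek = 0.0;
  for (int l = 0; l < 3; l++)
    ek -= rho[l] * vd_brick[l + nxsum];            // same sign convention
  printf("ek = %g\n", ek);
  return 0;
}
```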
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for geometric mixing rule for ad scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle  
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
-  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
-  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double *prd;
-    if (triclinic == 0) prd = domain->prd;
-    else prd = domain->prd_lamda;
-    
-    double **x = atom->x;
-    double **f = atom->f;
-    const flt_t ftwo_pi = MY_PI * 2.0;
-    const flt_t ffour_pi = MY_PI * 4.0;
-
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    const double xprd = prd[0];
-    const double yprd = prd[1];
-    const double zprd = prd[2]*slab_volfactor;
-
-    const flt_t hx_inv = nx_pppm_6/xprd;
-    const flt_t hy_inv = ny_pppm_6/yprd;
-    const flt_t hz_inv = nz_pppm_6/zprd;
-
-    const flt_t fsf_coeff0 = sf_coeff_6[0];
-    const flt_t fsf_coeff1 = sf_coeff_6[1];
-    const flt_t fsf_coeff2 = sf_coeff_6[2];
-    const flt_t fsf_coeff3 = sf_coeff_6[3];
-    const flt_t fsf_coeff4 = sf_coeff_6[4];
-    const flt_t fsf_coeff5 = sf_coeff_6[5];
-  
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-          drho[0][k] = drho6_lookup[idx][k];
-          drho[1][k] = drho6_lookup[idy][k];
-          drho[2][k] = drho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
-          dr1 = dr2 = dr3 = ZEROF;
-  
-          r1 = rho_coeff_6[order_6-1][k];
-          r2 = rho_coeff_6[order_6-1][k];
-          r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1 * dx;
-            r2 = rho_coeff_6[l][k] + r2 * dy;
-            r3 = rho_coeff_6[l][k] + r3 * dz;
-            dr1 = drho_coeff_6[l][k] + dr1 * dx;
-            dr2 = drho_coeff_6[l][k] + dr2 * dy;
-            dr3 = drho_coeff_6[l][k] + dr3 * dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-          drho[0][k-nlower_6] = dr1;
-          drho[1][k-nlower_6] = dr2;
-          drho[2][k-nlower_6] = dr3;
-        }
-      }
-      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n + nzsum;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int my = m + nysum;
-          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
-          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
-          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l + nxsum;
-            ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
-            eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
-            ekz[l] +=  rho[0][l] * ekz_p * u_brick_g[mz][my][mx];
-          }
-        }
-      }
-  
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        particle_ekx[i] += ekx[l];
-        particle_eky[i] += eky[l];
-        particle_ekz[i] += ekz[l];
-      }
-    }
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      particle_ekx[i] *= hx_inv;
-      particle_eky[i] *= hy_inv;
-      particle_ekz[i] *= hz_inv;
-  
-      // convert E-field to force
-
-      const int type = atom->type[i];
-      const flt_t lj = B[type];
-      const flt_t twoljsq = 2.*lj*lj;
-  
-      const flt_t s1 = x[i][0] * hx_inv;
-      const flt_t s2 = x[i][1] * hy_inv;
-      const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
-      sf *= twoljsq;
-      f[i][0] += lj * particle_ekx[i] - sf;
-  
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
-      sf *= twoljsq;
-      f[i][1] += lj * particle_eky[i] - sf;
-  
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
-      sf *= twoljsq;
-  
-      if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf;
-    }
-  }
-}
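
The ad scheme subtracts a per-axis self-force correction in the loop above: a short Fourier series in the scaled coordinate, multiplied by twice the squared dispersion prefactor. A sketch of one axis, with hypothetical values standing in for the sf_coeff_6 entries (assumes POSIX M_PI):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const double two_pi = 2.0*M_PI, four_pi = 4.0*M_PI;
  double c0 = 1.0e-4, c1 = -2.0e-5;  // hypothetical sf coefficients
  double lj = 0.8;                   // per-type dispersion prefactor B[type]
  double s  = 12.3;                  // x[i][0] * hx_inv
  double sf = c0*sin(two_pi*s) + c1*sin(four_pi*s);
  sf *= 2.0*lj*lj;                   // the twoljsq factor from the loop
  printf("self-force correction = %g\n", sf);
  return 0;
}
```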
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for arithmetic mixing rule and ik scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-    double **x = atom->x;
-    double **f = atom->f;
-  
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho0[k] = rho6_lookup[idx][k];
-          rho1[k] = rho6_lookup[idy][k];
-          rho2[k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-
-          rho0[k-nlower_6] = r1;
-          rho1[k-nlower_6] = r2;
-          rho2[k-nlower_6] = r3;
-        }
-      }
-
-      _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n+nzsum;
-        FFT_SCALAR z0 = rho2[n];
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int my = m+nysum;
-          FFT_SCALAR y0 = z0*rho1[m];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l+nxsum;
-            FFT_SCALAR x0 = y0*rho0[l];
-            ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
-            eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx];
-            ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx];
-            ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx];
-            eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx];
-            ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx];
-            ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx];
-            eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx];
-            ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx];
-            ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx];
-            eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx];
-            ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx];
-            ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx];
-            eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx];
-            ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx];
-            ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx];
-            eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx];
-            ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx];
-            ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx];
-            eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx];
-            ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx];
-          }
-        }
-      }
-
-      FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
-      FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
-      FFT_SCALAR ekx6, eky6, ekz6;
-      ekx0 = eky0 = ekz0 = ZEROF;
-      ekx1 = eky1 = ekz1 = ZEROF;
-      ekx2 = eky2 = ekz2 = ZEROF;
-      ekx3 = eky3 = ekz3 = ZEROF;
-      ekx4 = eky4 = ekz4 = ZEROF;
-      ekx5 = eky5 = ekz5 = ZEROF;
-      ekx6 = eky6 = ekz6 = ZEROF;
-
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        ekx0 += ekx0_arr[l];
-        eky0 += eky0_arr[l];
-        ekz0 += ekz0_arr[l];
-        ekx1 += ekx1_arr[l];
-        eky1 += eky1_arr[l];
-        ekz1 += ekz1_arr[l];
-        ekx2 += ekx2_arr[l];
-        eky2 += eky2_arr[l];
-        ekz2 += ekz2_arr[l];
-        ekx3 += ekx3_arr[l];
-        eky3 += eky3_arr[l];
-        ekz3 += ekz3_arr[l];
-        ekx4 += ekx4_arr[l];
-        eky4 += eky4_arr[l];
-        ekz4 += ekz4_arr[l];
-        ekx5 += ekx5_arr[l];
-        eky5 += eky5_arr[l];
-        ekz5 += ekz5_arr[l];
-        ekx6 += ekx6_arr[l];
-        eky6 += eky6_arr[l];
-        ekz6 += ekz6_arr[l];
-      }
-
-      // convert D-field to force
-
-      const int type = atom->type[i];
-      const FFT_SCALAR lj0 = B[7*type+6];
-      const FFT_SCALAR lj1 = B[7*type+5];
-      const FFT_SCALAR lj2 = B[7*type+4];
-      const FFT_SCALAR lj3 = B[7*type+3];
-      const FFT_SCALAR lj4 = B[7*type+2];
-      const FFT_SCALAR lj5 = B[7*type+1];
-      const FFT_SCALAR lj6 = B[7*type];
-
-      f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 +
-                 lj4*ekx4 + lj5*ekx5 + lj6*ekx6;
-      f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 +
-                 lj4*eky4 + lj5*eky5 + lj6*eky6;
-      if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 +
-                           lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6;
-    }
-  }
-}
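
For arithmetic mixing, B holds seven coefficients per atom type and the force is a dot product of those coefficients with the seven per-brick field sums; note the reversed indexing above (lj0 = B[7*type+6] down to lj6 = B[7*type]). A runnable sketch of just that indexing, with made-up data:

```cpp
#include <cstdio>

int main() {
  const int ntypes = 2;
  double B[7*(ntypes+1)];                  // LAMMPS types are 1-based
  for (int i = 0; i < 7*(ntypes+1); i++) B[i] = 0.01*i;   // placeholders
  double ek[7] = {1, 2, 3, 4, 5, 6, 7};    // per-brick field sums (made up)
  int type = 1;
  double fx = 0.0;
  for (int k = 0; k < 7; k++)
    fx += B[7*type + 6 - k] * ek[k];       // lj_k = B[7*type + 6 - k]
  printf("fx = %g\n", fx);
  return 0;
}
```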
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for arithmetic mixing rule for the ad scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle  
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0;
-  FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0;
-  FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0;
-  FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1;
-  FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1;
-  FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1;  
-  FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2;
-  FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2;
-  FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2;
-  FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3;
-  FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3;
-  FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3;
-  FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4;
-  FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4;
-  FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4;
-  FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5;
-  FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5;
-  FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5;
-  FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6;
-  FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6;
-  FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6;
-  
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double *prd;
-    if (triclinic == 0) prd = domain->prd;
-    else prd = domain->prd_lamda;
-    
-    double **x = atom->x;
-    double **f = atom->f;
-    const flt_t ftwo_pi = MY_PI * 2.0;
-    const flt_t ffour_pi = MY_PI * 4.0;
-
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    const double xprd = prd[0];
-    const double yprd = prd[1];
-    const double zprd = prd[2]*slab_volfactor;
-
-    const flt_t hx_inv = nx_pppm_6/xprd;
-    const flt_t hy_inv = ny_pppm_6/yprd;
-    const flt_t hz_inv = nz_pppm_6/zprd;
-
-    const flt_t fsf_coeff0 = sf_coeff_6[0];
-    const flt_t fsf_coeff1 = sf_coeff_6[1];
-    const flt_t fsf_coeff2 = sf_coeff_6[2];
-    const flt_t fsf_coeff3 = sf_coeff_6[3];
-    const flt_t fsf_coeff4 = sf_coeff_6[4];
-    const flt_t fsf_coeff5 = sf_coeff_6[5];
-  
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-          drho[0][k] = drho6_lookup[idx][k];
-          drho[1][k] = drho6_lookup[idy][k];
-          drho[2][k] = drho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
-          dr1 = dr2 = dr3 = ZEROF;
-  
-          r1 = rho_coeff_6[order_6-1][k];
-          r2 = rho_coeff_6[order_6-1][k];
-          r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1 * dx;
-            r2 = rho_coeff_6[l][k] + r2 * dy;
-            r3 = rho_coeff_6[l][k] + r3 * dz;
-            dr1 = drho_coeff_6[l][k] + dr1 * dx;
-            dr2 = drho_coeff_6[l][k] + dr2 * dy;
-            dr3 = drho_coeff_6[l][k] + dr3 * dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-          drho[0][k-nlower_6] = dr1;
-          drho[1][k-nlower_6] = dr2;
-          drho[2][k-nlower_6] = dr3;
-        }
-      }
-      _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};      
-      _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-      _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};     
-
-      particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF;
-      particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; 
-      particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF;
-      particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF;
-      particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF;
-      particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF;
-      particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
-      
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma loop_count=7
-      #endif   
-      for (int n = 0; n < order_6; n++) {
-        int mz = n + nzsum;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-        for (int m = 0; m < order_6; m++) {
-          int my = m + nysum;
-          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
-          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
-          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma simd
-          #endif   
-          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-            int mx = l + nxsum;
-            FFT_SCALAR x0 = drho[0][l] * ekx_p;
-            FFT_SCALAR y0 = rho[0][l] * eky_p;
-            FFT_SCALAR z0 = rho[0][l] * ekz_p;
-
-            ekx0[l] +=  x0 * u_brick_a0[mz][my][mx];
-            eky0[l] +=  y0 * u_brick_a0[mz][my][mx];
-            ekz0[l] +=  z0 * u_brick_a0[mz][my][mx];
-            ekx1[l] +=  x0 * u_brick_a1[mz][my][mx];
-            eky1[l] +=  y0 * u_brick_a1[mz][my][mx];
-            ekz1[l] +=  z0 * u_brick_a1[mz][my][mx];
-            ekx2[l] +=  x0 * u_brick_a2[mz][my][mx];
-            eky2[l] +=  y0 * u_brick_a2[mz][my][mx];
-            ekz2[l] +=  z0 * u_brick_a2[mz][my][mx];
-            ekx3[l] +=  x0 * u_brick_a3[mz][my][mx];
-            eky3[l] +=  y0 * u_brick_a3[mz][my][mx];
-            ekz3[l] +=  z0 * u_brick_a3[mz][my][mx];
-            ekx4[l] +=  x0 * u_brick_a4[mz][my][mx];
-            eky4[l] +=  y0 * u_brick_a4[mz][my][mx];
-            ekz4[l] +=  z0 * u_brick_a4[mz][my][mx];
-            ekx5[l] +=  x0 * u_brick_a5[mz][my][mx];
-            eky5[l] +=  y0 * u_brick_a5[mz][my][mx];
-            ekz5[l] +=  z0 * u_brick_a5[mz][my][mx];
-            ekx6[l] +=  x0 * u_brick_a6[mz][my][mx];
-            eky6[l] +=  y0 * u_brick_a6[mz][my][mx];
-            ekz6[l] +=  z0 * u_brick_a6[mz][my][mx];
-          }
-        }
-      }
-  
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        particle_ekx0[i] += ekx0[l];
-        particle_eky0[i] += eky0[l];
-        particle_ekz0[i] += ekz0[l];
-        particle_ekx1[i] += ekx1[l];
-        particle_eky1[i] += eky1[l];
-        particle_ekz1[i] += ekz1[l];
-        particle_ekx2[i] += ekx2[l];
-        particle_eky2[i] += eky2[l];
-        particle_ekz2[i] += ekz2[l];
-        particle_ekx3[i] += ekx3[l];
-        particle_eky3[i] += eky3[l];
-        particle_ekz3[i] += ekz3[l];
-        particle_ekx4[i] += ekx4[l];
-        particle_eky4[i] += eky4[l];
-        particle_ekz4[i] += ekz4[l];
-        particle_ekx5[i] += ekx5[l];
-        particle_eky5[i] += eky5[l];
-        particle_ekz5[i] += ekz5[l];
-        particle_ekx6[i] += ekx6[l];
-        particle_eky6[i] += eky6[l];
-        particle_ekz6[i] += ekz6[l];
-      }
-    }
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int i = ifrom; i < ito; i++) {
-      particle_ekx0[i] *= hx_inv;
-      particle_eky0[i] *= hy_inv;
-      particle_ekz0[i] *= hz_inv;
-      particle_ekx1[i] *= hx_inv;
-      particle_eky1[i] *= hy_inv;
-      particle_ekz1[i] *= hz_inv;
-      particle_ekx2[i] *= hx_inv;
-      particle_eky2[i] *= hy_inv;
-      particle_ekz2[i] *= hz_inv;
-      particle_ekx3[i] *= hx_inv;
-      particle_eky3[i] *= hy_inv;
-      particle_ekz3[i] *= hz_inv;
-      particle_ekx4[i] *= hx_inv;
-      particle_eky4[i] *= hy_inv;
-      particle_ekz4[i] *= hz_inv;
-      particle_ekx5[i] *= hx_inv;
-      particle_eky5[i] *= hy_inv;
-      particle_ekz5[i] *= hz_inv;
-      particle_ekx6[i] *= hx_inv;
-      particle_eky6[i] *= hy_inv;
-      particle_ekz6[i] *= hz_inv;      
-  
-      // convert D-field to force
-
-      const int type = atom->type[i];
-      const FFT_SCALAR lj0 = B[7*type+6];
-      const FFT_SCALAR lj1 = B[7*type+5];
-      const FFT_SCALAR lj2 = B[7*type+4];
-      const FFT_SCALAR lj3 = B[7*type+3];
-      const FFT_SCALAR lj4 = B[7*type+2];
-      const FFT_SCALAR lj5 = B[7*type+1];
-      const FFT_SCALAR lj6 = B[7*type];
-  
-      const flt_t s1 = x[i][0] * hx_inv;
-      const flt_t s2 = x[i][1] * hy_inv;
-      const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf += fsf_coeff1 * sin(ffour_pi * s1);
-      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
-      f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] +
-        lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] +
-        lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf;
-  
-      sf = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf += fsf_coeff3 * sin(ffour_pi * s2);
-      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
-      f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] +
-        lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] +
-        lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf;
-  
-      sf = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf += fsf_coeff5 * sin(ffour_pi * s3);
-      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
-      if (slabflag != 2)
-        f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] +
-          lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] +
-          lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf;
-    }
-  }
-}
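
All of these kernels split the local atoms across threads with IP_PRE_omp_range_id before the main loop. A sketch of the assumed semantics (the actual macro lives in intel_preprocess.h and may differ in rounding details):

```cpp
#include <algorithm>
#include <cstdio>

int main() {
  int nlocal = 100, nthr = 8;
  for (int tid = 0; tid < nthr; tid++) {   // stand-in for omp_get_thread_num()
    int chunk = (nlocal + nthr - 1) / nthr;        // ceiling division
    int ifrom = std::min(nlocal, tid * chunk);
    int ito   = std::min(nlocal, ifrom + chunk);
    printf("thread %d handles [%d, %d)\n", tid, ifrom, ito);
  }
  return 0;
}
```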
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for no mixing rule and ik scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
-{
-
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double lj;
-    int type;
-    double **x = atom->x;
-    double **f = atom->f;
-  
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
-
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho0[k] = rho6_lookup[idx][k];
-          rho1[k] = rho6_lookup[idy][k];
-          rho2[k] = rho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
-          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1*dx;
-            r2 = rho_coeff_6[l][k] + r2*dy;
-            r3 = rho_coeff_6[l][k] + r3*dz;
-          }
-
-          rho0[k-nlower_6] = r1;
-          rho1[k-nlower_6] = r2;
-          rho2[k-nlower_6] = r3;
-        }
-      }
-
-
-      _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
-      _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
-      _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
-
-      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
-	ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF;
-      }
-
-      for (int k = 0; k < nsplit; k++) {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-	for (int n = 0; n < order_6; n++) {
-	  int mz = n+nzsum;
-	  FFT_SCALAR z0 = rho2[n];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
-          #endif   
-	  for (int m = 0; m < order_6; m++) {
-	    int my = m+nysum;
-	    FFT_SCALAR y0 = z0*rho1[m];
-            #if defined(LMP_SIMD_COMPILER)
-            #pragma simd
-            #endif   
-	    for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-	      int mx = l+nxsum;
-	      FFT_SCALAR x0 = y0*rho0[l];
-              ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
-		x0*vdx_brick_none[k][mz][my][mx];
-              eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
-		x0*vdy_brick_none[k][mz][my][mx];
-              ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
-		x0*vdz_brick_none[k][mz][my][mx];
-	    }
-	  }
-	}
-      }
-
-      _alignvar(FFT_SCALAR ekx[nsplit], 64);
-      _alignvar(FFT_SCALAR eky[nsplit], 64);
-      _alignvar(FFT_SCALAR ekz[nsplit], 64);
-      for (int k = 0; k < nsplit; k++) {
-	ekx[k] = eky[k] = ekz[k] = ZEROF;
-      }      
-
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-        for (int k = 0; k < nsplit; k++) {
-          ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-          eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-          ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
-        }
-      }
-
-      // convert E-field to force
-
-      type = atom->type[i];
-      for (int k = 0; k < nsplit; k++) {
-	lj = B[nsplit*type + k];
-	f[i][0] += lj*ekx[k];
-	f[i][1] += lj*eky[k];
-	if (slabflag != 2) f[i][2] += lj*ekz[k];
-      }
-    }
-  }
-}
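
With no mixing rule the dispersion sum is split into nsplit separate grids, and each split term k carries its own per-type weight B[nsplit*type + k]. A small runnable sketch of that bookkeeping, with illustrative values:

```cpp
#include <cstdio>

int main() {
  const int nsplit = 2, ntypes = 2;
  double B[nsplit*(ntypes+1)];
  for (int i = 0; i < nsplit*(ntypes+1); i++) B[i] = 0.1*i;  // placeholders
  double ekx[nsplit] = {0.5, -0.25};   // per-split field sums (made up)
  int type = 2;
  double fx = 0.0;
  for (int k = 0; k < nsplit; k++)
    fx += B[nsplit*type + k] * ekx[k];
  printf("fx = %g\n", fx);
  return 0;
}
```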
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get dispersion field & force on my particles
-   for no mixing rule for the ad scheme
-------------------------------------------------------------------------- */
-
-template<class flt_t, class acc_t, int use_table>
-void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
-{
-  // loop over my charges, interpolate electric field from nearby grid points
-  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-  // (dx,dy,dz) = distance to "lower left" grid pt
-  // (mx,my,mz) = global coords of moving stencil pt
-  // ek = 3 components of dispersion field on particle  
-
-  int nlocal = atom->nlocal;
-  int nthr = comm->nthreads;
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) \
-    shared(nlocal, nthr) if(!_use_lrt)
-  #endif
-  {
-
-    double *prd;
-    if (triclinic == 0) prd = domain->prd;
-    else prd = domain->prd_lamda;
-    
-    double **x = atom->x;
-    double **f = atom->f;
-    const flt_t ftwo_pi = MY_PI * 2.0;
-    const flt_t ffour_pi = MY_PI * 4.0;
-
-    const flt_t lo0 = boxlo[0];
-    const flt_t lo1 = boxlo[1];
-    const flt_t lo2 = boxlo[2];
-    const flt_t xi = delxinv_6;
-    const flt_t yi = delyinv_6;
-    const flt_t zi = delzinv_6;
-    const flt_t fshiftone = shiftone_6;
-
-    const double xprd = prd[0];
-    const double yprd = prd[1];
-    const double zprd = prd[2]*slab_volfactor;
-
-    const flt_t hx_inv = nx_pppm_6/xprd;
-    const flt_t hy_inv = ny_pppm_6/yprd;
-    const flt_t hz_inv = nz_pppm_6/zprd;
-
-    const flt_t fsf_coeff0 = sf_coeff_6[0];
-    const flt_t fsf_coeff1 = sf_coeff_6[1];
-    const flt_t fsf_coeff2 = sf_coeff_6[2];
-    const flt_t fsf_coeff3 = sf_coeff_6[3];
-    const flt_t fsf_coeff4 = sf_coeff_6[4];
-    const flt_t fsf_coeff5 = sf_coeff_6[5];
-  
-    int ifrom, ito, tid;
-    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
-
-    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
-    for (int i = ifrom; i < ito; i++) {
-      int nx = part2grid_6[i][0];
-      int ny = part2grid_6[i][1];
-      int nz = part2grid_6[i][2];
-      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
-      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
-      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
-  
-      int nxsum = nx + nlower_6;
-      int nysum = ny + nlower_6;
-      int nzsum = nz + nlower_6;
-  
-      if (use_table) {
-        dx = dx*half_rho_scale + half_rho_scale_plus;
-        int idx = dx;
-        dy = dy*half_rho_scale + half_rho_scale_plus;
-        int idy = dy;
-        dz = dz*half_rho_scale + half_rho_scale_plus;
-        int idz = dz;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-          rho[0][k] = rho6_lookup[idx][k];
-          rho[1][k] = rho6_lookup[idy][k];
-          rho[2][k] = rho6_lookup[idz][k];
-          drho[0][k] = drho6_lookup[idx][k];
-          drho[1][k] = drho6_lookup[idy][k];
-          drho[2][k] = drho6_lookup[idz][k];
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma simd
-        #endif   
-        for (int k = nlower_6; k <= nupper_6; k++) {
-          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
-          dr1 = dr2 = dr3 = ZEROF;
-  
-          r1 = rho_coeff_6[order_6-1][k];
-          r2 = rho_coeff_6[order_6-1][k];
-          r3 = rho_coeff_6[order_6-1][k];
-          for (int l = order_6-2; l >= 0; l--) {
-            r1 = rho_coeff_6[l][k] + r1 * dx;
-            r2 = rho_coeff_6[l][k] + r2 * dy;
-            r3 = rho_coeff_6[l][k] + r3 * dz;
-            dr1 = drho_coeff_6[l][k] + dr1 * dx;
-            dr2 = drho_coeff_6[l][k] + dr2 * dy;
-            dr3 = drho_coeff_6[l][k] + dr3 * dz;
-          }
-          rho[0][k-nlower_6] = r1;
-          rho[1][k-nlower_6] = r2;
-          rho[2][k-nlower_6] = r3;
-          drho[0][k-nlower_6] = dr1;
-          drho[1][k-nlower_6] = dr2;
-          drho[2][k-nlower_6] = dr3;
-        }
-      }
-      _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
-      _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
-      _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
-
-      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
-	ekx[k]=eky[k]=ekz[k]=ZEROF;
-      }
-
-      for (int k = 0; k < nsplit; k++) { 
-	particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
-        #if defined(LMP_SIMD_COMPILER)
-        #pragma loop_count=7
-        #endif   
-	for (int n = 0; n < order_6; n++) {
-	  int mz = n + nzsum;
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count=7
-          #endif   
-	  for (int m = 0; m < order_6; m++) {
-	    int my = m + nysum;
-	    FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
-	    FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
-	    FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
-            #if defined(LMP_SIMD_COMPILER)
-            #pragma simd
-            #endif   
-	    for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
-	      int mx = l + nxsum;
-	      ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * 
-		u_brick_none[k][mz][my][mx];
-	      eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * eky_p * 
-		u_brick_none[k][mz][my][mx];
-	      ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * ekz_p * 
-		u_brick_none[k][mz][my][mx];
-	    }
-	  }
-	}
-      }
-      
-      _alignvar(FFT_SCALAR ekx_tot[nsplit], 64);
-      _alignvar(FFT_SCALAR eky_tot[nsplit], 64);
-      _alignvar(FFT_SCALAR ekz_tot[nsplit], 64);
-      for (int k = 0; k < nsplit; k++) {
-	ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF;
-      }
-      
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
-	for (int k = 0; k < nsplit; k++) {
-	  ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
-	  eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
-	  ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l];
-	}
-      }
-
-      for (int k = 0; k < nsplit; k++) {
-	ekx_tot[k] *= hx_inv;
-	eky_tot[k] *= hy_inv;
-	ekz_tot[k] *= hz_inv;
-      }
-      // convert D-field to force
-
-      const int type = atom->type[i];
-  
-      const flt_t s1 = x[i][0] * hx_inv;
-      const flt_t s2 = x[i][1] * hy_inv;
-      const flt_t s3 = x[i][2] * hz_inv;
-      flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
-      sf1 += fsf_coeff1 * sin(ffour_pi * s1);
-  
-      flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
-      sf2 += fsf_coeff3 * sin(ffour_pi * s2);
-  
-      flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
-      sf3 += fsf_coeff5 * sin(ffour_pi * s3); 
-      for (int k = 0; k < nsplit; k++) {
-	const flt_t lj = B[nsplit*type + k];
-	const flt_t twoljsq = lj*lj * B[k] * 2;
-	flt_t sf = sf1*twoljsq;
-	f[i][0] += lj * ekx_tot[k] - sf;
-	sf = sf2*twoljsq;
-	f[i][1] += lj * eky_tot[k] - sf;
-	sf = sf3*twoljsq;
-	if (slabflag != 2) f[i][2] += lj * ekz_tot[k] -  sf;
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   precompute rho coefficients as a lookup table to save time in make_rho
-   and fieldforce.  Instead of doing this polynomial for every atom 6 times
-   per time step, precompute it for some number of points.
-------------------------------------------------------------------------- */
-
-void PPPMDispIntel::precompute_rho()
-{
-
-  half_rho_scale = (rho_points - 1.)/2.;
-  half_rho_scale_plus = half_rho_scale + 0.5;
-
-  for (int i = 0; i < rho_points; i++) {
-    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int k=nlower; k<=nupper;k++){
-      FFT_SCALAR r1 = ZEROF;
-      for(int l=order-1; l>=0; l--){
-        r1 = rho_coeff[l][k] + r1*dx;
-      }
-      rho_lookup[i][k-nlower] = r1;
-    }
-    for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-      rho_lookup[i][k] = 0;
-    }
-    if (differentiation_flag == 1) {
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for(int k=nlower; k<=nupper;k++){
-        FFT_SCALAR r1 = ZEROF;
-        for(int l=order-2; l>=0; l--){
-          r1 = drho_coeff[l][k] + r1*dx;
-        }
-        drho_lookup[i][k-nlower] = r1;
-      }
-      for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-        drho_lookup[i][k] = 0;
-      }
-    }
-  }
-  for (int i = 0; i < rho_points; i++) {
-    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
-    #if defined(LMP_SIMD_COMPILER)
-    #pragma simd
-    #endif
-    for (int k=nlower_6; k<=nupper_6;k++){
-      FFT_SCALAR r1 = ZEROF;
-      for(int l=order_6-1; l>=0; l--){
-        r1 = rho_coeff_6[l][k] + r1*dx;
-      }
-      rho6_lookup[i][k-nlower_6] = r1;
-    }
-    for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-      rho6_lookup[i][k] = 0;
-    }
-    if (differentiation_flag == 1) {
-      #if defined(LMP_SIMD_COMPILER)
-      #pragma simd
-      #endif
-      for(int k=nlower_6; k<=nupper_6;k++){
-        FFT_SCALAR r1 = ZEROF;
-        for(int l=order_6-2; l>=0; l--){
-          r1 = drho_coeff_6[l][k] + r1*dx;
-        }
-        drho6_lookup[i][k-nlower_6] = r1;
-      }
-      for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
-        drho6_lookup[i][k] = 0;
-      }
-    }
-  }
-}
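
The lookup works by mapping a fractional offset dx in [-1, 1] to the nearest precomputed row: row i stores the weights for dx_i = -1 + i/half_rho_scale, and the extra +0.5 in half_rho_scale_plus makes the integer cast round instead of truncate. A sketch of the index math:

```cpp
#include <cstdio>

int main() {
  int rho_points = 5000;
  double half_rho_scale = (rho_points - 1.)/2.;
  double half_rho_scale_plus = half_rho_scale + 0.5;  // rounds on int cast
  double dx = -0.37;                                  // query offset
  int idx = (int)(dx*half_rho_scale + half_rho_scale_plus);
  double dx_table = -1. + idx/half_rho_scale;         // offset of that row
  printf("row %d, table dx = %g (error %.2g)\n", idx, dx_table, dx_table - dx);
  return 0;
}
```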
-
-/* ----------------------------------------------------------------------
-   Returns 1 if Intel optimizations for PPPM are ignored due to offload
-------------------------------------------------------------------------- */
-
-#ifdef _LMP_INTEL_OFFLOAD
-int PPPMDispIntel::use_base() {
-  return _use_base;
-}
-#endif
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pppm_disp_intel.h"
+#include "atom.h"
+#include "error.h"
+#include "fft3d_wrap.h"
+#include "gridcomm.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER   7
+#define OFFSET 16384
+#define SMALL 0.00001
+#define LARGE 10000.0
+#define EPS_HOC 1.0e-7
+
+enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
+enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE};
+enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM,
+     FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G,
+     FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A,
+     FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE,
+     FORWARD_AD_PERATOM_NONE};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) :
+  PPPMDisp(lmp, narg, arg)
+{
+  suffix_flag |= Suffix::INTEL;
+
+  order = 7;
+  order_6 = 7;  // set default stencil sizes to 7
+
+  perthread_density = NULL;
+  particle_ekx = particle_eky = particle_ekz = NULL;
+  particle_ekx0 = particle_eky0 = particle_ekz0 = NULL;
+  particle_ekx1 = particle_eky1 = particle_ekz1 = NULL;
+  particle_ekx2 = particle_eky2 = particle_ekz2 = NULL;
+  particle_ekx3 = particle_eky3 = particle_ekz3 = NULL;
+  particle_ekx4 = particle_eky4 = particle_ekz4 = NULL;
+  particle_ekx5 = particle_eky5 = particle_ekz5 = NULL;
+  particle_ekx6 = particle_eky6 = particle_ekz6 = NULL;
+
+  rho_lookup = drho_lookup = NULL;
+  rho6_lookup = drho6_lookup = NULL;
+  rho_points = 0;
+
+  _use_table = _use_packing = _use_lrt = 0;
+}
+
+PPPMDispIntel::~PPPMDispIntel()
+{
+  memory->destroy(perthread_density);
+  memory->destroy(particle_ekx);
+  memory->destroy(particle_eky);
+  memory->destroy(particle_ekz);
+
+  memory->destroy(rho_lookup);
+  memory->destroy(drho_lookup);
+  memory->destroy(rho6_lookup);
+  memory->destroy(drho6_lookup);
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void PPPMDispIntel::init()
+{
+
+  PPPMDisp::init();
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->kspace_init_check();
+
+  _use_lrt = fix->lrt();
+  if (_use_lrt)
+    error->all(FLERR,
+               "LRT mode is currently not supported for pppm/disp/intel");
+
+
+  // for vectorization, the grid arrays need some padding at the end;
+  // the first thread accumulates directly onto the global density array,
+  // so only nthreads-1 per-thread copies are allocated
+  if ((comm->nthreads > 1) && !_use_lrt) {
+    memory->destroy(perthread_density);
+    memory->create(perthread_density, comm->nthreads-1,
+                   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
+                   "pppmdispintel:perthread_density");
+  }
+
+  _use_table = fix->pppm_table();
+  if (_use_table) {
+    rho_points = 5000;
+    memory->destroy(rho_lookup);
+    memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+                   "pppmdispintel:rho_lookup");
+    memory->destroy(rho6_lookup);
+    memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+                   "pppmdispintel:rho6_lookup");
+
+    if (differentiation_flag == 1) {
+      memory->destroy(drho_lookup);
+      memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+                     "pppmdispintel:drho_lookup");
+      memory->destroy(drho6_lookup);
+      memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+                     "pppmdispintel:drho6_lookup");
+    }
+    precompute_rho();
+  }
+  if (order > INTEL_P3M_MAXORDER)
+    error->all(FLERR,"PPPM order greater than supported by USER-INTEL");
+}
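
The INTEL_P3M_ALIGNED_MAXORDER padding requested here exists so the stencil loops can always run a fixed, vectorizable trip count and let zero-padded tail entries contribute nothing. A sketch of that pattern (8 is an assumed value for the aligned maximum order, for illustration only):

```cpp
#include <cstdio>

int main() {
  const int ALIGNED_MAXORDER = 8;        // assumed aligned trip count
  int order = 5;                         // actual stencil size
  double w[ALIGNED_MAXORDER] = {0};      // weights with zero-padded tail
  for (int k = 0; k < order; k++) w[k] = 1.0/order;
  double sum = 0.0;
  for (int k = 0; k < ALIGNED_MAXORDER; k++)  // fixed trip, no remainder loop
    sum += w[k];                              // padding adds exact zeros
  printf("sum = %g\n", sum);
  return 0;
}
```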
+
+/* ----------------------------------------------------------------------
+   compute the PPPMDispIntel long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDispIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    PPPMDisp::compute(eflag, vflag);
+    return;
+  }
+  #endif
+  int i;
+  // convert atoms from box to lamda coords
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    if (function[0]) {
+      cg_peratom->ghost_notify();
+      cg_peratom->setup();
+    }
+    if (function[1] + function[2] + function[3]) {
+      cg_peratom_6->ghost_notify();
+      cg_peratom_6->setup();
+    }
+    peratom_allocate_flag = 1;
+  }
+  if (triclinic == 0) boxlo = domain->boxlo;
+  else {
+    boxlo = domain->boxlo_lamda;
+    domain->x2lamda(atom->nlocal);
+  }
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+
+    if (function[0]) memory->destroy(part2grid);
+    if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6);
+    if (differentiation_flag == 1) {
+      memory->destroy(particle_ekx);
+      memory->destroy(particle_eky);
+      memory->destroy(particle_ekz);
+      if (function[2] == 1){
+        memory->destroy(particle_ekx0);
+        memory->destroy(particle_eky0);
+        memory->destroy(particle_ekz0);
+        memory->destroy(particle_ekx1);
+        memory->destroy(particle_eky1);
+        memory->destroy(particle_ekz1);
+        memory->destroy(particle_ekx2);
+        memory->destroy(particle_eky2);
+        memory->destroy(particle_ekz2);
+        memory->destroy(particle_ekx3);
+        memory->destroy(particle_eky3);
+        memory->destroy(particle_ekz3);
+        memory->destroy(particle_ekx4);
+        memory->destroy(particle_eky4);
+        memory->destroy(particle_ekz4);
+        memory->destroy(particle_ekx5);
+        memory->destroy(particle_eky5);
+        memory->destroy(particle_ekz5);
+        memory->destroy(particle_ekx6);
+        memory->destroy(particle_eky6);
+        memory->destroy(particle_ekz6);
+      }
+
+    }
+    nmax = atom->nmax;
+    if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid");
+    if (function[1] + function[2] + function[3])
+      memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6");
+    if (differentiation_flag == 1) {
+      memory->create(particle_ekx, nmax, "pppmdispintel:pekx");
+      memory->create(particle_eky, nmax, "pppmdispintel:peky");
+      memory->create(particle_ekz, nmax, "pppmdispintel:pekz");
+      if (function[2] == 1) {
+        memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0");
+        memory->create(particle_eky0, nmax, "pppmdispintel:peky0");
+        memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0");
+        memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1");
+        memory->create(particle_eky1, nmax, "pppmdispintel:peky1");
+        memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1");
+        memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2");
+        memory->create(particle_eky2, nmax, "pppmdispintel:peky2");
+        memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2");
+        memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3");
+        memory->create(particle_eky3, nmax, "pppmdispintel:peky3");
+        memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3");
+        memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4");
+        memory->create(particle_eky4, nmax, "pppmdispintel:peky4");
+        memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4");
+        memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5");
+        memory->create(particle_eky5, nmax, "pppmdispintel:peky5");
+        memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5");
+        memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6");
+        memory->create(particle_eky6, nmax, "pppmdispintel:peky6");
+        memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6");
+      }
+    }
+  }
+  energy = 0.0;
+  energy_1 = 0.0;
+  energy_6 = 0.0;
+  if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0;
+
+  // find grid points for all my particles
+  // distribute particles' charges/dispersion coefficients on the grid
+  // communication between processors and remapping to FFT decomposition
+  // solution of Poisson's equation in k-space and back-transformation
+  // communication between processors
+  // calculation of forces
+
+  if (function[0]) {
+
+    // perform calculations for Coulomb interactions only
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid,
+                                 nupper, nlower, nxlo_out, nylo_out, nzlo_out,
+                                 nxhi_out, nyhi_out, nzhi_out,
+                                 fix->get_mixed_buffers());
+      make_rho_c<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid,
+                                  nupper, nlower, nxlo_out, nylo_out,
+                                  nzlo_out, nxhi_out, nyhi_out, nzhi_out,
+                                  fix->get_double_buffers());
+      make_rho_c<double,double>(fix->get_double_buffers());
+    } else {
+      particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid,
+                                nupper, nlower, nxlo_out, nylo_out, nzlo_out,
+                                nxhi_out, nyhi_out, nzhi_out,
+                                fix->get_single_buffers());
+      make_rho_c<float,float>(fix->get_single_buffers());
+    }
+
+    cg->reverse_comm(this,REVERSE_RHO);
+
+    brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+              density_brick, density_fft, work1,remap);
+
+    if (differentiation_flag == 1) {
+      poisson_ad(work1, work2, density_fft, fft1, fft2,
+                 nx_pppm, ny_pppm, nz_pppm, nfft,
+                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
+                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+                 energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick,
+                 v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);
+
+      cg->forward_comm(this,FORWARD_AD);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_c_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_c_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);
+
+    } else {
+      poisson_ik(work1, work2, density_fft, fft1, fft2,
+                 nx_pppm, ny_pppm, nz_pppm, nfft,
+                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
+                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+                 energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2,
+                 vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2,
+                 u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick,
+                 v5_brick);
+
+      cg->forward_comm(this, FORWARD_IK);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_c_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_c_ik<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
+    }
+    if (evflag_atom) fieldforce_c_peratom();
+  }
+
+  if (function[1]) {
+    // perform calculations for geometric mixing
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                 part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                 nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                 nyhi_out_6, nzhi_out_6,
+                                 fix->get_mixed_buffers());
+      make_rho_g<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                  nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                  nyhi_out_6, nzhi_out_6,
+                                  fix->get_double_buffers());
+      make_rho_g<double,double>(fix->get_double_buffers());
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                nyhi_out_6, nzhi_out_6,
+                                fix->get_single_buffers());
+      make_rho_g<float,float>(fix->get_single_buffers());
+    }
+
+    cg_6->reverse_comm(this, REVERSE_RHO_G);
+
+    brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
+              density_brick_g, density_fft_g, work1_6,remap_6);
+
+    if (differentiation_flag == 1) {
+
+      poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
+                 nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6,
+                 nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6,
+                 nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6,
+                 virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g,
+                 v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
+
+      cg_6->forward_comm(this,FORWARD_AD_G);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_g_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_g_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G);
+
+    } else {
+      poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
+                 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
+                 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,
+                 fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g,
+                 vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g,
+                 v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
+
+      cg_6->forward_comm(this,FORWARD_IK_G);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_g_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_g_ik<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G);
+    }
+    if (evflag_atom) fieldforce_g_peratom();
+  }
+
+  if (function[2]) {
+    // perform calculations for arithmetic mixing
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                 part2grid_6, nupper_6, nlower_6,
+                                 nxlo_out_6, nylo_out_6, nzlo_out_6,
+                                 nxhi_out_6, nyhi_out_6, nzhi_out_6,
+                                 fix->get_mixed_buffers());
+      make_rho_a<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                  nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                  nyhi_out_6, nzhi_out_6,
+                                  fix->get_double_buffers());
+      make_rho_a<double,double>(fix->get_double_buffers());
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                nyhi_out_6, nzhi_out_6,
+                                fix->get_single_buffers());
+      make_rho_a<float,float>(fix->get_single_buffers());
+    }
+
+    cg_6->reverse_comm(this, REVERSE_RHO_A);
+
+    brick2fft_a();
+
+    if (differentiation_flag == 1) {
+
+      poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
+                 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
+                 nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6,
+                 u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3,
+                 v3_brick_a3, v4_brick_a3, v5_brick_a3);
+      poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0,
+                    v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0,
+                    v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6,
+                    v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6);
+      poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1,
+                    v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1,
+                    v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5,
+                    v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5);
+      poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2,
+                    v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2,
+                    v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4,
+                    v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);
+
+      cg_6->forward_comm(this, FORWARD_AD_A);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_a_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_a_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A);
+
+    } else {
+
+      poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6,
+                 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6,
+                 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6,
+                 fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3,
+                 virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3,
+                 v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3);
+      poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0,
+                    vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6,
+                    vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0,
+                    v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0,
+                    u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6,
+                    v3_brick_a6, v4_brick_a6, v5_brick_a6);
+      poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1,
+                    vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5,
+                    vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1,
+                    v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1,
+                    u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5,
+                    v3_brick_a5, v4_brick_a5, v5_brick_a5);
+      poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2,
+                    vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4,
+                    vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2,
+                    v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2,
+                    u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4,
+                    v3_brick_a4, v4_brick_a4, v5_brick_a4);
+
+      cg_6->forward_comm(this, FORWARD_IK_A);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_a_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_a_ik<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A);
+    }
+    if (evflag_atom) fieldforce_a_peratom();
+  }
+
+  if (function[3]) {
+    // perform calculations if no mixing rule applies
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                 part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                 nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                 nyhi_out_6, nzhi_out_6,
+                                 fix->get_mixed_buffers());
+      make_rho_none<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                  nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                  nyhi_out_6, nzhi_out_6,
+                                  fix->get_double_buffers());
+      make_rho_none<double,double>(fix->get_double_buffers());
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6,
+                                part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+                                nylo_out_6, nzlo_out_6, nxhi_out_6,
+                                nyhi_out_6, nzhi_out_6,
+                                fix->get_single_buffers());
+      make_rho_none<float,float>(fix->get_single_buffers());
+    }
+
+    cg_6->reverse_comm(this, REVERSE_RHO_NONE);
+
+    brick2fft_none();
+
+    if (differentiation_flag == 1) {
+
+      int n = 0;
+      for (int k = 0; k<nsplit_alloc/2; k++) {
+        poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1],
+                        u_brick_none[n],u_brick_none[n+1],
+                        v0_brick_none, v1_brick_none, v2_brick_none,
+                        v3_brick_none, v4_brick_none, v5_brick_none);
+        n += 2;
+      }
+
+      cg_6->forward_comm(this,FORWARD_AD_NONE);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_none_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_none_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE);
+
+    } else {
+      int n = 0;
+      for (int k = 0; k<nsplit_alloc/2; k++) {
+
+        poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1],
+                        vdx_brick_none[n], vdy_brick_none[n],
+                        vdz_brick_none[n], vdx_brick_none[n+1],
+                        vdy_brick_none[n+1], vdz_brick_none[n+1],
+                        u_brick_none, v0_brick_none, v1_brick_none,
+                        v2_brick_none, v3_brick_none, v4_brick_none,
+                        v5_brick_none);
+        n += 2;
+      }
+
+      cg_6->forward_comm(this,FORWARD_IK_NONE);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_none_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_none_ik<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom)
+        cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE);
+    }
+    if (evflag_atom) fieldforce_none_peratom();
+  }
+
+  // update qsum and qsqsum, if atom count has changed and energy is needed
+
+  if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) {
+    qsum_qsq();
+    natoms_original = atom->natoms;
+  }
+
+  // sum energy across procs and add in volume-dependent term
+
+  const double qscale = force->qqrd2e * scale;
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy_1 = energy_all;
+    MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy_6 = energy_all;
+
+    energy_1 *= 0.5*volume;
+    energy_6 *= 0.5*volume;
+
+    energy_1 -= g_ewald*qsqsum/MY_PIS +
+      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
+    energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij +
+      1.0/12.0*pow(g_ewald_6,6)*csum;
+    energy_1 *= qscale;
+  }
+
+  // sum virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+    MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
+    if (function[1]+function[2]+function[3]) {
+      double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
+      virial[0] -= a;
+      virial[1] -= a;
+      virial[2] -= a;
+    }
+  }
+
+  if (eflag_atom) {
+    if (function[0]) {
+      double *q = atom->q;
+      for (i = 0; i < atom->nlocal; i++) {
+        eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]*
+          qsum / (g_ewald*g_ewald*volume); // Coulomb self-energy correction
+      }
+    }
+    if (function[1] + function[2] + function[3]) {
+      int tmp;
+      for (i = 0; i < atom->nlocal; i++) {
+        tmp = atom->type[i];
+        eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] +
+                      1.0/12.0*pow(g_ewald_6,6)*cii[tmp];
+      }
+    }
+  }
+
+  if (vflag_atom) {
+    if (function[1] + function[2] + function[3]) {
+      int tmp;
+      for (i = 0; i < atom->nlocal; i++) {
+        tmp = atom->type[i];
+        // dispersion self-virial correction
+        for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*
+                                      pow(g_ewald_6,3)*csumi[tmp];
+      }
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag) slabcorr(eflag);
+  if (function[0]) energy += energy_1;
+  if (function[1] + function[2] + function[3]) energy += energy_6;
+
+  // convert atoms back from lamda to box coords
+
+  if (triclinic) domain->lamda2x(atom->nlocal);
+}
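+
+// Every grid kernel above is dispatched three ways on fix->precision().
+// A minimal illustrative sketch of that pattern (not compiled; do_op is a
+// hypothetical stand-in for any of the templated kernels, with flt_t the
+// storage type and acc_t the accumulation type):
+#if 0
+template <class flt_t, class acc_t>
+void do_op(IntelBuffers<flt_t,acc_t> *buffers);
+
+void dispatch_example(FixIntel *fix)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    do_op<float,double>(fix->get_mixed_buffers());    // float data, double sums
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    do_op<double,double>(fix->get_double_buffers());  // full double
+  else
+    do_op<float,float>(fix->get_single_buffers());    // full single
+}
+#endif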
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+void PPPMDispIntel::particle_map(double delx, double dely, double delz,
+                                 double sft, int** p2g, int nup, int nlow,
+                                 int nxlo, int nylo, int nzlo,
+                                 int nxhi, int nyhi, int nzhi,
+                                 IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
+    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
+
+  int flag = 0;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\
+           nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt)
+  #endif
+  {
+    double **x = atom->x;
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delx;
+    const flt_t yi = dely;
+    const flt_t zi = delz;
+    const flt_t fshift = sft;
+
+    int iifrom, iito, tid;
+    IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma simd reduction(+:flag)
+    #endif
+    for (int i = iifrom; i < iito; i++) {
+
+      // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+      // current particle coord can be outside global and local box
+      // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+      int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET;
+      int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET;
+      int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET;
+
+      p2g[i][0] = nx;
+      p2g[i][1] = ny;
+      p2g[i][2] = nz;
+
+      // check that entire stencil around nx,ny,nz will fit in my 3d brick
+
+      if (nx+nlow < nxlo || nx+nup > nxhi ||
+          ny+nlow < nylo || ny+nup > nyhi ||
+          nz+nlow < nzlo || nz+nup > nzhi)
+        flag = 1;
+    }
+  }
+
+  if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp");
+}
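+
+// The OFFSET add/subtract above emulates floor() for the int cast, which
+// truncates toward zero and would map -0.75 to 0 instead of -1 for atoms
+// left of the box.  A minimal illustrative sketch (not compiled; assumes
+// offset is a positive constant larger than any |coordinate| in grid
+// units):
+#if 0
+static inline int grid_floor(double s, int offset)
+{
+  // s + offset is always positive, so truncation equals floor();
+  // subtracting offset afterwards restores the true grid index
+  return static_cast<int>(s + offset) - offset;
+}
+// grid_floor(-0.75, 16384) == -1, while static_cast<int>(-0.75) == 0
+#endif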
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // clear 3d density array
+
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick[nzlo_out][nylo_out][nxlo_out]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    double *q = atom->q;
+    double **x = atom->x;
+
+    const int nix = nxhi_out - nxlo_out + 1;
+    const int niy = nyhi_out - nylo_out + 1;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshift = shift;
+    const flt_t fshiftone = shiftone;
+    const flt_t fdelvolinv = delvolinv;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
+      perthread_density[tid - 1];
+    // clear 3d density array
+    memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nysum = nlower + ny - nylo_out;
+      int nxsum = nlower + nx - nxlo_out;
+      int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+
+          for (int l = order-1; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+        }
+      }
+
+      FFT_SCALAR z0 = fdelvolinv * q[i];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            my_density[mzyx] += x0*rho[0][l];
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for (int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+}
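+
+// Rather than using atomic updates on the shared density brick, each
+// thread above scatters into a private grid copy and the copies are summed
+// in a second pass.  A minimal standalone sketch of that reduction (not
+// compiled; names are illustrative, not the members used above):
+#if 0
+void reduce_private_grids(double *global, double **priv,
+                          int npoints, int nthr)
+{
+  // grid points are independent, so this vectorizes without locks;
+  // thread 0 wrote into the global array directly, so only nthr-1
+  // private copies need to be folded in
+  for (int i = 0; i < npoints; i++)
+    for (int t = 1; t < nthr; t++)
+      global[i] += priv[t-1][i];
+}
+#endif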
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- geometric mixing
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // clear 3d density array
+
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    int type;
+    double **x = atom->x;
+
+    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+    const int niy = nyhi_out_6 - nylo_out_6 + 1;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshift = shift_6;
+    const flt_t fshiftone = shiftone_6;
+    const flt_t fdelvolinv = delvolinv_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
+      perthread_density[tid - 1];
+
+    // clear 3d density array
+    memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nysum = nlower_6 + ny - nylo_out_6;
+      int nxsum = nlower_6 + nx - nxlo_out_6;
+      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+
+          for (int l = order_6-1; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+        }
+      }
+
+      type = atom->type[i];
+      FFT_SCALAR z0 = fdelvolinv * B[type];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            my_density[mzyx] += x0*rho[0][l];
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for (int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- arithmetic mixing
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // clear 3d density array
+
+  memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+         ngrid_6*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  int nlocal = atom->nlocal;
+
+  double **x = atom->x;
+
+  const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+  const int niy = nyhi_out_6 - nylo_out_6 + 1;
+
+  const flt_t lo0 = boxlo[0];
+  const flt_t lo1 = boxlo[1];
+  const flt_t lo2 = boxlo[2];
+  const flt_t xi = delxinv_6;
+  const flt_t yi = delyinv_6;
+  const flt_t zi = delzinv_6;
+  const flt_t fshift = shift_6;
+  const flt_t fshiftone = shiftone_6;
+  const flt_t fdelvolinv = delvolinv_6;
+
+  for (int i = 0; i < nlocal; i++) {
+
+    int nx = part2grid_6[i][0];
+    int ny = part2grid_6[i][1];
+    int nz = part2grid_6[i][2];
+
+    int nxsum = nx + nlower_6;
+    int nysum = ny + nlower_6;
+    int nzsum = nz + nlower_6;
+
+    FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+    FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+    FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    if (use_table) {
+      dx = dx*half_rho_scale + half_rho_scale_plus;
+      int idx = dx;
+      dy = dy*half_rho_scale + half_rho_scale_plus;
+      int idy = dy;
+      dz = dz*half_rho_scale + half_rho_scale_plus;
+      int idz = dz;
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        rho[0][k] = rho6_lookup[idx][k];
+        rho[1][k] = rho6_lookup[idy][k];
+        rho[2][k] = rho6_lookup[idz][k];
+      }
+    } else {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower_6; k <= nupper_6; k++) {
+        FFT_SCALAR r1,r2,r3;
+        r1 = r2 = r3 = ZEROF;
+
+        for (int l = order_6-1; l >= 0; l--) {
+          r1 = rho_coeff_6[l][k] + r1*dx;
+          r2 = rho_coeff_6[l][k] + r2*dy;
+          r3 = rho_coeff_6[l][k] + r3*dz;
+        }
+        rho[0][k-nlower_6] = r1;
+        rho[1][k-nlower_6] = r2;
+        rho[2][k-nlower_6] = r3;
+      }
+    }
+
+    const int type = atom->type[i];
+    FFT_SCALAR z0 = fdelvolinv;
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma loop_count=7
+    #endif
+    for (int n = 0; n < order_6; n++) {
+      int mz = n + nzsum;
+      FFT_SCALAR y0 = z0*rho[2][n];
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int m = 0; m < order_6; m++) {
+        int my = m + nysum;
+        FFT_SCALAR x0 = y0*rho[1][m];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          int mx = l + nxsum;
+          FFT_SCALAR w = x0*rho[0][l];
+          density_brick_a0[mz][my][mx] += w*B[7*type];
+          density_brick_a1[mz][my][mx] += w*B[7*type+1];
+          density_brick_a2[mz][my][mx] += w*B[7*type+2];
+          density_brick_a3[mz][my][mx] += w*B[7*type+3];
+          density_brick_a4[mz][my][mx] += w*B[7*type+4];
+          density_brick_a5[mz][my][mx] += w*B[7*type+5];
+          density_brick_a6[mz][my][mx] += w*B[7*type+6];
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- case when mixing rules don't apply
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    int type;
+    double **x = atom->x;
+
+    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+    const int niy = nyhi_out_6 - nylo_out_6 + 1;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshift = shift_6;
+    const flt_t fshiftone = shiftone_6;
+    const flt_t fdelvolinv = delvolinv_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density :
+      perthread_density[tid - 1];
+    // clear 3d density array
+    memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nysum = nlower_6 + ny - nylo_out_6;
+      int nxsum = nlower_6 + nx - nxlo_out_6;
+      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+
+          for (int l = order_6-1; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+        }
+      }
+
+      type = atom->type[i];
+      FFT_SCALAR z0 = fdelvolinv;
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            FFT_SCALAR w0 = x0*rho[0][l];
+            for (int k = 0; k < nsplit; k++)
+              my_density[mzyx + k*ngrid_6] += w0;
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for (int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+}
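+
+// With no mixing rule each of the nsplit dispersion terms carries its own
+// density grid; the grids sit back to back in memory, so one stencil
+// weight is scattered to every split with a stride of ngrid_6.  A minimal
+// illustrative sketch of that addressing (not compiled):
+#if 0
+static inline void scatter_splits(FFT_SCALAR *density, int point,
+                                  FFT_SCALAR w, int nsplit, int ngrid)
+{
+  for (int k = 0; k < nsplit; k++)
+    density[point + k*ngrid] += w;   // grid k starts at offset k*ngrid
+}
+#endif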
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+   for ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *q = atom->q;
+    double **x = atom->x;
+    double **f = atom->f;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nxsum = nx + nlower;
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho_lookup[idx][k];
+          rho1[k] = rho_lookup[idy][k];
+          rho2[k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1 = rho_coeff[order-1][k];
+          FFT_SCALAR r2 = rho_coeff[order-1][k];
+          FFT_SCALAR r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower] = r1;
+          rho1[k-nlower] = r2;
+          rho2[k-nlower] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
+            eky_arr[l] -= x0*vdy_brick[mz][my][mx];
+            ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
+          }
+        }
+      }
+
+      FFT_SCALAR ekx, eky, ekz;
+      ekx = eky = ekz = ZEROF;
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx += ekx_arr[l];
+        eky += eky_arr[l];
+        ekz += ekz_arr[l];
+      }
+
+      // convert E-field to force
+
+      const flt_t qfactor = fqqrd2es * q[i];
+      f[i][0] += qfactor*ekx;
+      f[i][1] += qfactor*eky;
+      if (slabflag != 2) f[i][2] += qfactor*ekz;
+    }
+  }
+}
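+
+// In the non-table branch above, the stencil weights are the B-spline
+// polynomials in rho_coeff evaluated by Horner's rule.  A minimal
+// illustrative sketch for one direction (not compiled; coefficient layout
+// and order as in the loops above):
+#if 0
+static inline FFT_SCALAR spline_weight(FFT_SCALAR **coeff, int k,
+                                       int order, FFT_SCALAR d)
+{
+  FFT_SCALAR r = coeff[order-1][k];
+  for (int l = order-2; l >= 0; l--)  // r = (..(c[n]*d + c[n-1])*d ..) + c[0]
+    r = coeff[l][k] + r*d;
+  return r;
+}
+#endif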
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+   for ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
+  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
+  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+
+    double *q = atom->q;
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm/xprd;
+    const flt_t hy_inv = ny_pppm/yprd;
+    const flt_t hz_inv = nz_pppm/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff[0];
+    const flt_t fsf_coeff1 = sf_coeff[1];
+    const flt_t fsf_coeff2 = sf_coeff[2];
+    const flt_t fsf_coeff3 = sf_coeff[3];
+    const flt_t fsf_coeff4 = sf_coeff[4];
+    const flt_t fsf_coeff5 = sf_coeff[5];
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      int nxsum = nx + nlower;
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+          drho[0][k] = drho_lookup[idx][k];
+          drho[1][k] = drho_lookup[idy][k];
+          drho[2][k] = drho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+
+          r1 = rho_coeff[order-1][k];
+          r2 = rho_coeff[order-1][k];
+          r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1 * dx;
+            r2 = rho_coeff[l][k] + r2 * dy;
+            r3 = rho_coeff[l][k] + r3 * dz;
+            dr1 = drho_coeff[l][k] + dr1 * dx;
+            dr2 = drho_coeff[l][k] + dr2 * dy;
+            dr3 = drho_coeff[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+          drho[0][k-nlower] = dr1;
+          drho[1][k-nlower] = dr2;
+          drho[2][k-nlower] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
+            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
+            ekz[l] +=  rho[0][l] * ekz_p * u_brick[mz][my][mx];
+          }
+        }
+      }
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+
+      // convert E-field to force
+
+      const flt_t qfactor = fqqrd2es * q[i];
+      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
+
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoqsq;
+      f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;
+
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoqsq;
+      f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;
+
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoqsq;
+
+      if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
+    }
+  }
+}
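+
+// The sin() terms above remove the discretization self-force of the ad
+// scheme; sf_coeff holds precomputed fit coefficients for the first two
+// harmonics per axis.  A minimal illustrative sketch of the per-axis
+// correction (not compiled; c2pi/c4pi stand for the sf_coeff pair of one
+// axis and s is the particle position in grid units):
+#if 0
+static inline double self_force_correction(double s, double c2pi,
+                                           double c4pi, double twoqsq)
+{
+  double sf = c2pi * sin(2.0*MY_PI*s) + c4pi * sin(4.0*MY_PI*s);
+  return sf * twoqsq;   // subtracted from the force after qqrd2e scaling
+}
+#endif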
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for geometric mixing rule
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my particles, interpolate dispersion field from nearby grid pts
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double lj;
+    int type;
+    double **x = atom->x;
+    double **f = atom->f;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
+            eky_arr[l] -= x0*vdy_brick_g[mz][my][mx];
+            ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx];
+          }
+        }
+      }
+
+      FFT_SCALAR ekx, eky, ekz;
+      ekx = eky = ekz = ZEROF;
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx += ekx_arr[l];
+        eky += eky_arr[l];
+        ekz += ekz_arr[l];
+      }
+
+      // convert E-field to force
+
+      type = atom->type[i];
+      lj = B[type];
+      f[i][0] += lj*ekx;
+      f[i][1] += lj*eky;
+      if (slabflag != 2) f[i][2] += lj*ekz;
+    }
+  }
+}
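+
+/* ----------------------------------------------------------------------
+   note: the gather above is separable: the 3-d stencil weight is the
+   product of three 1-d spline weights.  a minimal standalone sketch of
+   the same reduction (hypothetical helper, plain double in place of
+   FFT_SCALAR, no padding):
+
+     double gather_x(int order, const double *rho0, const double *rho1,
+                     const double *rho2, double ***vd,
+                     int nx, int ny, int nz) {
+       double ek = 0.0;
+       for (int n = 0; n < order; n++)
+         for (int m = 0; m < order; m++)
+           for (int l = 0; l < order; l++)
+             ek -= rho2[n]*rho1[m]*rho0[l] * vd[nz+n][ny+m][nx+l];
+       return ek;
+     }
+
+   the production loop instead pads the inner dimension to
+   INTEL_P3M_ALIGNED_MAXORDER (padded rho0 entries are zero) and keeps
+   per-lane partials in ekx_arr/eky_arr/ekz_arr so the #pragma simd loop
+   vectorizes cleanly.
+------------------------------------------------------------------------- */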
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for geometric mixing rule and ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my particles, interpolate dispersion field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
+  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
+  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
+            eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
+            ekz[l] +=  rho[0][l] * ekz_p * u_brick_g[mz][my][mx];
+          }
+        }
+      }
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+
+      // convert E-field to force
+
+      const int type = atom->type[i];
+      const flt_t lj = B[type];
+      const flt_t twoljsq = 2.*lj*lj;
+
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoljsq;
+      f[i][0] += lj * particle_ekx[i] - sf;
+
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoljsq;
+      f[i][1] += lj * particle_eky[i] - sf;
+
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoljsq;
+
+      if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf;
+    }
+  }
+}
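+
+/* ----------------------------------------------------------------------
+   note: in the ad variant above only the potential brick u_brick_g is
+   stored, and the field follows from differentiating the interpolation
+   weights.  schematically, for the x component (sign convention
+   absorbed in drho_coeff_6):
+
+     E_x = hx_inv * sum_{n,m,l} drho_x[l] * rho_y[m] * rho_z[n]
+                                * u_brick_g[mz][my][mx]
+
+   which is why drho[] enters exactly one factor per component and why
+   the reduction is rescaled by hx_inv/hy_inv/hz_inv in the second loop.
+------------------------------------------------------------------------- */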
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for arithmetic mixing rule and ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my particles, interpolate dispersion field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+    double **x = atom->x;
+    double **f = atom->f;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
+            eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx];
+            ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx];
+            ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx];
+            eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx];
+            ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx];
+            ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx];
+            eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx];
+            ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx];
+            ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx];
+            eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx];
+            ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx];
+            ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx];
+            eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx];
+            ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx];
+            ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx];
+            eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx];
+            ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx];
+            ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx];
+            eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx];
+            ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx];
+          }
+        }
+      }
+
+      FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
+      FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
+      FFT_SCALAR ekx6, eky6, ekz6;
+      ekx0 = eky0 = ekz0 = ZEROF;
+      ekx1 = eky1 = ekz1 = ZEROF;
+      ekx2 = eky2 = ekz2 = ZEROF;
+      ekx3 = eky3 = ekz3 = ZEROF;
+      ekx4 = eky4 = ekz4 = ZEROF;
+      ekx5 = eky5 = ekz5 = ZEROF;
+      ekx6 = eky6 = ekz6 = ZEROF;
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx0 += ekx0_arr[l];
+        eky0 += eky0_arr[l];
+        ekz0 += ekz0_arr[l];
+        ekx1 += ekx1_arr[l];
+        eky1 += eky1_arr[l];
+        ekz1 += ekz1_arr[l];
+        ekx2 += ekx2_arr[l];
+        eky2 += eky2_arr[l];
+        ekz2 += ekz2_arr[l];
+        ekx3 += ekx3_arr[l];
+        eky3 += eky3_arr[l];
+        ekz3 += ekz3_arr[l];
+        ekx4 += ekx4_arr[l];
+        eky4 += eky4_arr[l];
+        ekz4 += ekz4_arr[l];
+        ekx5 += ekx5_arr[l];
+        eky5 += eky5_arr[l];
+        ekz5 += ekz5_arr[l];
+        ekx6 += ekx6_arr[l];
+        eky6 += eky6_arr[l];
+        ekz6 += ekz6_arr[l];
+      }
+
+      // convert D-field to force
+
+      const int type = atom->type[i];
+      const FFT_SCALAR lj0 = B[7*type+6];
+      const FFT_SCALAR lj1 = B[7*type+5];
+      const FFT_SCALAR lj2 = B[7*type+4];
+      const FFT_SCALAR lj3 = B[7*type+3];
+      const FFT_SCALAR lj4 = B[7*type+2];
+      const FFT_SCALAR lj5 = B[7*type+1];
+      const FFT_SCALAR lj6 = B[7*type];
+
+      f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 +
+        lj4*ekx4 + lj5*ekx5 + lj6*ekx6;
+      f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 +
+        lj4*eky4 + lj5*eky5 + lj6*eky6;
+      if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 +
+                           lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6;
+    }
+  }
+}
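+
+/* ----------------------------------------------------------------------
+   note: for the arithmetic mixing rule the mixed dispersion coefficient
+   is split into the seven binomial terms of (sigma_i + sigma_j)^6, so
+   seven brick sets a0..a6 are gathered per particle and recombined with
+   the per-type coefficients read in reverse order.  a sketch of the
+   recombination done explicitly above, assuming the per-brick sums were
+   kept in an array ekx[7] (s = brick index):
+
+     double fx = 0.0;
+     for (int s = 0; s < 7; s++)
+       fx += B[7*type + (6-s)] * ekx[s];  // lj0..lj6 pair with a0..a6
+
+   f[i][1] and f[i][2] follow the same pattern.
+------------------------------------------------------------------------- */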
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for arithmetic mixing rule and ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my particles, interpolate dispersion field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0;
+  FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0;
+  FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0;
+  FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1;
+  FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1;
+  FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1;
+  FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2;
+  FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2;
+  FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2;
+  FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3;
+  FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3;
+  FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3;
+  FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4;
+  FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4;
+  FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4;
+  FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5;
+  FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5;
+  FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5;
+  FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6;
+  FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6;
+  FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF;
+      particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF;
+      particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF;
+      particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF;
+      particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF;
+      particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF;
+      particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order_6; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order_6; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            FFT_SCALAR x0 = drho[0][l] * ekx_p;
+            FFT_SCALAR y0 = rho[0][l] * eky_p;
+            FFT_SCALAR z0 = rho[0][l] * ekz_p;
+
+            ekx0[l] +=  x0 * u_brick_a0[mz][my][mx];
+            eky0[l] +=  y0 * u_brick_a0[mz][my][mx];
+            ekz0[l] +=  z0 * u_brick_a0[mz][my][mx];
+            ekx1[l] +=  x0 * u_brick_a1[mz][my][mx];
+            eky1[l] +=  y0 * u_brick_a1[mz][my][mx];
+            ekz1[l] +=  z0 * u_brick_a1[mz][my][mx];
+            ekx2[l] +=  x0 * u_brick_a2[mz][my][mx];
+            eky2[l] +=  y0 * u_brick_a2[mz][my][mx];
+            ekz2[l] +=  z0 * u_brick_a2[mz][my][mx];
+            ekx3[l] +=  x0 * u_brick_a3[mz][my][mx];
+            eky3[l] +=  y0 * u_brick_a3[mz][my][mx];
+            ekz3[l] +=  z0 * u_brick_a3[mz][my][mx];
+            ekx4[l] +=  x0 * u_brick_a4[mz][my][mx];
+            eky4[l] +=  y0 * u_brick_a4[mz][my][mx];
+            ekz4[l] +=  z0 * u_brick_a4[mz][my][mx];
+            ekx5[l] +=  x0 * u_brick_a5[mz][my][mx];
+            eky5[l] +=  y0 * u_brick_a5[mz][my][mx];
+            ekz5[l] +=  z0 * u_brick_a5[mz][my][mx];
+            ekx6[l] +=  x0 * u_brick_a6[mz][my][mx];
+            eky6[l] +=  y0 * u_brick_a6[mz][my][mx];
+            ekz6[l] +=  z0 * u_brick_a6[mz][my][mx];
+          }
+        }
+      }
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx0[i] += ekx0[l];
+        particle_eky0[i] += eky0[l];
+        particle_ekz0[i] += ekz0[l];
+        particle_ekx1[i] += ekx1[l];
+        particle_eky1[i] += eky1[l];
+        particle_ekz1[i] += ekz1[l];
+        particle_ekx2[i] += ekx2[l];
+        particle_eky2[i] += eky2[l];
+        particle_ekz2[i] += ekz2[l];
+        particle_ekx3[i] += ekx3[l];
+        particle_eky3[i] += eky3[l];
+        particle_ekz3[i] += ekz3[l];
+        particle_ekx4[i] += ekx4[l];
+        particle_eky4[i] += eky4[l];
+        particle_ekz4[i] += ekz4[l];
+        particle_ekx5[i] += ekx5[l];
+        particle_eky5[i] += eky5[l];
+        particle_ekz5[i] += ekz5[l];
+        particle_ekx6[i] += ekx6[l];
+        particle_eky6[i] += eky6[l];
+        particle_ekz6[i] += ekz6[l];
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx0[i] *= hx_inv;
+      particle_eky0[i] *= hy_inv;
+      particle_ekz0[i] *= hz_inv;
+      particle_ekx1[i] *= hx_inv;
+      particle_eky1[i] *= hy_inv;
+      particle_ekz1[i] *= hz_inv;
+      particle_ekx2[i] *= hx_inv;
+      particle_eky2[i] *= hy_inv;
+      particle_ekz2[i] *= hz_inv;
+      particle_ekx3[i] *= hx_inv;
+      particle_eky3[i] *= hy_inv;
+      particle_ekz3[i] *= hz_inv;
+      particle_ekx4[i] *= hx_inv;
+      particle_eky4[i] *= hy_inv;
+      particle_ekz4[i] *= hz_inv;
+      particle_ekx5[i] *= hx_inv;
+      particle_eky5[i] *= hy_inv;
+      particle_ekz5[i] *= hz_inv;
+      particle_ekx6[i] *= hx_inv;
+      particle_eky6[i] *= hy_inv;
+      particle_ekz6[i] *= hz_inv;
+
+      // convert D-field to force
+
+      const int type = atom->type[i];
+      const FFT_SCALAR lj0 = B[7*type+6];
+      const FFT_SCALAR lj1 = B[7*type+5];
+      const FFT_SCALAR lj2 = B[7*type+4];
+      const FFT_SCALAR lj3 = B[7*type+3];
+      const FFT_SCALAR lj4 = B[7*type+2];
+      const FFT_SCALAR lj5 = B[7*type+1];
+      const FFT_SCALAR lj6 = B[7*type];
+
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] +
+        lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] +
+        lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf;
+
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] +
+        lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] +
+        lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf;
+
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      if (slabflag != 2)
+        f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] +
+          lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] +
+          lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf;
+    }
+  }
+}
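+
+/* ----------------------------------------------------------------------
+   note: the self-force prefactor used three times above,
+
+     4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3,
+
+   is the arithmetic-mixing analogue of twoqsq = 2*q*q in the charge
+   version: it equals 2 * sum_{s=0..6} lj_s*lj_{6-s}, since each cross
+   term with s != 3 appears twice in the sum.
+------------------------------------------------------------------------- */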
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for no mixing rule and ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my particles, interpolate dispersion field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double lj;
+    int type;
+    double **x = atom->x;
+    double **f = atom->f;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+
+      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF;
+      }
+
+      for (int k = 0; k < nsplit; k++) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int n = 0; n < order_6; n++) {
+          int mz = n+nzsum;
+          FFT_SCALAR z0 = rho2[n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count=7
+          #endif
+          for (int m = 0; m < order_6; m++) {
+            int my = m+nysum;
+            FFT_SCALAR y0 = z0*rho1[m];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma simd
+            #endif
+            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+              int mx = l+nxsum;
+              FFT_SCALAR x0 = y0*rho0[l];
+              ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
+                x0*vdx_brick_none[k][mz][my][mx];
+              eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
+                x0*vdy_brick_none[k][mz][my][mx];
+              ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -=
+                x0*vdz_brick_none[k][mz][my][mx];
+            }
+          }
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx[nsplit], 64);
+      _alignvar(FFT_SCALAR eky[nsplit], 64);
+      _alignvar(FFT_SCALAR ekz[nsplit], 64);
+      for (int k = 0; k < nsplit; k++) {
+        ekx[k] = eky[k] = ekz[k] = ZEROF;
+      }
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int k = 0; k < nsplit; k++) {
+          ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+          eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+          ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+        }
+      }
+
+      // convert E-field to force
+
+      type = atom->type[i];
+      for (int k = 0; k < nsplit; k++) {
+        lj = B[nsplit*type + k];
+        f[i][0] += lj*ekx[k];
+        f[i][1] += lj*eky[k];
+        if (slabflag != 2) f[i][2] += lj*ekz[k];
+      }
+    }
+  }
+}
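+
+/* ----------------------------------------------------------------------
+   note: with no mixing rule each of the nsplit dispersion terms keeps
+   its own brick set, so the gather runs once per split and the fields
+   are recombined with lj = B[nsplit*type + k].  the flat layout
+
+     ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]   // split k, stencil lane l
+
+   gives every split its own aligned slice, so the innermost #pragma simd
+   loop stays contiguous and vectorizable.
+------------------------------------------------------------------------- */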
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for no mixing rule and ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // loop over my particles, interpolate dispersion field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+
+      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        ekx[k] = eky[k] = ekz[k] = ZEROF;
+      }
+
+      for (int k = 0; k < nsplit; k++) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int n = 0; n < order_6; n++) {
+          int mz = n + nzsum;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count=7
+          #endif
+          for (int m = 0; m < order_6; m++) {
+            int my = m + nysum;
+            FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+            FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+            FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma simd
+            #endif
+            for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+              int mx = l + nxsum;
+              ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p *
+                u_brick_none[k][mz][my][mx];
+              eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * eky_p *
+                u_brick_none[k][mz][my][mx];
+              ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * ekz_p *
+                u_brick_none[k][mz][my][mx];
+            }
+          }
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_tot[nsplit], 64);
+      _alignvar(FFT_SCALAR eky_tot[nsplit], 64);
+      _alignvar(FFT_SCALAR ekz_tot[nsplit], 64);
+      for (int k = 0; k < nsplit; k++) {
+        ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF;
+      }
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int k = 0; k < nsplit; k++) {
+          ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+          eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+          ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+        }
+      }
+
+      for (int k = 0; k < nsplit; k++) {
+        ekx_tot[k] *= hx_inv;
+        eky_tot[k] *= hy_inv;
+        ekz_tot[k] *= hz_inv;
+      }
+      // convert D-field to force
+
+      const int type = atom->type[i];
+
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf1 += fsf_coeff1 * sin(ffour_pi * s1);
+
+      flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf2 += fsf_coeff3 * sin(ffour_pi * s2);
+
+      flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf3 += fsf_coeff5 * sin(ffour_pi * s3);
+      for (int k = 0; k < nsplit; k++) {
+        const flt_t lj = B[nsplit*type + k];
+        const flt_t twoljsq = lj*lj * B[k] * 2;
+        flt_t sf = sf1*twoljsq;
+        f[i][0] += lj * ekx_tot[k] - sf;
+        sf = sf2*twoljsq;
+        f[i][1] += lj * eky_tot[k] - sf;
+        sf = sf3*twoljsq;
+        if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf;
+      }
+    }
+  }
+}
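+
+/* ----------------------------------------------------------------------
+   note: as in the other ad variants, each split k above carries its own
+   self-force correction; the factor 2*lj*lj*B[k] takes the role that
+   twoqsq = 2*q*q plays in the charge version, and the correction is
+   applied per split before the contributions are summed into f[i].
+------------------------------------------------------------------------- */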
+
+/* ----------------------------------------------------------------------
+   precompute rho coefficients as a lookup table to save time in make_rho
+   and fieldforce.  Instead of evaluating the polynomial for every atom
+   six times per time step, precompute it once for some number of points.
+------------------------------------------------------------------------- */
+
+void PPPMDispIntel::precompute_rho()
+{
+
+  half_rho_scale = (rho_points - 1.)/2.;
+  half_rho_scale_plus = half_rho_scale + 0.5;
+
+  for (int i = 0; i < rho_points; i++) {
+    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int k = nlower; k <= nupper; k++) {
+      FFT_SCALAR r1 = ZEROF;
+      for (int l = order-1; l >= 0; l--) {
+        r1 = rho_coeff[l][k] + r1*dx;
+      }
+      rho_lookup[i][k-nlower] = r1;
+    }
+    for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+      rho_lookup[i][k] = 0;
+    }
+    if (differentiation_flag == 1) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower; k <= nupper; k++) {
+        FFT_SCALAR r1 = ZEROF;
+        for (int l = order-2; l >= 0; l--) {
+          r1 = drho_coeff[l][k] + r1*dx;
+        }
+        drho_lookup[i][k-nlower] = r1;
+      }
+      for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        drho_lookup[i][k] = 0;
+      }
+    }
+  }
+  for (int i = 0; i < rho_points; i++) {
+    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int k = nlower_6; k <= nupper_6; k++) {
+      FFT_SCALAR r1 = ZEROF;
+      for (int l = order_6-1; l >= 0; l--) {
+        r1 = rho_coeff_6[l][k] + r1*dx;
+      }
+      rho6_lookup[i][k-nlower_6] = r1;
+    }
+    for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+      rho6_lookup[i][k] = 0;
+    }
+    if (differentiation_flag == 1) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower_6; k <= nupper_6; k++) {
+        FFT_SCALAR r1 = ZEROF;
+        for (int l = order_6-2; l >= 0; l--) {
+          r1 = drho_coeff_6[l][k] + r1*dx;
+        }
+        drho6_lookup[i][k-nlower_6] = r1;
+      }
+      for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        drho6_lookup[i][k] = 0;
+      }
+    }
+  }
+}
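+
+/* ----------------------------------------------------------------------
+   note: a minimal usage sketch of the tables built above, with the same
+   names the interpolation routines use.  the scaled offset dx lies in
+   [-1,1] and is mapped onto one of rho_points rows:
+
+     dx = dx*half_rho_scale + half_rho_scale_plus;
+     int idx = dx;                        // truncation selects the row
+     for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++)
+       rho0[k] = rho_lookup[idx][k];      // replaces the Horner loop
+
+   accuracy is set by rho_points; entries past nupper-nlower are zeroed
+   above so the padded aligned loops read zero weight there.
+------------------------------------------------------------------------- */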
+
+/* ----------------------------------------------------------------------
+   Returns 0 if Intel optimizations for PPPM are ignored due to offload
+------------------------------------------------------------------------- */
+
+#ifdef _LMP_INTEL_OFFLOAD
+int PPPMDispIntel::use_base() {
+  return _use_base;
+}
+#endif
diff --git a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h
index 166152004e..65c43dd486 100644
--- a/src/USER-INTEL/pppm_disp_intel.h
+++ b/src/USER-INTEL/pppm_disp_intel.h
@@ -1,238 +1,238 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: William McDoniel (RWTH Aachen University)
-------------------------------------------------------------------------- */
-
-#ifdef KSPACE_CLASS
-
-KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
-
-#else
-
-#ifndef LMP_PPPMINTEL_DISP_H
-#define LMP_PPPMINTEL_DISP_H
-
-#include "pppm_disp.h"
-#include "fix_intel.h"
-
-namespace LAMMPS_NS {
-
-  class PPPMDispIntel : public PPPMDisp {
-  public:
-    PPPMDispIntel(class LAMMPS *, int, char **);
-    virtual ~PPPMDispIntel();
-    virtual void init();
-    virtual void compute(int, int);
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    int use_base();
-    #endif
-    
-  protected:
-    FixIntel *fix;
-
-    int _use_lrt;
-    FFT_SCALAR **perthread_density;
-    FFT_SCALAR *particle_ekx;
-    FFT_SCALAR *particle_eky;
-    FFT_SCALAR *particle_ekz;
-    FFT_SCALAR *particle_ekx0;
-    FFT_SCALAR *particle_eky0;
-    FFT_SCALAR *particle_ekz0;
-    FFT_SCALAR *particle_ekx1;
-    FFT_SCALAR *particle_eky1;
-    FFT_SCALAR *particle_ekz1;
-    FFT_SCALAR *particle_ekx2;
-    FFT_SCALAR *particle_eky2;
-    FFT_SCALAR *particle_ekz2;
-    FFT_SCALAR *particle_ekx3;
-    FFT_SCALAR *particle_eky3;
-    FFT_SCALAR *particle_ekz3;
-    FFT_SCALAR *particle_ekx4;
-    FFT_SCALAR *particle_eky4;
-    FFT_SCALAR *particle_ekz4;
-    FFT_SCALAR *particle_ekx5;
-    FFT_SCALAR *particle_eky5;
-    FFT_SCALAR *particle_ekz5;
-    FFT_SCALAR *particle_ekx6;
-    FFT_SCALAR *particle_eky6;
-    FFT_SCALAR *particle_ekz6;  
-            
-    
-
-    int _use_table;
-    int rho_points;
-    FFT_SCALAR **rho_lookup;
-    FFT_SCALAR **rho6_lookup;
-    FFT_SCALAR **drho_lookup;
-    FFT_SCALAR **drho6_lookup;
-    FFT_SCALAR half_rho_scale, half_rho_scale_plus;
-
-    int _use_packing;
-
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    int _use_base;
-    #endif
-    
-    template<class flt_t, class acc_t>
-    void particle_map(double, double, double,
-                      double, int **, int, int,
-                      int, int, int,
-		      int, int, int,
-		      IntelBuffers<flt_t,acc_t> *buffers);
-    
-    template<class flt_t, class acc_t, int use_table>
-    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        make_rho_c<flt_t,acc_t,1>(buffers);
-      } else {
-        make_rho_c<flt_t,acc_t,0>(buffers);
-      }
-    }
-  
-    template<class flt_t, class acc_t, int use_table>
-    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        make_rho_g<flt_t,acc_t,1>(buffers);
-      } else {
-        make_rho_g<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        make_rho_a<flt_t,acc_t,1>(buffers);
-      } else {
-        make_rho_a<flt_t,acc_t,0>(buffers);
-      }
-    }
-
-    
-    template<class flt_t, class acc_t, int use_table>
-    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        make_rho_none<flt_t,acc_t,1>(buffers);
-      } else {
-        make_rho_none<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_c_ik<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_c_ik<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_c_ad<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_c_ad<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_g_ik<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_g_ik<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_g_ad<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_g_ad<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_a_ik<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_a_ik<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_a_ad<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_a_ad<flt_t,acc_t,0>(buffers);
-      }
-    }    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
-     template<class flt_t, class acc_t>
-    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_none_ik<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_none_ik<flt_t,acc_t,0>(buffers);
-      }
-    }
-    
-    template<class flt_t, class acc_t, int use_table>
-    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
-    template<class flt_t, class acc_t>
-    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
-      if (_use_table == 1) {
-        fieldforce_none_ad<flt_t,acc_t,1>(buffers);
-      } else {
-        fieldforce_none_ad<flt_t,acc_t,0>(buffers);
-      }
-    }
-
-    void precompute_rho();
-    
-  };
-
-}
-#endif
-#endif
-    
-  
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
+
+#else
+
+#ifndef LMP_PPPMINTEL_DISP_H
+#define LMP_PPPMINTEL_DISP_H
+
+#include "pppm_disp.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMDispIntel : public PPPMDisp {
+  public:
+    PPPMDispIntel(class LAMMPS *, int, char **);
+    virtual ~PPPMDispIntel();
+    virtual void init();
+    virtual void compute(int, int);
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    int use_base();
+    #endif
+
+  protected:
+    FixIntel *fix;
+
+    int _use_lrt;
+    FFT_SCALAR **perthread_density;
+    FFT_SCALAR *particle_ekx;
+    FFT_SCALAR *particle_eky;
+    FFT_SCALAR *particle_ekz;
+    FFT_SCALAR *particle_ekx0;
+    FFT_SCALAR *particle_eky0;
+    FFT_SCALAR *particle_ekz0;
+    FFT_SCALAR *particle_ekx1;
+    FFT_SCALAR *particle_eky1;
+    FFT_SCALAR *particle_ekz1;
+    FFT_SCALAR *particle_ekx2;
+    FFT_SCALAR *particle_eky2;
+    FFT_SCALAR *particle_ekz2;
+    FFT_SCALAR *particle_ekx3;
+    FFT_SCALAR *particle_eky3;
+    FFT_SCALAR *particle_ekz3;
+    FFT_SCALAR *particle_ekx4;
+    FFT_SCALAR *particle_eky4;
+    FFT_SCALAR *particle_ekz4;
+    FFT_SCALAR *particle_ekx5;
+    FFT_SCALAR *particle_eky5;
+    FFT_SCALAR *particle_ekz5;
+    FFT_SCALAR *particle_ekx6;
+    FFT_SCALAR *particle_eky6;
+    FFT_SCALAR *particle_ekz6;
+
+    int _use_table;
+    int rho_points;
+    FFT_SCALAR **rho_lookup;
+    FFT_SCALAR **rho6_lookup;
+    FFT_SCALAR **drho_lookup;
+    FFT_SCALAR **drho6_lookup;
+    FFT_SCALAR half_rho_scale, half_rho_scale_plus;
+
+    int _use_packing;
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    int _use_base;
+    #endif
+
+    template<class flt_t, class acc_t>
+    void particle_map(double, double, double,
+                      double, int **, int, int,
+                      int, int, int,
+                      int, int, int,
+                      IntelBuffers<flt_t,acc_t> *buffers);
+
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_c<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_c<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_g<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_g<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_a<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_a<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_none<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_none<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_c_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_c_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_c_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_c_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_g_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_g_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_g_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_g_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_a_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_a_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_a_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_a_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_none_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_none_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_none_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_none_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    void precompute_rho();
+
+  };
+
+}
+#endif
+#endif
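
The make_rho_* and fieldforce_* pairs in the header above all repeat one idiom: the runtime setting _use_table is tested exactly once, in a thin inline wrapper, which forwards to a worker templated on the compile-time constant use_table, so each hot loop gets compiled in two specialized versions with the dead branch removed. A minimal self-contained sketch of that idiom (Kernel, use_table_flag, and the loop bodies are hypothetical stand-ins, not LAMMPS code):

#include <cstdio>

struct Kernel {
  int use_table_flag;                 // runtime flag, like _use_table above

  // worker: use_table is a compile-time constant, so the compiler
  // eliminates the untaken branch from the inner loop entirely
  template<int use_table>
  void run(int n) {
    double acc = 0.0;
    for (int i = 0; i < n; i++) {
      if (use_table) acc += 1.0;      // stand-in for the table-lookup path
      else           acc += 2.0;      // stand-in for the polynomial path
    }
    printf("%s path, acc = %g\n", use_table ? "table" : "polynomial", acc);
  }

  // wrapper: branch once, outside the loop, as the header above does
  void run(int n) {
    if (use_table_flag == 1) run<1>(n);
    else                     run<0>(n);
  }
};

int main() {
  Kernel k{1};
  k.run(4);                           // dispatches to run<1>
  return 0;
}

The same wrapper shape extends to any boolean feature flag that stays fixed for the lifetime of a run.
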
diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp
index 42bdec46ee..8416b6f3a3 100644
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@@ -14,7 +14,7 @@
 /* ----------------------------------------------------------------------
    Contributing authors: William McDoniel (RWTH Aachen University)
                          Rodrigo Canales (RWTH Aachen University)
-			 Markus Hoehnerbach (RWTH Aachen University)
+                         Markus Hoehnerbach (RWTH Aachen University)
                          W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
@@ -62,10 +62,10 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
 
   perthread_density = NULL;
   particle_ekx = particle_eky = particle_ekz = NULL;
-  
+
   rho_lookup = drho_lookup = NULL;
   rho_points = 0;
-  
+
   vdxy_brick = vdz0_brick = NULL;
   work3 = NULL;
   cg_pack = NULL;
@@ -120,20 +120,20 @@ void PPPMIntel::init()
   if ((comm->nthreads > 1) && !_use_lrt) {
     memory->destroy(perthread_density);
     memory->create(perthread_density, comm->nthreads-1,
-		   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
+                   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
                    "pppmintel:perthread_density");
   }
-  
+
   _use_table = fix->pppm_table();
   if (_use_table) {
     rho_points = 5000;
     memory->destroy(rho_lookup);
     memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
-		   "pppmintel:rho_lookup");
+                   "pppmintel:rho_lookup");
     if(differentiation_flag == 1) {
       memory->destroy(drho_lookup);
       memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
-		     "pppmintel:drho_lookup");
+                     "pppmintel:drho_lookup");
     }
     precompute_rho();
   }
@@ -141,7 +141,7 @@ void PPPMIntel::init()
   if (order > INTEL_P3M_MAXORDER)
     error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
 
-  _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) 
+  _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16)
                               && (sizeof(FFT_SCALAR) == sizeof(float))
                               && (differentiation_flag == 0);
   if (_use_packing) {
@@ -149,13 +149,13 @@ void PPPMIntel::init()
     memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
     memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
     memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, 
-			    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-			    "pppmintel:vdxy_brick");
+    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2,
+                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+                            "pppmintel:vdxy_brick");
     memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
-    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, 
-			    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
-			    "pppmintel:vdz0_brick");
+    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2,
+                            nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+                            "pppmintel:vdz0_brick");
     memory->destroy(work3);
     memory->create(work3, 2*nfft_both, "pppmintel:work3");
 
@@ -163,10 +163,10 @@ void PPPMIntel::init()
     delete cg_pack;
     int (*procneigh)[2] = comm->procneigh;
     cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
-			   nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
-			   nylo_out,nyhi_out,nzlo_out,nzhi_out,
-			   procneigh[0][0],procneigh[0][1],procneigh[1][0],
-			   procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+                           nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
+                           nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                           procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                           procneigh[1][1],procneigh[2][0],procneigh[2][1]);
 
     cg_pack->ghost_notify();
     cg_pack->setup();
@@ -484,7 +484,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
   {
     const int nix = nxhi_out - nxlo_out + 1;
     const int niy = nyhi_out - nylo_out + 1;
-  
+
     const flt_t lo0 = boxlo[0];
     const flt_t lo1 = boxlo[1];
     const flt_t lo2 = boxlo[2];
@@ -503,7 +503,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
     memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
 
     for (int i = ifrom; i < ito; i++) {
-  
+
       int nx = part2grid[i][0];
       int ny = part2grid[i][1];
       int nz = part2grid[i][2];
@@ -515,9 +515,9 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
       FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
       FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
-  
+
       _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
+
       if (use_table) {
         dx = dx*half_rho_scale + half_rho_scale_plus;
         int idx = dx;
@@ -527,7 +527,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
         int idz = dz;
         #if defined(LMP_SIMD_COMPILER)
         #pragma simd
-        #endif   
+        #endif
         for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
           rho[0][k] = rho_lookup[idx][k];
           rho[1][k] = rho_lookup[idy][k];
@@ -536,11 +536,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
       } else {
         #if defined(LMP_SIMD_COMPILER)
         #pragma simd
-        #endif   
+        #endif
         for (int k = nlower; k <= nupper; k++) {
           FFT_SCALAR r1,r2,r3;
           r1 = r2 = r3 = ZEROF;
-  
+
           for (int l = order-1; l >= 0; l--) {
             r1 = rho_coeff[l][k] + r1*dx;
             r2 = rho_coeff[l][k] + r2*dy;
@@ -551,24 +551,24 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
           rho[2][k-nlower] = r3;
         }
       }
-  
+
       FFT_SCALAR z0 = fdelvolinv * q[i];
 
       #if defined(LMP_SIMD_COMPILER)
       #pragma loop_count=7
-      #endif   
+      #endif
       for (int n = 0; n < order; n++) {
         int mz = n*nix*niy + nzsum;
         FFT_SCALAR y0 = z0*rho[2][n];
         #if defined(LMP_SIMD_COMPILER)
         #pragma loop_count=7
-        #endif   
+        #endif
         for (int m = 0; m < order; m++) {
           int mzy = m*nix + mz;
           FFT_SCALAR x0 = y0*rho[1][m];
           #if defined(LMP_SIMD_COMPILER)
           #pragma simd
-          #endif   
+          #endif
           for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
             int mzyx = l + mzy;
             my_density[mzyx] += x0*rho[0][l];
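
The make_rho hunks above compute the per-particle stencil weights either by a table lookup (dx scaled by half_rho_scale into an index into rho_lookup) or by evaluating the assignment polynomials in rho_coeff with Horner's rule. A standalone sketch of the Horner path, using the coefficients of the quadratic three-point assignment scheme as an assumed example (array shapes and values are illustrative, not taken from the patch):

#include <cstdio>

int main() {
  const int order = 3;
  // coeff[l][k]: coefficient of dx^l for stencil point k; these are the
  // quadratic (order-3) charge-assignment polynomials, assumed for the demo
  double coeff[3][3] = {
    {0.125, 0.75, 0.125},   // dx^0 terms
    {-0.5,  0.0,  0.5  },   // dx^1 terms
    {0.5,  -1.0,  0.5  }    // dx^2 terms
  };
  double dx = 0.25;         // fractional offset from the nearest grid point

  double w[3];
  for (int k = 0; k < order; k++) {
    double r = 0.0;
    for (int l = order - 1; l >= 0; l--)  // Horner: r = (c2*dx + c1)*dx + c0
      r = coeff[l][k] + r * dx;
    w[k] = r;
  }
  // interpolation weights must sum to 1 for any dx
  printf("w = %g %g %g, sum = %g\n", w[0], w[1], w[2], w[0] + w[1] + w[2]);
  return 0;
}
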
@@ -709,21 +709,21 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
 
       #if defined(LMP_SIMD_COMPILER)
       #pragma loop_count=7
-      #endif   
+      #endif
       for (int n = 0; n < order; n++) {
         int mz = n+nzsum;
         FFT_SCALAR z0 = rho2[n];
         #if defined(LMP_SIMD_COMPILER)
         #pragma loop_count=7
-        #endif   
+        #endif
         for (int m = 0; m < order; m++) {
           int my = m+nysum;
           FFT_SCALAR y0 = z0*rho1[m];
           #if defined(LMP_SIMD_COMPILER)
           #pragma simd
-          #endif   
+          #endif
           for (int l = 0; l < (use_packing ? 2 : 1) *
-		 INTEL_P3M_ALIGNED_MAXORDER; l++) {
+                 INTEL_P3M_ALIGNED_MAXORDER; l++) {
             int mx = l+nxsum;
             FFT_SCALAR x0 = y0*rho0[l];
             if (use_packing) {
@@ -824,13 +824,13 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
     const flt_t fsf_coeff3 = sf_coeff[3];
     const flt_t fsf_coeff4 = sf_coeff[4];
     const flt_t fsf_coeff5 = sf_coeff[5];
-  
+
     int ifrom, ito, tid;
     IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
 
     _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
     _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
+
     for (int i = ifrom; i < ito; i++) {
       int nx = part2grid[i][0];
       int ny = part2grid[i][1];
@@ -838,11 +838,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
       FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
       FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
-  
+
       int nxsum = nx + nlower;
       int nysum = ny + nlower;
       int nzsum = nz + nlower;
-  
+
       if (use_table) {
         dx = dx*half_rho_scale + half_rho_scale_plus;
         int idx = dx;
@@ -852,7 +852,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
         int idz = dz;
         #if defined(LMP_SIMD_COMPILER)
         #pragma simd
-        #endif   
+        #endif
         for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
           rho[0][k] = rho_lookup[idx][k];
           rho[1][k] = rho_lookup[idy][k];
@@ -864,11 +864,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       } else {
         #if defined(LMP_SIMD_COMPILER)
         #pragma simd
-        #endif   
+        #endif
         for (int k = nlower; k <= nupper; k++) {
           FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
           dr1 = dr2 = dr3 = ZEROF;
-  
+
           r1 = rho_coeff[order-1][k];
           r2 = rho_coeff[order-1][k];
           r3 = rho_coeff[order-1][k];
@@ -888,21 +888,21 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
           drho[2][k-nlower] = dr3;
         }
       }
-  
+
       _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
       _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
       _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
-  
+
       particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
-  
+
       #if defined(LMP_SIMD_COMPILER)
       #pragma loop_count=7
-      #endif   
+      #endif
       for (int n = 0; n < order; n++) {
         int mz = n + nzsum;
         #if defined(LMP_SIMD_COMPILER)
         #pragma loop_count=7
-        #endif   
+        #endif
         for (int m = 0; m < order; m++) {
           int my = m + nysum;
           FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
@@ -910,7 +910,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
           FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
           #if defined(LMP_SIMD_COMPILER)
           #pragma simd
-          #endif   
+          #endif
           for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
             int mx = l + nxsum;
             ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
@@ -919,17 +919,17 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
           }
         }
       }
-  
+
       #if defined(LMP_SIMD_COMPILER)
       #pragma simd
       #endif
       for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
-      	particle_ekx[i] += ekx[l];
-      	particle_eky[i] += eky[l];
-      	particle_ekz[i] += ekz[l];
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
       }
     }
-  
+
     #if defined(LMP_SIMD_COMPILER)
     #pragma simd
     #endif
@@ -937,12 +937,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       particle_ekx[i] *= hx_inv;
       particle_eky[i] *= hy_inv;
       particle_ekz[i] *= hz_inv;
-  
+
       // convert E-field to force
-  
+
       const flt_t qfactor = fqqrd2es * q[i];
       const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
-  
+
       const flt_t s1 = x[i].x * hx_inv;
       const flt_t s2 = x[i].y * hy_inv;
       const flt_t s3 = x[i].z * hz_inv;
@@ -950,16 +950,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
       sf += fsf_coeff1 * sin(ffour_pi * s1);
       sf *= twoqsq;
       f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
-  
+
       sf = fsf_coeff2 * sin(ftwo_pi * s2);
       sf += fsf_coeff3 * sin(ffour_pi * s2);
       sf *= twoqsq;
       f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
-  
+
       sf = fsf_coeff4 * sin(ftwo_pi * s3);
       sf += fsf_coeff5 * sin(ffour_pi * s3);
       sf *= twoqsq;
-  
+
       if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
     }
   }
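
The fieldforce_ad hunks above accumulate the per-particle field into fixed-size, 64-byte aligned scratch arrays (_alignvar) inside #pragma simd loops, and only afterwards reduce them into particle_ekx/y/z. A reduced sketch of that pattern, using standard alignas in place of the LAMMPS-specific _alignvar macro (WIDTH and the loop body are placeholders):

#include <cstdio>

int main() {
  const int WIDTH = 8;                  // stands in for INTEL_P3M_ALIGNED_MAXORDER
  alignas(64) double ekx[WIDTH] = {0};  // aligned so the loop can use aligned loads

  // accumulation loop: unit stride, fixed trip count, no dependencies,
  // so a vectorizing compiler can emit SIMD code for it
  for (int l = 0; l < WIDTH; l++)
    ekx[l] += 0.5 * l;                  // placeholder for the drho*u_brick terms

  double particle_ekx = 0.0;
  for (int l = 0; l < WIDTH; l++)       // final reduction, as in the hunk above
    particle_ekx += ekx[l];

  printf("particle_ekx = %g\n", particle_ekx);
  return 0;
}
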
@@ -1000,7 +1000,7 @@ void PPPMIntel::poisson_ik_intel()
       n = 0;
       for (i = 0; i < nfft; i++) {
         eng = s2 * greensfn[i] * (work1[n]*work1[n] +
-				  work1[n+1]*work1[n+1]);
+                                  work1[n+1]*work1[n+1]);
         for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
         if (eflag_global) energy += eng;
         n += 2;
@@ -1069,10 +1069,10 @@ void PPPMIntel::poisson_ik_intel()
     for (j = nylo_in; j <= nyhi_in; j++)
       for (i = nxlo_in; i <= nxhi_in; i++) {
         vdxy_brick[k][j][2*i] = work2[n];
-	vdxy_brick[k][j][2*i+1] = work3[n];
+        vdxy_brick[k][j][2*i+1] = work3[n];
         n += 2;
       }
-  
+
   // z direction gradient
 
   n = 0;
@@ -1091,7 +1091,7 @@ void PPPMIntel::poisson_ik_intel()
     for (j = nylo_in; j <= nyhi_in; j++)
       for (i = nxlo_in; i <= nxhi_in; i++) {
         vdz0_brick[k][j][2*i] = work2[n];
-	vdz0_brick[k][j][2*i+1] = 0.;
+        vdz0_brick[k][j][2*i+1] = 0.;
         n += 2;
       }
 }
@@ -1202,7 +1202,7 @@ double PPPMIntel::memory_usage()
     }
   }
   if (_use_packing) {
-    bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) 
+    bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
                * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
     bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
                * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
@@ -1228,7 +1228,7 @@ void PPPMIntel::pack_buffers()
   {
     int ifrom, ito, tid;
     IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
-                              packthreads, 
+                              packthreads,
                               sizeof(IntelBuffers<float,double>::atom_t));
     if (fix->precision() == FixIntel::PREC_MODE_MIXED)
       fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);
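
The poisson_ik_intel hunks above fill vdxy_brick and vdz0_brick with two values per grid point, at indices 2*i and 2*i+1, so that when _use_packing is active the fieldforce kernel can fetch both gradient components with a single unit-stride sweep. A toy sketch of that interleaved layout (sizes and values are illustrative):

#include <cstdio>

int main() {
  const int NX = 4;
  float vdxy[2 * NX];               // one brick row holding two components

  for (int i = 0; i < NX; i++) {
    vdxy[2 * i]     = 1.0f * i;     // even slots: x-gradient component
    vdxy[2 * i + 1] = 10.0f * i;    // odd slots:  y-gradient component
  }

  // a consumer reads both components in one unit-stride pass
  for (int i = 0; i < NX; i++)
    printf("i=%d  x=%g  y=%g\n", i, vdxy[2 * i], vdxy[2 * i + 1]);
  return 0;
}
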
diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h
index d48a6b709e..e152486b29 100644
--- a/src/USER-INTEL/pppm_intel.h
+++ b/src/USER-INTEL/pppm_intel.h
@@ -14,7 +14,7 @@
 /* ----------------------------------------------------------------------
    Contributing authors: William McDoniel (RWTH Aachen University)
                          Rodrigo Canales (RWTH Aachen University)
-			 Markus Hoehnerbach (RWTH Aachen University)
+                         Markus Hoehnerbach (RWTH Aachen University)
                          W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
@@ -77,7 +77,7 @@ class PPPMIntel : public PPPM {
     template<class flt_t, class acc_t>
     void test_function(IntelBuffers<flt_t,acc_t> *buffers);
 
-  
+
   void precompute_rho();
   template<class flt_t, class acc_t>
   void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index b44870e9b0..81f4586143 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -51,7 +51,7 @@ VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) :
 
 /* ---------------------------------------------------------------------- */
 
-VerletLRTIntel::~VerletLRTIntel() 
+VerletLRTIntel::~VerletLRTIntel()
 {
   #if defined(_LMP_INTEL_LRT_PTHREAD)
   pthread_mutex_destroy(&_kmutex);
@@ -67,10 +67,10 @@ void VerletLRTIntel::init()
   Verlet::init();
 
   _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
-  
+
   #ifdef LMP_INTEL_NOLRT
-  error->all(FLERR, 
-	     "LRT otion for Intel package disabled at compile time");
+  error->all(FLERR,
+             "LRT otion for Intel package disabled at compile time");
   #endif
 }
 
@@ -83,7 +83,7 @@ void VerletLRTIntel::setup(int flag)
   if (_intel_kspace == 0) {
     Verlet::setup(flag);
     return;
-  } 
+  }
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_intel_kspace->use_base()) {
@@ -154,15 +154,15 @@ void VerletLRTIntel::setup(int flag)
   _intel_kspace->setup();
 
   #if defined(_LMP_INTEL_LRT_PTHREAD)
-  pthread_create(&_kspace_thread, &_kspace_attr, 
-		 &VerletLRTIntel::k_launch_loop, this);
+  pthread_create(&_kspace_thread, &_kspace_attr,
+                 &VerletLRTIntel::k_launch_loop, this);
   #elif defined(_LMP_INTEL_LRT_11)
   std::thread kspace_thread;
-  if (kspace_compute_flag) 
-    _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, 
+  if (kspace_compute_flag)
+    _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag,
                                                                  vflag); });
-  else 
-    _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, 
+  else
+    _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag,
                                                                  vflag); });
   #endif
 
@@ -297,8 +297,8 @@ void VerletLRTIntel::run(int n)
     pthread_mutex_unlock(&_kmutex);
     #elif defined(_LMP_INTEL_LRT_11)
     std::thread kspace_thread;
-    if (kspace_compute_flag) 
-      kspace_thread=std::thread([=] { 
+    if (kspace_compute_flag)
+      kspace_thread=std::thread([=] {
         _intel_kspace->compute_first(eflag, vflag);
         timer->stamp(Timer::KSPACE);
       } );
@@ -329,7 +329,7 @@ void VerletLRTIntel::run(int n)
     _kspace_done = 0;
     pthread_mutex_unlock(&_kmutex);
     #elif defined(_LMP_INTEL_LRT_11)
-    if (kspace_compute_flag) 
+    if (kspace_compute_flag)
       kspace_thread.join();
     #endif
 
@@ -367,7 +367,7 @@ void VerletLRTIntel::run(int n)
   }
 
   #if defined(_LMP_INTEL_LRT_PTHREAD)
-  if (run_cancelled) 
+  if (run_cancelled)
     pthread_cancel(_kspace_thread);
   else {
     pthread_mutex_lock(&_kmutex);
@@ -390,9 +390,9 @@ void * VerletLRTIntel::k_launch_loop(void *context)
 {
   VerletLRTIntel * const c = (VerletLRTIntel *)context;
 
-  if (c->kspace_compute_flag) 
+  if (c->kspace_compute_flag)
     c->_intel_kspace->compute_first(c->eflag, c->vflag);
-  else 
+  else
     c->_intel_kspace->compute_dummy(c->eflag, c->vflag);
 
   pthread_mutex_lock(&(c->_kmutex));
@@ -408,7 +408,7 @@ void * VerletLRTIntel::k_launch_loop(void *context)
   pthread_mutex_unlock(&(c->_kmutex));
 
   for (int i = 0; i < n; i++) {
-    
+
     if (c->kspace_compute_flag) {
       c->_intel_kspace->compute_first(c->eflag, c->vflag);
       c->timer->stamp(Timer::KSPACE);
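
verlet_lrt_intel.cpp overlaps the k-space and short-range work: with _LMP_INTEL_LRT_11 the PPPMIntel compute runs inside a std::thread lambda while the main thread proceeds, and the thread is joined before the results are combined (the pthread variant implements the same handshake with a mutex and completion flags). A minimal sketch of the std::thread form of that overlap (compute_kspace and compute_pair are hypothetical stand-ins for the PPPMIntel and pair-style computations):

#include <cstdio>
#include <thread>

static void compute_kspace() { printf("kspace done\n"); }
static void compute_pair()   { printf("pair done\n"); }

int main() {
  std::thread kspace_thread([=] { compute_kspace(); });  // launched as in run()
  compute_pair();        // overlapped short-range work on the main thread
  kspace_thread.join();  // synchronize before combining forces
  printf("forces combined\n");
  return 0;
}
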
-- 
GitLab