diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 302b50124a0429d0f64df1a9979a5265051f8112..7ec83b3207b06c4bbda7d56f2a7d9d94a15d115d 100755
Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ
diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt
index f1eb225fe50aa6fdbf78fc878a5aea75598f6372..571c6c4920eb658b20d03c3540e0341fe8e7ddad 100644
--- a/doc/src/Section_commands.txt
+++ b/doc/src/Section_commands.txt
@@ -892,8 +892,8 @@ KOKKOS, o = USER-OMP, t = OPT.
 "hybrid"_pair_hybrid.html,
 "hybrid/overlay"_pair_hybrid.html,
 "adp (o)"_pair_adp.html,
-"airebo (o)"_pair_airebo.html,
-"airebo/morse (o)"_pair_airebo.html,
+"airebo (oi)"_pair_airebo.html,
+"airebo/morse (oi)"_pair_airebo.html,
 "beck (go)"_pair_beck.html,
 "body"_pair_body.html,
 "bop"_pair_bop.html,
@@ -927,8 +927,8 @@ KOKKOS, o = USER-OMP, t = OPT.
 "dpd/tstat (go)"_pair_dpd.html,
 "dsmc"_pair_dsmc.html,
 "eam (gkiot)"_pair_eam.html,
-"eam/alloy (gkot)"_pair_eam.html,
-"eam/fs (gkot)"_pair_eam.html,
+"eam/alloy (gkiot)"_pair_eam.html,
+"eam/fs (gkiot)"_pair_eam.html,
 "eim (o)"_pair_eim.html,
 "gauss (go)"_pair_gauss.html,
 "gayberne (gio)"_pair_gayberne.html,
@@ -942,9 +942,9 @@ KOKKOS, o = USER-OMP, t = OPT.
 "kim"_pair_kim.html,
 "lcbop"_pair_lcbop.html,
 "line/lj"_pair_line_lj.html,
-"lj/charmm/coul/charmm (ko)"_pair_charmm.html,
+"lj/charmm/coul/charmm (kio)"_pair_charmm.html,
 "lj/charmm/coul/charmm/implicit (ko)"_pair_charmm.html,
-"lj/charmm/coul/long (giko)"_pair_charmm.html,
+"lj/charmm/coul/long (gkio)"_pair_charmm.html,
 "lj/charmm/coul/msm"_pair_charmm.html,
 "lj/charmmfsw/coul/charmmfsh"_pair_charmm.html,
 "lj/charmmfsw/coul/long"_pair_charmm.html,
@@ -990,7 +990,7 @@ KOKKOS, o = USER-OMP, t = OPT.
 "polymorphic"_pair_polymorphic.html,
 "python"_pair_python.html,
 "reax"_pair_reax.html,
-"rebo (o)"_pair_airebo.html,
+"rebo (oi)"_pair_airebo.html,
 "resquared (go)"_pair_resquared.html,
 "snap"_pair_snap.html,
 "soft (go)"_pair_soft.html,
diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index 9eb295e0d08d022201149b2cc2912cae7f78dcf5..a7c3382caab2322da14aba5204fbb953c4a96c8b 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -29,8 +29,10 @@ Bond Styles: fene, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
-Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
-charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
+Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
+buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
+lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
+sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule
 
diff --git a/doc/src/pair_airebo.txt b/doc/src/pair_airebo.txt
index e66ecb637f7714fd2e41c89ffc8a4c88414bee3c..1aa017f2786ac95e60556d84b5a900fcab5a1704 100644
--- a/doc/src/pair_airebo.txt
+++ b/doc/src/pair_airebo.txt
@@ -7,10 +7,13 @@
 :line
 
 pair_style airebo command :h3
+pair_style airebo/intel command :h3
 pair_style airebo/omp command :h3
 pair_style airebo/morse command :h3
+pair_style airebo/morse/intel command :h3
 pair_style airebo/morse/omp command :h3
 pair_style rebo command :h3
+pair_style rebo/intel command :h3
 pair_style rebo/omp command :h3
 
 [Syntax:]
diff --git a/doc/src/pair_charmm.txt b/doc/src/pair_charmm.txt
index ef4ef41c954db0aef17b2998012f85adab936830..75a8e4bff944f8b6786f61bb50aa7eefe1e082c7 100644
--- a/doc/src/pair_charmm.txt
+++ b/doc/src/pair_charmm.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style lj/charmm/coul/charmm command :h3
+pair_style lj/charmm/coul/charmm/intel command :h3
 pair_style lj/charmm/coul/charmm/omp command :h3
 pair_style lj/charmm/coul/charmm/implicit command :h3
 pair_style lj/charmm/coul/charmm/implicit/omp command :h3
diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt
index ce8495affd006b014dba20fe6732051b0fa34faa..a0026432ec6cd52255ca53f8a084c8f10b60454f 100644
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@@ -14,6 +14,7 @@ pair_style eam/omp command :h3
 pair_style eam/opt command :h3
 pair_style eam/alloy command :h3
 pair_style eam/alloy/gpu command :h3
+pair_style eam/alloy/intel command :h3
 pair_style eam/alloy/kk command :h3
 pair_style eam/alloy/omp command :h3
 pair_style eam/alloy/opt command :h3
@@ -21,6 +22,7 @@ pair_style eam/cd command :h3
 pair_style eam/cd/omp command :h3
 pair_style eam/fs command :h3
 pair_style eam/fs/gpu command :h3
+pair_style eam/fs/intel command :h3
 pair_style eam/fs/kk command :h3
 pair_style eam/fs/omp command :h3
 pair_style eam/fs/opt command :h3
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index ac8279949a8abd41aecda963df99bd907b9e629f..6a4c4c14be2ac54c174b7b09fc709afc817803ad 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -14,7 +14,7 @@ SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
index 389a578f7224fb35d65f558d138b28945b7bed27..d4cbdbdb03d7e6b0c08d3409d99e9ea07844eafb 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@@ -7,7 +7,7 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		mpicxx -cxx=icc
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
index b65905440dbaea8db6ddd6d8811074fcf7b81e7f..50433ce4c6be736af395e67455fb7a767fb004c1 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@@ -8,7 +8,7 @@ SHELL = /bin/sh
 
 export OMPI_CXX = icc
 CC =		mpicxx
-OPTFLAGS =      -xAVX -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
+OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
index 736059aa06d7caa58c4a28a506c44690e1474878..275b4839f5d5ed738ff70938435c4ac6795d9e96 100644
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@@ -46,6 +46,7 @@ action npair_intel.h
 action npair_intel.cpp
 action intel_simd.h pair_sw_intel.cpp
 action intel_intrinsics.h pair_tersoff_intel.cpp
+action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 action verlet_lrt_intel.h pppm.cpp
 action verlet_lrt_intel.cpp pppm.cpp
 
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index c02014d0ce9b5e9ea16948b97a78b081dd84fefb..3b8444605775e8525d3f6c65f24f355e865c8bb7 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -4,9 +4,9 @@
                      --------------------------------
                      
              W. Michael Brown (Intel) michael.w.brown at intel.com
+                  Markus Hohnerbach (RWTH Aachen University)
                    William McDoniel (RWTH Aachen University)
                    Rodrigo Canales (RWTH Aachen University)
-                  Markus Hï¿½hnerbach (RWTH Aachen University)
                            Stan Moore (Sandia)
 		   Ahmed E. Ismail (RWTH Aachen University)
                    Paolo Bientinesi (RWTH Aachen University)
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README
index 758c37bf56edd69e7d618ef9d68345a3c9057dfa..434189dd263ecef43212a030f60889c4df0d998b 100644
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@@ -8,6 +8,7 @@
 # in.intel.sw -	        Silicon benchmark with Stillinger-Weber
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
+# in.intel.airebo -     Polyethylene benchmark with AIREBO
 #
 #############################################################################
 
@@ -24,6 +25,7 @@
 # in.intel.sw -	           132.4               161.9
 # in.intel.tersoff -        83.3               101.1
 # in.intel.water -          53.4                90.3
+# in.intel.airebo -          7.3                11.8
 #
 #############################################################################
 
diff --git a/src/USER-INTEL/TEST/in.intel.airebo b/src/USER-INTEL/TEST/in.intel.airebo
new file mode 100644
index 0000000000000000000000000000000000000000..fcd8af4707f9b2e5b008c494bf3e036df225ad8c
--- /dev/null
+++ b/src/USER-INTEL/TEST/in.intel.airebo
@@ -0,0 +1,47 @@
+# AIREBO polyethylene benchmark
+
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 550	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+variable	x index 4
+variable	y index 2
+variable	z index 2
+
+variable	xx equal 17*$x
+variable	yy equal 16*$y
+variable	zz equal 2*$z
+variable	rr equal floor($t*$m)
+variable        root getenv LMP_ROOT
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+variable            root getenv LMP_ROOT
+
+units		    metal
+atom_style	    atomic
+
+read_data	    ${root}/examples/airebo/data.airebo
+
+replicate	    ${xx} ${yy} ${zz}
+
+neighbor	    0.5 bin
+neigh_modify	    delay 5 every 1
+
+pair_style	    airebo 3.0 1 1
+pair_coeff	    * * ${root}/potentials/CH.airebo C H
+
+velocity	    all create 300.0 761341
+
+fix		    1 all nve
+timestep	    0.0005
+
+thermo		    50
+
+if "$p > 0"	then "run_style verlet/power"
+
+if "$w > 0"	then "run $w"
+run		${rr}
diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam
index 5a3b3064afae85b831b00b333a726d242b6e105f..6486b22ee908ef5d219bf127cc4928694047f8c4 100644
--- a/src/USER-INTEL/TEST/in.intel.eam
+++ b/src/USER-INTEL/TEST/in.intel.eam
@@ -5,7 +5,6 @@ variable	w index 10      # Warmup Timesteps
 variable	t index 3100    # Main Run Timesteps
 variable	m index 1       # Main Run Timestep Multiplier
 variable	n index 0       # Use NUMA Mapping for Multi-Node
-variable	b index 3       # Neighbor binsize
 variable	p index 0       # Use Power Measurement
 
 variable	x index 4
diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo
index 05145d79c0d507819b2946f74a59d7c645b99459..7ce7eb4452e38ebb7e1795dd4ba0d770242062df 100644
--- a/src/USER-INTEL/TEST/in.intel.rhodo
+++ b/src/USER-INTEL/TEST/in.intel.rhodo
@@ -5,7 +5,6 @@ variable	w index 10	# Warmup Timesteps
 variable	t index 520	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
 variable	n index 0	# Use NUMA Mapping for Multi-Node
-variable        b index 3       # Neighbor binsize
 variable	p index 0	# Use Power Measurement
 variable	c index 0	# 1 to use collectives for PPPM
 variable        d index 1       # 1 to use 'diff ad' for PPPM
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index 3664bc248bdb77d9c14777f2644f1fb74de0d190..b4b664cb943354c3e38b97919fa4d2c5b53bb5aa 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -30,6 +30,9 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
   _off_map_listlocal = 0;
   _ccachex = 0;
   _ncache_alloc = 0;
+  _ncachetag = 0;
+  _cutneighsq = 0;
+  _cutneighghostsq = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   _separate_buffers = 0;
   _off_f = 0;
@@ -447,12 +450,17 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
     flt_t *ncachez = _ncachez;
     int *ncachej = _ncachej;
     int *ncachejtype = _ncachejtype;
+    int *ncachetag = _ncachetag;
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_ncache) {
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
         nocopy(ncachejtype:alloc_if(0) free_if(1))
+      if (ncachetag) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ncachetag:alloc_if(0) free_if(1))
+      }
     }
     _off_ncache = 0;
     #endif
@@ -462,8 +470,10 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
     lmp->memory->destroy(ncachez);
     lmp->memory->destroy(ncachej);
     lmp->memory->destroy(ncachejtype);
-
+    if (ncachetag)
+      lmp->memory->destroy(ncachetag);
     _ncache_alloc = 0;
+    _ncachetag = 0;
   }
 }
 
@@ -480,7 +490,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
   const int vsize = _ncache_stride * nt;
 
   if (_ncache_alloc) {
-    if (vsize > _ncache_alloc)
+    if (vsize > _ncache_alloc || (need_tag() && _ncachetag == 0))
       free_ncache();
     #ifdef _LMP_INTEL_OFFLOAD
     else if (off_flag && _off_ncache == 0)
@@ -495,6 +505,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
   lmp->memory->create(_ncachez, vsize, "_ncachez");
   lmp->memory->create(_ncachej, vsize, "_ncachej");
   lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
+  if (need_tag())
+    lmp->memory->create(_ncachetag, vsize, "_ncachetag");
 
   _ncache_alloc = vsize;
 
@@ -513,6 +525,14 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
         nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
     }
+    int tsize = vsize;
+    if (!need_tag()) {
+      tsize = 16;
+      lmp->memory->create(_ncachetag, tsize, "_ncachetag");
+    }
+    int *ncachetag = _ncachetag;
+    #pragma offload_transfer target(mic:_cop)			\
+      nocopy(ncachetag:length(tsize) alloc_if(1) free_if(0))
     _off_ncache = 1;
   }
   #endif
@@ -548,7 +568,8 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall,
 /* ---------------------------------------------------------------------- */
 
 template <class flt_t, class acc_t>
-void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes, 
+					    const int use_ghost_cut)
 {
   if (ntypes != _ntypes) {
     if (_ntypes > 0) {
@@ -558,16 +579,34 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
         #pragma offload_transfer target(mic:_cop) \
           nocopy(cutneighsqo:alloc_if(0) free_if(1))
       }
+      // Fetch the ghost-cutoff base pointer BEFORE testing it (0 if unallocated).
+      flt_t * cutneighghostsqo = _cutneighghostsq ? _cutneighghostsq[0] : 0;
+      if (_off_threads > 0 && cutneighghostsqo != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
+      }
       #endif
       lmp->memory->destroy(_cutneighsq);
+      if (_cutneighghostsq != 0) lmp->memory->destroy(_cutneighghostsq);
     }
     if (ntypes > 0) {
       lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+      if (use_ghost_cut)
+	lmp->memory->create(_cutneighghostsq, ntypes, ntypes, 
+			    "_cutneighghostsq");
       #ifdef _LMP_INTEL_OFFLOAD
       flt_t * cutneighsqo = _cutneighsq[0];
+      const int ntypes2 = ntypes * ntypes;
       if (_off_threads > 0 && cutneighsqo != NULL) {
         #pragma offload_transfer target(mic:_cop) \
-          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+          nocopy(cutneighsqo:length(ntypes2) alloc_if(1) free_if(0))
+      }
+      if (use_ghost_cut) {
+        flt_t * cutneighghostsqo = _cutneighghostsq[0];
+        if (_off_threads > 0 && cutneighghostsqo != NULL) {
+          #pragma offload_transfer target(mic:_cop) \
+            nocopy(cutneighghostsqo:length(ntypes2) alloc_if(1) free_if(0))
+        }
       }
       #endif
     }
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 7a7640a20366fc6410a61ae71e3e01ec7d92b7d2..8040715b2e197bea1a1d2eb4dea03b5ce4b6e15b 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -109,12 +109,14 @@ class IntelBuffers {
 
   void free_ncache();
   void grow_ncache(const int off_flag, const int nthreads);
+  void grow_ncachetag(const int off_flag, const int nthreads);
   inline int ncache_stride() { return _ncache_stride; }
   inline flt_t * get_ncachex() { return _ncachex; }
   inline flt_t * get_ncachey() { return _ncachey; }
   inline flt_t * get_ncachez() { return _ncachez; }
   inline int * get_ncachej() { return _ncachej; }
   inline int * get_ncachejtype() { return _ncachejtype; }
+  inline int * get_ncachetag() { return _ncachetag; }
 
   inline int get_max_nbors() {
     int mn = lmp->neighbor->oneatom * sizeof(int) /
@@ -131,7 +133,7 @@ class IntelBuffers {
       _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
-  void set_ntypes(const int ntypes);
+  void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
 
   inline int * firstneigh(const NeighList *list) { return _list_alloc; }
   inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
@@ -162,6 +164,7 @@ class IntelBuffers {
   inline void zero_ev()
     { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
   inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline flt_t ** get_cutneighghostsq() { return _cutneighghostsq; }
   inline int get_off_threads() { return _off_threads; }
   #ifdef _LMP_INTEL_OFFLOAD
   inline void set_off_params(const int n, const int cop,
@@ -274,13 +277,10 @@ class IntelBuffers {
              used_ghost * sizeof(flt_t));
     }
   }
+  #endif
 
   inline int need_tag() { return _need_tag; }
   inline void need_tag(const int nt) { _need_tag = nt; }
-  #else
-  inline int need_tag() { return 0; }
-  inline void need_tag(const int nt) { }
-  #endif
 
   double memory_usage(const int nthreads);
 
@@ -298,7 +298,7 @@ class IntelBuffers {
   int _list_alloc_atoms;
   int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
 
-  flt_t **_cutneighsq;
+  flt_t **_cutneighsq, **_cutneighghostsq;
   int _ntypes;
 
   int _ccache_stride;
@@ -307,7 +307,10 @@ class IntelBuffers {
 
   int _ncache_stride, _ncache_alloc;
   flt_t *_ncachex, *_ncachey, *_ncachez;
-  int *_ncachej, *_ncachejtype;
+  int *_ncachej, *_ncachejtype, *_ncachetag;
+
+  int _need_tag, _host_nmax;
+
   #ifdef LMP_USE_AVXCD
   int _ccache_stride3;
   acc_t * _ccachef;
@@ -324,7 +327,6 @@ class IntelBuffers {
   int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
   int *_off_map_numneigh;
   bool _off_list_alloc;
-  int _need_tag, _host_nmax;
   #endif
 
   int _buf_size, _buf_local_size;
diff --git a/src/USER-INTEL/intel_intrinsics_airebo.h b/src/USER-INTEL/intel_intrinsics_airebo.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b091a4ba1d62988572d40be1dfd662b51304187
--- /dev/null
+++ b/src/USER-INTEL/intel_intrinsics_airebo.h
@@ -0,0 +1,2279 @@
+#ifndef LMP_INTEL_AIREBO_SCALAR
+# ifdef __INTEL_COMPILER
+#  if defined(__MIC__) || defined(__AVX512F__)
+#   define LMP_INTEL_AIREBO_512
+#  elif defined(__AVX__)
+#   define LMP_INTEL_AIREBO_256
+#  else
+#   define LMP_INTEL_AIREBO_SCALAR
+#  endif
+# else
+#  define LMP_INTEL_AIREBO_SCALAR
+# endif
+#endif
+
+#ifdef LMP_INTEL_AIREBO_512
+
+#include <cassert>
+#include <immintrin.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_LEN 8
+#  define FVEC_SUFFIX(a) a##pd
+#  define FVEC_SUFFIX_MASK(a) a##pd_mask
+#  define FVEC_MASK_T __mmask8
+#  define FVEC_VEC_T __m512d
+#  define FVEC_SCAL_T double
+#  define IVEC_NAME ivec8
+#  define FVEC_NAME fvec8pd
+#  define BVEC_NAME bvec8
+#  define AVEC_NAME avec8pd
+#else
+#  undef FVEC_LEN
+#  undef FVEC_SUFFIX
+#  undef FVEC_SUFFIX_MASK
+#  undef FVEC_MASK_T
+#  undef FVEC_VEC_T
+#  undef FVEC_SCAL_T
+#  undef IVEC_NAME
+#  undef FVEC_NAME
+#  undef BVEC_NAME
+#  undef AVEC_NAME
+
+#  define FVEC_LEN 16
+#  define FVEC_SUFFIX(a) a##ps
+#  define FVEC_SUFFIX_MASK(a) a##ps_mask
+#  define FVEC_MASK_T __mmask16
+#  define FVEC_VEC_T __m512
+#  define FVEC_SCAL_T float
+#  define IVEC_NAME ivec16
+#  define FVEC_NAME fvec16ps
+#  define BVEC_NAME bvec16
+#  define AVEC_NAME avec16ps
+#endif
+
+namespace mm512 {
+
+#ifndef __AVX512F__
+
+#ifndef FVEC_FIRST_PASS
+VEC_INLINE static inline __m512i _mm512_mask_expand_epi32(__m512i src, 
+							  __mmask16 k, 
+							  __m512i a) {
+  int buf[16] __attribute__((aligned(64)));
+  _mm512_store_epi32(buf, a);
+  return _mm512_mask_loadunpacklo_epi32(src, k, buf);
+}
+VEC_INLINE static inline __m512i _mm512_maskz_expand_epi32(__mmask16 k, 
+							   __m512i a) {
+  int buf[16] __attribute__((aligned(64)));
+  _mm512_store_epi32(buf, a);
+  return _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, buf);
+}
+VEC_INLINE static inline __m512i _mm512_mask_compress_epi32(__m512i src, 
+							    __mmask16 k, 
+							    __m512i a) {
+  int buf[16] __attribute__((aligned(64)));
+  _mm512_store_epi32(buf, src);
+  _mm512_mask_packstorelo_epi32(buf, k, a);
+  return _mm512_load_epi32(buf);
+}
+VEC_INLINE static inline __m512i _mm512_maskz_compress_epi32(__mmask16 k, 
+							     __m512i a) {
+  int buf[16] __attribute__((aligned(64))) = {0};
+  _mm512_mask_packstorelo_epi32(buf, k, a);
+  return _mm512_load_epi32(buf);
+}
+
+VEC_INLINE static inline void _mm512_mask_compressstoreu_epi32(int * dest, 
+							       __mmask16 mask, 
+							       __m512i src) {
+  _mm512_mask_packstorelo_epi32(dest, mask, src);
+  _mm512_mask_packstorehi_epi32(dest + 16, mask, src);
+}
+
+VEC_INLINE static inline __m512i _mm512_mask_loadu_epi32(__m512i src, 
+							 __mmask16 k, 
+							 const int * mem_addr) {
+  assert((k & (k + 1)) == 0);
+  __m512i ret = _mm512_mask_loadunpacklo_epi32(src, k, mem_addr);
+  ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16);
+  return ret;
+}
+VEC_INLINE static inline __m512i _mm512_maskz_loadu_epi32(__mmask16 k, 
+							const int * mem_addr) {
+  assert((k & (k + 1)) == 0);
+  __m512i ret = _mm512_mask_loadunpacklo_epi32(_mm512_setzero_epi32(), k, 
+					       mem_addr);
+  ret = _mm512_mask_loadunpackhi_epi32(ret, k, mem_addr + 16);
+  return ret;
+}
+VEC_INLINE static inline void _mm512_mask_storeu_epi32(int * dest, 
+						       __mmask16 mask, 
+						       __m512i src) {
+  assert((mask & (mask + 1)) == 0);
+  _mm512_mask_packstorelo_epi32(dest, mask, src);
+  _mm512_mask_packstorehi_epi32(dest + 16, mask, src);
+}
+#endif
+
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_expand_)
+  (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) {
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64)));
+  FVEC_SUFFIX(_mm512_store_)(buf, a);
+  return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(src, k, buf);
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_expand_)
+  (__mmask16 k, FVEC_VEC_T a) {
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64)));
+  FVEC_SUFFIX(_mm512_store_)(buf, a);
+  return FVEC_SUFFIX(_mm512_mask_loadunpacklo_)(FVEC_SUFFIX(_mm512_setzero_)(),
+						k, buf);
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_mask_compress_)
+  (FVEC_VEC_T src, __mmask16 k, FVEC_VEC_T a) {
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64)));
+  FVEC_SUFFIX(_mm512_store_)(buf, src);
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a);
+  return FVEC_SUFFIX(_mm512_load_)(buf);
+}
+VEC_INLINE static inline FVEC_VEC_T FVEC_SUFFIX(_mm512_maskz_compress_)
+  (__mmask16 k, FVEC_VEC_T a) {
+  FVEC_SCAL_T buf[FVEC_LEN] __attribute__((aligned(64))) = {0};
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(buf, k, a);
+  return FVEC_SUFFIX(_mm512_load_)(buf);
+}
+VEC_INLINE static inline void FVEC_SUFFIX(_mm512_mask_storeu_)
+  (FVEC_SCAL_T * dest, FVEC_MASK_T mask, FVEC_VEC_T src) {
+  assert((mask & (mask + 1)) == 0);
+  FVEC_SUFFIX(_mm512_mask_packstorelo_)(dest, mask, src);
+  FVEC_SUFFIX(_mm512_mask_packstorehi_)(dest + FVEC_LEN, mask, src);
+}
+#endif
+
+
+class FVEC_NAME;
+class IVEC_NAME;
+class AVEC_NAME;
+class BVEC_NAME {
+  friend class FVEC_NAME;
+  friend class IVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==16
+  friend class avec16pd;
+# endif
+  FVEC_MASK_T val_;
+  VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {}
+public:
+  VEC_INLINE BVEC_NAME() {}
+  VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kand(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kandn(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME knot(const BVEC_NAME &a) {
+    return _mm512_knot(a.val_);
+  }
+  VEC_INLINE static int kortestz(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return _mm512_kortestz(a.val_, b.val_);
+  }
+  VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask, 
+					     const BVEC_NAME &a) {
+    const __m512i c_i1 = _mm512_set1_epi32(1);
+    __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(),
+						c_i1);
+    __m512i compressed = _mm512_mask_compress_epi32(_mm512_undefined_epi32(),
+						    mask.val_, a_int_vec);
+    return _mm512_cmpeq_epi32_mask(compressed, c_i1);
+  }
+  VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src, 
+					  const BVEC_NAME &mask,
+					  const BVEC_NAME &a) {
+    const __m512i c_i1 = _mm512_set1_epi32(1);
+    __m512i a_int_vec = _mm512_mask_blend_epi32(a.val_, _mm512_setzero_epi32(),
+						c_i1);
+    __m512i src_int_vec = _mm512_mask_blend_epi32(src.val_, 
+						  _mm512_setzero_epi32(), c_i1);
+    __m512i compressed = _mm512_mask_expand_epi32(src_int_vec, mask.val_,
+						  a_int_vec);
+    return _mm512_cmpeq_epi32_mask(compressed, c_i1);
+  }
+  VEC_INLINE static BVEC_NAME full() {
+    return static_cast<FVEC_MASK_T>(0xFFFF);
+  }
+  VEC_INLINE static BVEC_NAME empty() {
+    return 0;
+  }
+  VEC_INLINE static BVEC_NAME only(int n) {
+    return full().val_ >> (FVEC_LEN - n);
+  }
+  VEC_INLINE static BVEC_NAME after(int n) {
+    return full().val_ << n;
+  }
+  VEC_INLINE static BVEC_NAME onlyafter(int only, int after) {
+    return (full().val_ >> (FVEC_LEN - only)) << after;
+  }
+  VEC_INLINE static int popcnt(const BVEC_NAME &a) {
+    return _popcnt32(a.val_);
+  }
+  VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) {
+    return _mm512_kortestz(a.val_, a.val_);
+  }
+  VEC_INLINE static bool test_any_set(const BVEC_NAME &a) {
+    return ! test_all_unset(a);
+  }
+  VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    return a.val_ & (1 << i);
+  }
+  VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const {
+    return _mm512_kand(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const {
+    return _mm512_kor(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator ~() const {
+    return _mm512_knot(val_);
+  }
+};
+
+class IVEC_NAME {
+  friend class FVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==16
+  friend class avec16pd;
+# endif
+  __m512i val_;
+  VEC_INLINE IVEC_NAME(const __m512i &v) : val_(v) {}
+public:
+  static const int VL = 16;
+  VEC_INLINE IVEC_NAME() {}
+
+  #define IVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a,	     \
+      const IVEC_NAME &b) {					     \
+      return _mm512_##the_name##_epi32_mask(a.val_, b.val_);         \
+    }								     \
+    VEC_INLINE static BVEC_NAME mask_##the_name(			\
+						const BVEC_NAME &mask,	\
+						  const IVEC_NAME &a,	\
+						  const IVEC_NAME &b    \
+						  ) {			\
+      return _mm512_mask_##the_name##_epi32_mask(			\
+      mask.val_, a.val_, b.val_);					\
+    }
+  IVEC_MASK_BINFN_B(cmpeq)
+  IVEC_MASK_BINFN_B(cmplt)
+  IVEC_MASK_BINFN_B(cmpneq)
+  IVEC_MASK_BINFN_B(cmpgt)
+
+  #define IVEC_MASK_BINFN_I(the_name)					\
+    VEC_INLINE static IVEC_NAME mask_##the_name(			\
+        const IVEC_NAME &src, const BVEC_NAME &mask,                    \
+        const IVEC_NAME &a, const IVEC_NAME &b                          \
+    ) {                                                                 \
+       return _mm512_mask_##the_name##_epi32(				\
+        src.val_, mask.val_, a.val_, b.val_);                           \
+    }
+  IVEC_MASK_BINFN_I(add)
+  VEC_INLINE static IVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_blend_epi32(mask.val_, a.val_, b.val_);
+  }
+
+  #define IVEC_BINFN_I(the_name)                                     \
+    VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a,	     \
+					 const IVEC_NAME &b) {	     \
+      return _mm512_##the_name##_epi32(a.val_, b.val_);              \
+    }
+  IVEC_BINFN_I(mullo)
+  IVEC_BINFN_I(srlv)
+  VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) {
+    return _mm512_and_epi32(a.val_, b.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME mask_expand(
+      const IVEC_NAME &src, const BVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_expand_epi32(src.val_,
+      a.val_, b.val_);
+  }
+  VEC_INLINE static IVEC_NAME masku_compress(
+      const BVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return _mm512_mask_compress_epi32(_mm512_undefined_epi32(), a.val_, b.val_);
+  }
+
+  VEC_INLINE static int at(const IVEC_NAME &a, int b) {
+    int data[16] __attribute__((aligned(64)));
+    _mm512_store_epi32(data, a.val_);
+    return data[b];
+  }
+
+  VEC_INLINE static IVEC_NAME load(const int * src) {
+    return _mm512_load_epi32(src);
+  }
+  VEC_INLINE static IVEC_NAME mask_loadu(const BVEC_NAME &mask, 
+                                         const int * src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    return _mm512_mask_loadu_epi32(_mm512_undefined_epi32(), mask.val_, src);
+  }
+  VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, 
+                                          const int * src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    return _mm512_maskz_loadu_epi32(mask.val_, src);
+  }
+  VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, int * dest, 
+    const IVEC_NAME &src) {
+    assert((mask.val_ & (mask.val_ + 1)) == 0);
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    _mm512_mask_storeu_epi32(dest, mask.val_, src.val_);
+  }
+  VEC_INLINE static void store(int * dest, const IVEC_NAME &src) {
+    _mm512_store_epi32(dest, src.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME mask_gather(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx, 
+      const int * mem, const int scale
+  ) {
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    assert(scale == sizeof(int));
+    return _mm512_mask_i32gather_epi32(src.val_, mask.val_, idx.val_, mem, 
+      sizeof(int));
+  }
+  VEC_INLINE static void mask_i32scatter(
+      int * mem, const BVEC_NAME &mask, const IVEC_NAME &idx, 
+      const IVEC_NAME &a, const int scale
+  ) {
+    assert(mask.val_ <= BVEC_NAME::full().val_);
+    assert(scale == sizeof(int));
+    _mm512_mask_i32scatter_epi32(mem, mask.val_, idx.val_, a.val_, sizeof(int));
+  }
+
+  VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest,
+    const IVEC_NAME &src) {
+    _mm512_mask_compressstoreu_epi32(dest, mask.val_, src.val_);
+  }
+
+  VEC_INLINE static IVEC_NAME set1(int i) {
+    return _mm512_set1_epi32(i);
+  }
+  VEC_INLINE static IVEC_NAME setzero() {
+    return _mm512_setzero_epi32();
+  }
+  VEC_INLINE static IVEC_NAME undefined() {
+    return _mm512_undefined_epi32();
+  }
+
+  // Lane-wise 32-bit integer addition.
+  VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const {
+    return _mm512_add_epi32(this->val_, b.val_);
+  }
+  // Debug helper: prints `str` followed by the first FVEC_LEN lanes of `a`.
+  VEC_INLINE static void print(const char * str, const IVEC_NAME &a) {
+    // store() performs an aligned store of the full 512-bit register, i.e.
+    // 16 ints, so the scratch buffer needs 16 elements on a 64-byte
+    // boundary. An 8-element, 32-byte aligned buffer would be overrun.
+    int data[16] __attribute__((aligned(64)));
+    store(data, a);
+    printf("%s:", str);
+    for (int i = 0; i < FVEC_LEN; i++) {
+      printf(" %d", data[i]);
+    }
+    printf("\n");
+  }
+};
+
+// Wrapper around one 512-bit register holding FVEC_LEN values of type
+// FVEC_SCAL_T. Each operation forwards to the _mm512_* intrinsic whose
+// suffix is supplied by FVEC_SUFFIX / FVEC_SUFFIX_MASK (defined earlier in
+// the file). All operations are static functions or const operators; the
+// raw register is only reachable from the friend classes.
+class FVEC_NAME {
+  friend class AVEC_NAME;
+#if FVEC_LEN==16
+  friend class avec16pd;
+#endif
+  FVEC_VEC_T val_;
+  VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {}
+public:
+  static const int VL = FVEC_LEN;
+  VEC_INLINE FVEC_NAME() {}
+  // Returns lane `i` of `a` by spilling the register to an aligned buffer.
+  VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    FVEC_SCAL_T data[FVEC_LEN] __attribute__((aligned(64)));
+    FVEC_SUFFIX(_mm512_store_)(data, a.val_);
+    return data[i];
+  }
+  VEC_INLINE static bool fast_compress() { return true; }
+
+  // Generates a static comparison `the_name(a, b)` returning a BVEC_NAME,
+  // plus a `mask_`-prefixed variant that also passes `mask` to the intrinsic.
+  #define FVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b) {	     \
+      return FVEC_SUFFIX_MASK(_mm512_##the_name##_)(a.val_, b.val_); \
+    }                                                                \
+    VEC_INLINE static BVEC_NAME mask_##the_name(                     \
+        const BVEC_NAME &mask,                                       \
+        const FVEC_NAME &a, const FVEC_NAME &b                       \
+    ) {                                                              \
+      return FVEC_SUFFIX_MASK(_mm512_mask_##the_name##_)(            \
+        mask.val_, a.val_, b.val_);                                  \
+    }
+  FVEC_MASK_BINFN_B(cmple)
+  FVEC_MASK_BINFN_B(cmplt)
+  FVEC_MASK_BINFN_B(cmpneq)
+  FVEC_MASK_BINFN_B(cmpnle)
+  FVEC_MASK_BINFN_B(cmpnlt)
+
+  // Generates a static unary function `the_name(a)` returning a vector.
+  #define FVEC_UNFN_F(the_name)                                      \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) {       \
+      return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_);              \
+    }
+  FVEC_UNFN_F(abs)
+  FVEC_UNFN_F(exp)
+  FVEC_UNFN_F(invsqrt)
+  FVEC_UNFN_F(recip)
+  FVEC_UNFN_F(sqrt)
+
+  // Generates a masked unary function `mask_<name>(src, mask, a)`; `src`
+  // and `mask` are passed straight through to the masked intrinsic.
+  #define FVEC_MASK_UNFN_F(the_name)                                 \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a                                           \
+    ) {                                                              \
+      return FVEC_SUFFIX(_mm512_mask_##the_name##_)(                 \
+        src.val_, mask.val_, a.val_);                                \
+    }
+  FVEC_MASK_UNFN_F(cos)
+  FVEC_MASK_UNFN_F(recip)
+  FVEC_MASK_UNFN_F(sqrt)
+
+  // Generates a static binary function `the_name(a, b)` returning a vector.
+  #define FVEC_BINFN_F(the_name)                                     \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b) {       \
+      return FVEC_SUFFIX(_mm512_##the_name##_)(a.val_, b.val_);      \
+    }
+  FVEC_BINFN_F(max)
+  FVEC_BINFN_F(min)
+
+  // Generates a masked binary function `mask_<name>(src, mask, a, b)`.
+  #define FVEC_MASK_BINFN_F(the_name)                                \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a, const FVEC_NAME &b                       \
+    ) {                                                              \
+      return FVEC_SUFFIX(_mm512_mask_##the_name##_)(                 \
+        src.val_, mask.val_, a.val_, b.val_);                        \
+    }
+  FVEC_MASK_BINFN_F(add)
+  FVEC_MASK_BINFN_F(div)
+  FVEC_MASK_BINFN_F(mul)
+  FVEC_MASK_BINFN_F(sub)
+  // Per-lane select between `a` and `b` under `mask` (mask_blend intrinsic).
+  VEC_INLINE static FVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_blend_)(mask.val_, a.val_, b.val_);
+  }
+
+  // Expand under mask `a`: consecutive low lanes of `b` go to the lanes
+  // selected by `a`; `src` is passed as the source for unselected lanes.
+  VEC_INLINE static FVEC_NAME mask_expand(
+      const FVEC_NAME &src, const BVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_expand_)(src.val_,
+      a.val_, b.val_);
+  }
+  // Compress under mask `a`: the selected lanes of `b` are packed into the
+  // low lanes; the remaining lanes come from an undefined source vector.
+  VEC_INLINE static FVEC_NAME masku_compress(
+      const BVEC_NAME &a, const FVEC_NAME &b
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_compress_)(FVEC_SUFFIX(_mm512_undefined_)(),
+						a.val_, b.val_);
+  }
+
+  VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) {
+    return FVEC_SUFFIX(_mm512_set1_)(a);
+  }
+  VEC_INLINE static FVEC_NAME setzero() {
+    return FVEC_SUFFIX(_mm512_setzero_)();
+  }
+  VEC_INLINE static FVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm512_undefined_)();
+  }
+
+  // Aligned load / store; the unmasked forms use the aligned intrinsics.
+  VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) {
+    return FVEC_SUFFIX(_mm512_load_)(mem);
+  }
+  VEC_INLINE static void mask_storeu(const BVEC_NAME &mask, FVEC_SCAL_T * dest,
+				       const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm512_mask_storeu_)(dest, mask.val_, a.val_);
+  }
+  VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm512_store_)(dest, a.val_);
+  }
+
+  // Gathers from `mem` at the 32-bit indices in `idx`. `scale` is only
+  // asserted; the intrinsic always receives sizeof(FVEC_SCAL_T). With
+  // FVEC_LEN==8 the "lo" gather variant is used.
+  VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, 
+				     const FVEC_SCAL_T * mem, 
+				     const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_i32logather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_i32gather_)(idx.val_, mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+  // Masked form of gather(); `src` and `mask` are forwarded to the intrinsic.
+  VEC_INLINE static FVEC_NAME mask_gather(
+      const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
+                       mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
+                       mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+
+  // Gathers three adjacent elements per index: out_k receives the gather
+  // from `mem + k` for k = 0..2.
+  VEC_INLINE static void gather_3_adjacent(const IVEC_NAME &idx, 
+					   const FVEC_SCAL_T * mem, 
+					   const int scale, 
+					   FVEC_NAME * out_0, 
+					   FVEC_NAME * out_1, 
+					   FVEC_NAME * out_2) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    *out_0 = FVEC_NAME::gather(idx, mem + 0, scale);
+    *out_1 = FVEC_NAME::gather(idx, mem + 1, scale);
+    *out_2 = FVEC_NAME::gather(idx, mem + 2, scale);
+  }
+  // Same as gather_3_adjacent, for four adjacent elements (k = 0..3).
+  VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, 
+					   const FVEC_SCAL_T * mem, 
+					   const int scale, FVEC_NAME * out_0,
+					   FVEC_NAME * out_1, 
+					   FVEC_NAME * out_2, 
+					   FVEC_NAME * out_3) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    *out_0 = FVEC_NAME::gather(idx, mem + 0, scale);
+    *out_1 = FVEC_NAME::gather(idx, mem + 1, scale);
+    *out_2 = FVEC_NAME::gather(idx, mem + 2, scale);
+    *out_3 = FVEC_NAME::gather(idx, mem + 3, scale);
+  }
+
+  // Horizontal sums: over the lanes selected by `mask`, or over all lanes.
+  VEC_INLINE static FVEC_SCAL_T mask_reduce_add(const BVEC_NAME &mask, 
+						const FVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm512_mask_reduce_add_)(mask.val_, a.val_);
+  }
+  VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm512_reduce_add_)(a.val_);
+  }
+
+  // Reinterprets the register bits as 32-bit integers. With FVEC_LEN==8 the
+  // even-numbered 32-bit elements (mask 0x5555) are compressed into the low
+  // lanes and the rest zeroed; otherwise this is a plain bit cast.
+  VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) {
+#   if FVEC_LEN==8
+    return _mm512_maskz_compress_epi32(0x5555, _mm512_castpd_si512(a.val_));
+#   else
+    return _mm512_castps_si512(a.val_);
+#   endif
+  }
+
+  // Forwards to the masked sincos intrinsic: one result is returned and the
+  // other is written through `cos`; `src_a` / `src_b` are passed as the
+  // pass-through sources for lanes not selected by `mask`.
+  VEC_INLINE static FVEC_NAME mask_sincos(
+      FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b,
+      const BVEC_NAME &mask, const FVEC_NAME &arg
+  ) {
+    return FVEC_SUFFIX(_mm512_mask_sincos_)(&cos->val_, src_a.val_, src_b.val_,
+      mask.val_, arg.val_);
+  }
+
+  // Generates a lane-wise arithmetic operator from the named intrinsic.
+  #define FVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_);            \
+  }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+
+  // Issues a gather prefetch (T0 hint) for the addresses `mem + a[i]`;
+  // compiles to nothing unless __AVX512PF__ is defined.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem, 
+      sizeof(FVEC_SCAL_T), _MM_HINT_T0);
+    #endif
+  }
+};
+
+// Accumulator vector with the same element type and width as FVEC_NAME.
+// It supports only what accumulation needs: conversion from FVEC_NAME,
+// masked gather / scatter, subtraction and gather prefetch.
+class AVEC_NAME {
+  FVEC_VEC_T val_;
+  VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {}
+public:
+  VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {}
+  // Returns a vector with unspecified contents.
+  VEC_INLINE static AVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm512_undefined_)();
+  }
+  // Masked gather from `mem` at the indices in `idx`; `src` and `mask` are
+  // forwarded to the intrinsic. `scale` is only asserted: the intrinsic
+  // always receives sizeof(FVEC_SCAL_T).
+  VEC_INLINE static AVEC_NAME mask_gather(
+      const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    return FVEC_SUFFIX(_mm512_mask_i32logather_)(src.val_, mask.val_, idx.val_,
+                                                 mem, sizeof(FVEC_SCAL_T));
+#   else
+    return FVEC_SUFFIX(_mm512_mask_i32gather_)(src.val_, mask.val_, idx.val_,
+                                               mem, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+  // Masked scatter of `a` to `mem` at the indices in `idx`; same `scale`
+  // contract as mask_gather.
+  VEC_INLINE static void mask_i32loscatter(
+      FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const AVEC_NAME &a, const int scale
+  ) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+#   if FVEC_LEN==8
+    FVEC_SUFFIX(_mm512_mask_i32loscatter_)(mem, mask.val_, idx.val_, a.val_,
+                                           sizeof(FVEC_SCAL_T));
+#   else
+    FVEC_SUFFIX(_mm512_mask_i32scatter_)(mem, mask.val_, idx.val_, a.val_,
+                                         sizeof(FVEC_SCAL_T));
+#   endif
+  }
+
+  // Generates a lane-wise arithmetic operator from the named intrinsic.
+  #define AVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm512_##the_name##_)(this->val_, b.val_);            \
+  }
+  AVEC_BINOP(-, sub)
+
+  // Gather prefetch with T0 hint. The prefetch intrinsic belongs to the
+  // AVX512PF extension, so it is guarded exactly like
+  // FVEC_NAME::gather_prefetch0; without AVX512PF this is a no-op, which is
+  // safe because a prefetch is only a hint.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
+                                      sizeof(FVEC_SCAL_T), _MM_HINT_T0);
+    #endif
+  }
+};
+
+#if FVEC_LEN==16
+// Double-precision accumulator paired with a 16-lane single-precision
+// FVEC_NAME: the 16 values are kept as two 8-lane double vectors, `lo_` for
+// lanes 0..7 and `hi_` for lanes 8..15.
+class avec16pd {
+  __m512d lo_, hi_;
+  VEC_INLINE avec16pd(const __m512d &lo, const __m512d &hi) : lo_(lo), hi_(hi)
+    {}
+  // Upper 8 bits of a 16-lane mask, i.e. the mask for `hi_`.
+  VEC_INLINE static __mmask8 get_bvec_hi(__mmask16 a) {
+    return a >> 8;
+  }
+  // Swaps the 256-bit halves so the upper 8 indices land in the low half,
+  // where the "lo" gather / scatter intrinsics read them.
+  VEC_INLINE static __m512i get_ivec_hi(__m512i a) {
+    return _mm512_permute4f128_epi32(a, _MM_PERM_BADC);
+  }
+public:
+  // Widens the 16 floats of `a` into two vectors of 8 doubles.
+  VEC_INLINE avec16pd(const FVEC_NAME &a) {
+    lo_ = _mm512_cvtpslo_pd(a.val_);
+    hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC));
+  }
+  // Returns a vector pair with unspecified contents.
+  VEC_INLINE static avec16pd undefined() {
+    return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd());
+  }
+  // Masked gather of doubles from `mem`, done as two 8-lane gathers.
+  // `scale` is only asserted; the intrinsics receive sizeof(double).
+  VEC_INLINE static avec16pd mask_gather(
+      const avec16pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const double * mem, const int scale
+  ) {
+    assert(scale == sizeof(double));
+    __m512d lo = _mm512_mask_i32logather_pd(src.lo_, mask.val_, idx.val_, mem,
+                                            sizeof(double));
+    __m512d hi = _mm512_mask_i32logather_pd(src.hi_, get_bvec_hi(mask.val_),
+                                            get_ivec_hi(idx.val_), mem,
+                                            sizeof(double));
+    return avec16pd(lo, hi);
+  }
+  // Masked scatter of doubles to `mem`, done as two 8-lane scatters.
+  VEC_INLINE static void mask_i32loscatter(
+      double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const avec16pd &a, const int scale
+  ) {
+    assert(scale == sizeof(double));
+    _mm512_mask_i32loscatter_pd(mem, mask.val_, idx.val_, a.lo_,
+                                sizeof(double));
+    _mm512_mask_i32loscatter_pd(mem, get_bvec_hi(mask.val_),
+                                get_ivec_hi(idx.val_), a.hi_, sizeof(double));
+  }
+
+  // Generates a lane-wise operator applied to both halves.
+  #define AVEC2_BINOP(the_sym, the_name)                                    \
+    VEC_INLINE inline avec16pd operator the_sym(const avec16pd &b) const {  \
+    __m512d lo = _mm512_##the_name##_pd(this->lo_, b.lo_);                  \
+    __m512d hi = _mm512_##the_name##_pd(this->hi_, b.hi_);                  \
+    return avec16pd(lo, hi);                                                \
+  }
+  AVEC2_BINOP(-, sub)
+
+  // Gather prefetch with T0 hint. The prefetch intrinsic belongs to the
+  // AVX512PF extension, so it is guarded exactly like
+  // FVEC_NAME::gather_prefetch0; without AVX512PF this is a no-op.
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    #ifdef __AVX512PF__
+    _mm512_mask_prefetch_i32gather_ps(a.val_, BVEC_NAME::full().val_, mem,
+                                      sizeof(double), _MM_HINT_T0);
+    #endif
+  }
+};
+#endif
+
+}
+
+
+#ifdef FVEC_FIRST_PASS
+
+// Maps a (compute type, accumulator type) pair to the mm512 vector classes
+// used for it: fvec (floating point), ivec (integer), bvec (mask) and avec
+// (accumulator). Only the three specializations below are defined.
+template<typename flt_t, typename acc_t>
+struct intr_types;
+
+// Double precision throughout: 8 lanes.
+template<>
+struct intr_types<double,double> {
+  typedef mm512::fvec8pd fvec;
+  typedef mm512::ivec8 ivec;
+  typedef mm512::bvec8 bvec;
+  typedef mm512::avec8pd avec;
+};
+
+// Single precision throughout: 16 lanes.
+template<>
+struct intr_types<float,float> {
+  typedef mm512::fvec16ps fvec;
+  typedef mm512::ivec16 ivec;
+  typedef mm512::bvec16 bvec;
+  typedef mm512::avec16ps avec;
+};
+
+// Single-precision compute with double-precision accumulation (avec16pd).
+template<>
+struct intr_types<float,double> {
+  typedef mm512::fvec16ps fvec;
+  typedef mm512::ivec16 ivec;
+  typedef mm512::bvec16 bvec;
+  typedef mm512::avec16pd avec;
+};
+
+#endif
+
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_FIRST_PASS
+#  include "intel_intrinsics_airebo.h"
+#endif
+
+#endif
+
+#ifdef LMP_INTEL_AIREBO_256
+
+#include <cassert>
+#include <immintrin.h>
+#include <stdint.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_LEN 4
+#  define FVEC_SUFFIX(a) a##pd
+#  define FVEC_MASK_T __m256d
+#  define FVEC_VEC_T __m256d
+#  define FVEC_SCAL_T double
+#  define IVEC_NAME ivec4
+#  define FVEC_NAME fvec4pd
+#  define BVEC_NAME bvec4
+#  define AVEC_NAME avec4pd
+#else
+#  undef FVEC_LEN
+#  undef FVEC_SUFFIX
+#  undef FVEC_SUFFIX_MASK
+#  undef FVEC_MASK_T
+#  undef FVEC_VEC_T
+#  undef FVEC_SCAL_T
+#  undef IVEC_NAME
+#  undef FVEC_NAME
+#  undef BVEC_NAME
+#  undef AVEC_NAME
+
+#  define FVEC_LEN 8
+#  define FVEC_SUFFIX(a) a##ps
+#  define FVEC_MASK_T __m256
+#  define FVEC_VEC_T __m256
+#  define FVEC_SCAL_T float
+#  define IVEC_NAME ivec8
+#  define FVEC_NAME fvec8ps
+#  define BVEC_NAME bvec8
+#  define AVEC_NAME avec8ps
+#endif
+
+
+
+namespace mm256 {
+
+//#define __AVX2__ __AVX2__
+
+#if !defined(__AVX2__) && !defined(FVEC_FIRST_PASS)
+
+// Function body for emulating a 256-bit integer binary operation without
+// AVX2: splits the operands `a` and `b` (names expected in the enclosing
+// function) into 128-bit halves, applies the 128-bit intrinsic `op` to each
+// half and returns the recombined 256-bit result.
+#define IVEC_EM_BIN(op) \
+  __m128i a_lo = _mm256_castsi256_si128(a);  \
+  __m128i b_lo = _mm256_castsi256_si128(b);  \
+  __m128i a_hi = _mm256_extractf128_si256(a, 1);  \
+  __m128i b_hi = _mm256_extractf128_si256(b, 1);  \
+  __m128i c_lo = op(a_lo, b_lo); \
+  __m128i c_hi = op(a_hi, b_hi); \
+  __m256i ret = _mm256_setr_m128i(c_lo, c_hi); \
+  return ret;
+
+// Non-AVX2 stand-in for _mm256_add_epi32, built from two 128-bit adds.
+VEC_INLINE inline __m256i _cm256_add_epi32(const __m256i &a, const __m256i &b) {
+  IVEC_EM_BIN(_mm_add_epi32)
+}
+
+// Non-AVX2 stand-in for _mm256_and_si256, built from two 128-bit ANDs.
+VEC_INLINE inline __m256i _cm256_and_si256(const __m256i &a, const __m256i &b) {
+  IVEC_EM_BIN(_mm_and_si128)
+}
+
+// Non-AVX2 stand-in for _mm256_andnot_si256 ((~a) & b), done per 128-bit half.
+VEC_INLINE inline __m256i _cm256_andnot_si256(const __m256i &a, 
+					      const __m256i &b) {
+  IVEC_EM_BIN(_mm_andnot_si128)
+}
+
+// Non-AVX2 stand-in for _mm256_cmpeq_epi32, done per 128-bit half.
+VEC_INLINE inline __m256i _cm256_cmpeq_epi32(const __m256i &a, 
+					     const __m256i &b) {
+  IVEC_EM_BIN(_mm_cmpeq_epi32)
+}
+
+// Non-AVX2 stand-in for _mm256_cmpgt_epi32 (signed a > b), per 128-bit half.
+VEC_INLINE inline __m256i _cm256_cmpgt_epi32(const __m256i &a, 
+					     const __m256i &b) {
+  IVEC_EM_BIN(_mm_cmpgt_epi32)
+}
+
+// Non-AVX2 stand-in for _mm256_cvtepu8_epi32: zero-extends the low eight
+// bytes of `a` to eight 32-bit integers, four at a time.
+VEC_INLINE inline __m256i _cm256_cvtepu8_epi32(const __m128i &a) {
+  // Bring dword 1 (bytes 4..7) of the input down into dword 0.
+  const __m128 shifted = _mm_permute_ps(_mm_castsi128_ps(a), 1);
+  const __m128i widened_lo = _mm_cvtepu8_epi32(a);
+  const __m128i widened_hi = _mm_cvtepu8_epi32(_mm_castps_si128(shifted));
+  return _mm256_setr_m128i(widened_lo, widened_hi);
+}
+
+// Function body for emulating a 256-bit integer operation lane by lane:
+// spills the operands `a` and `b` (names expected in the enclosing function)
+// to `buf_a` / `buf_b`, evaluates the expression `op` for each lane index
+// `i` in 0..7 and returns the reloaded result.
+#define IVEC_EM_SCAL(op)                       \
+  int buf_a[8] __attribute__((aligned(32)));   \
+  int buf_b[8] __attribute__((aligned(32)));   \
+  int dest[8] __attribute__((aligned(32)));    \
+  _mm256_store_si256((__m256i*)buf_a, a);      \
+  _mm256_store_si256((__m256i*)buf_b, b);      \
+  for (int i = 0; i < 8; i++) {		       \
+    dest[i] = op;			       \
+  }					       \
+  return _mm256_load_si256((__m256i*) dest);
+
+// Non-AVX2 stand-in for _mm256_permutevar8x32_epi32: lane i of the result is
+// a[b[i]]. As with the hardware instruction, only the low three bits of each
+// index are used, so an index can never read outside the 8-element buffer.
+VEC_INLINE inline __m256i _cm256_permutevar8x32_epi32(const __m256i &a,
+                                                      const __m256i &b) {
+  IVEC_EM_SCAL(buf_a[buf_b[i] & 7])
+}
+
+// Non-AVX2 stand-in for _mm256_mullo_epi32, done per 128-bit half.
+VEC_INLINE inline __m256i _cm256_mullo_epi32(__m256i a, __m256i b) {
+  IVEC_EM_BIN(_mm_mullo_epi32)
+}
+
+// Non-AVX2 stand-in for _mm256_srlv_epi32: per-lane logical right shift of
+// `a` by the count in `b`. The shift is done on the unsigned value so zeros
+// are shifted in (a signed `int` shift would replicate the sign bit), and
+// counts of 32 or more yield 0 as the instruction does, not undefined
+// behaviour.
+VEC_INLINE inline __m256i _cm256_srlv_epi32(__m256i a, __m256i b) {
+  IVEC_EM_SCAL(((unsigned int) buf_b[i] < 32u)
+               ? (int) ((unsigned int) buf_a[i] >> buf_b[i]) : 0)
+}
+
+
+// Float version of the emulated lane permute: reinterprets `a` as integers,
+// permutes with _cm256_permutevar8x32_epi32 and casts back.
+VEC_INLINE inline __m256 _cm256_permutevar8x32_ps(const __m256 &a, 
+						  const __m256i &b) {
+  return _mm256_castsi256_ps(_cm256_permutevar8x32_epi32(_mm256_castps_si256(a),
+							 b));
+}
+
+// Non-AVX2 stand-in for _mm_maskload_epi32, implemented with the AVX float
+// mask-load on the same memory and a bit cast of the result.
+VEC_INLINE inline __m128i _cm_maskload_epi32(int const * mem, __m128i mask) {
+  return _mm_castps_si128(_mm_maskload_ps((float const *) mem, mask));
+}
+
+// Non-AVX2 stand-in for _mm256_maskload_epi32: two 128-bit masked loads, one
+// for mem[0..3] with the low mask half and one for mem[4..7] with the high.
+VEC_INLINE inline __m256i _cm256_maskload_epi32(int const * mem, __m256i mask) {
+  const __m128i mask_lower = _mm256_castsi256_si128(mask);
+  const __m128i mask_upper = _mm256_extractf128_si256(mask, 1);
+  const __m128i lower = _cm_maskload_epi32(mem, mask_lower);
+  const __m128i upper = _cm_maskload_epi32(mem + 4, mask_upper);
+  return _mm256_setr_m128i(lower, upper);
+}
+
+
+// Non-AVX2 stand-in for _mm256_mask_i32gather_epi32. Lanes whose mask word
+// is non-zero are loaded from base_addr[index[lane]]; the others keep the
+// value from `src`. `scale` must equal sizeof(int), since indexing is done
+// in units of int.
+VEC_INLINE inline __m256i _cm256_mask_i32gather_epi32(__m256i src,
+                                                      int const * base_addr,
+                                                      __m256i index,
+                                                      __m256i mask,
+                                                      const int scale) {
+  assert(scale == sizeof(int));
+  int lane_index[8] __attribute__((aligned(32)));
+  int lane_mask[8] __attribute__((aligned(32)));
+  int result[8] __attribute__((aligned(32)));
+  // Start from `src` so unselected lanes pass through unchanged.
+  _mm256_store_si256((__m256i*)result, src);
+  _mm256_store_si256((__m256i*)lane_index, index);
+  _mm256_store_si256((__m256i*)lane_mask, mask);
+  for (int lane = 0; lane < 8; ++lane) {
+    if (!lane_mask[lane]) continue;
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_si256((__m256i*) result);
+}
+
+// Float version of the emulated masked gather: reinterprets source, memory
+// and mask as 32-bit integers and reuses _cm256_mask_i32gather_epi32.
+VEC_INLINE inline __m256 _cm256_mask_i32gather_ps(__m256 src, 
+						  float const * base_addr, 
+						  __m256i index, __m256 mask, 
+						  const int scale) {
+  return _mm256_castsi256_ps(_cm256_mask_i32gather_epi32(
+    _mm256_castps_si256(src), (const int *) base_addr, index,
+    _mm256_castps_si256(mask), scale));
+}
+
+// Non-AVX2 stand-in for _mm256_mask_i32gather_pd. A lane is loaded from
+// base_addr[index[lane]] when the low 32-bit word of its 64-bit mask element
+// is non-zero; otherwise it keeps the value from `src`. `scale` must equal
+// sizeof(double).
+VEC_INLINE inline __m256d _cm256_mask_i32gather_pd(__m256d src,
+                                                   double const * base_addr,
+                                                   __m128i index, __m256d mask,
+                                                   const int scale) {
+  assert(scale == sizeof(double));
+  int lane_index[4] __attribute__((aligned(32)));
+  int mask_words[8] __attribute__((aligned(32)));
+  double result[4] __attribute__((aligned(32)));
+  _mm256_store_pd(result, src);
+  _mm_store_si128((__m128i*)lane_index, index);
+  _mm256_store_si256((__m256i*)mask_words, _mm256_castpd_si256(mask));
+  for (int lane = 0; lane < 4; ++lane) {
+    // Each double-wide mask element spans two ints; test the first of them.
+    if (!mask_words[2*lane]) continue;
+    result[lane] = base_addr[lane_index[lane]];
+  }
+  return _mm256_load_pd(result);
+}
+
+// Non-AVX2 stand-in for _mm256_i32gather_epi32: result lane i is
+// base_addr[index[i]]. `scale` must equal sizeof(int).
+VEC_INLINE inline __m256i _cm256_i32gather_epi32(int const * base_addr,
+                                                 __m256i index,
+                                                 const int scale) {
+  assert(scale == sizeof(int));
+  int lane_index[8] __attribute__((aligned(32)));
+  int result[8] __attribute__((aligned(32)));
+  _mm256_store_si256((__m256i*)lane_index, index);
+  for (int lane = 0; lane < 8; ++lane) {
+    const int offset = lane_index[lane];
+    result[lane] = base_addr[offset];
+  }
+  return _mm256_load_si256((__m256i*) result);
+}
+
+// Float version of the emulated gather: treats the memory as 32-bit
+// integers, gathers with _cm256_i32gather_epi32 and casts the bits back.
+VEC_INLINE inline __m256 _cm256_i32gather_ps(float const * base_addr,
+					     __m256i index, const int scale) {
+  return _mm256_castsi256_ps(_cm256_i32gather_epi32((const int *) base_addr,
+						    index, scale));
+}
+
+// Non-AVX2 stand-in for _mm256_i32gather_pd: result lane i is
+// base_addr[index[i]] for the four 32-bit indices in `index`. `scale` must
+// equal sizeof(double).
+VEC_INLINE inline __m256d _cm256_i32gather_pd(double const * base_addr,
+                                              __m128i index, const int scale) {
+  assert(scale == sizeof(double));
+  int lane_index[4] __attribute__((aligned(32)));
+  double result[4] __attribute__((aligned(32)));
+  _mm_store_si128((__m128i*)lane_index, index);
+  for (int lane = 0; lane < 4; ++lane) {
+    const int offset = lane_index[lane];
+    result[lane] = base_addr[offset];
+  }
+  return _mm256_load_pd(result);
+}
+
+// Software parallel-bit-deposit (stand-in for _pdep_u64): the low bits of
+// `tmp` are placed, in order, at the positions of the set bits of `mask`.
+// All other result bits are zero.
+VEC_INLINE inline uint64_t _cdep_u64(uint64_t tmp, uint64_t mask) {
+  uint64_t result = 0;
+  unsigned int next_src = 0;  // index of the next unconsumed bit of tmp
+  for (unsigned int pos = 0; pos < 64; ++pos) {
+    const uint64_t pos_bit = static_cast<uint64_t>(1) << pos;
+    if ((mask & pos_bit) == 0) continue;
+    if ((tmp >> next_src) & 1) result |= pos_bit;
+    ++next_src;
+  }
+  return result;
+}
+
+// Software parallel-bit-extract (stand-in for _pext_u64): the bits of `tmp`
+// at the positions of the set bits of `mask` are packed, in order, into the
+// low bits of the result. All other result bits are zero.
+VEC_INLINE inline uint64_t _cext_u64(uint64_t tmp, uint64_t mask) {
+  uint64_t result = 0;
+  unsigned int next_dst = 0;  // index of the next result bit to fill
+  for (unsigned int pos = 0; pos < 64; ++pos) {
+    const uint64_t pos_bit = static_cast<uint64_t>(1) << pos;
+    if ((mask & pos_bit) == 0) continue;
+    if (tmp & pos_bit) result |= static_cast<uint64_t>(1) << next_dst;
+    ++next_dst;
+  }
+  return result;
+}
+
+#define _mm256_add_epi32 _cm256_add_epi32
+#define _mm256_and_si256 _cm256_and_si256
+#define _mm256_andnot_si256 _cm256_andnot_si256
+#define _mm256_cmpeq_epi32 _cm256_cmpeq_epi32
+#define _mm256_cmpgt_epi32 _cm256_cmpgt_epi32
+#define _mm256_permutevar8x32_epi32 _cm256_permutevar8x32_epi32
+#define _mm256_permutevar8x32_ps _cm256_permutevar8x32_ps
+#define _mm_maskload_epi32 _cm_maskload_epi32
+#define _mm256_maskload_epi32 _cm256_maskload_epi32
+#define _mm256_mullo_epi32 _cm256_mullo_epi32
+#define _mm256_srlv_epi32 _cm256_srlv_epi32
+#define _mm256_mask_i32gather_epi32 _cm256_mask_i32gather_epi32
+#define _mm256_mask_i32gather_pd _cm256_mask_i32gather_pd
+#define _mm256_mask_i32gather_ps _cm256_mask_i32gather_ps
+#define _mm256_i32gather_epi32 _cm256_i32gather_epi32
+#define _mm256_i32gather_pd _cm256_i32gather_pd
+#define _mm256_i32gather_ps _cm256_i32gather_ps
+#define _pdep_u64 _cdep_u64
+#define _pext_u64 _cext_u64
+#define _mm256_cvtepu8_epi32 _cm256_cvtepu8_epi32
+
+#endif
+
+#ifndef FVEC_FIRST_PASS
+
+// AVX-512 style "compress" for eight packed floats: the lanes of `a` whose
+// mask lane is set are packed, in order, into the low lanes of the result.
+// The contents of the remaining high lanes are not meaningful to callers.
+VEC_INLINE inline __m256 _mm256_compress_ps(__m256 mask, __m256 a) {
+# ifdef __AVX2__
+  uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask),
+                                     0x0101010101010101);
+  // unpack each bit to a byte
+  expanded_mask *= 0xFF;   // mask |= mask<<1 | mask<<2 | ... | mask<<7;
+  // the identity shuffle for vpermps, packed to one index per byte
+  const uint64_t identity_indices = 0x0706050403020100;
+  uint64_t wanted_indices = _pext_u64(identity_indices, expanded_mask);
+
+  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
+  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
+
+  return _mm256_permutevar8x32_ps(a, shufmask);
+# else
+  int mask_buf[8] __attribute__((aligned(32)));
+  float a_buf[8] __attribute__((aligned(32)));
+  // Zero-initialised so the unused tail lanes are never read uninitialised.
+  float dst_buf[8] __attribute__((aligned(32))) = {0};
+  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
+  _mm256_store_ps(a_buf, a);
+  int k = 0;
+  for (int i = 0; i < 8; i++) {
+    // Test the stored integer mask word. Subscripting the __m256 directly
+    // is a compiler extension and would test a float (an all-ones mask
+    // lane is a NaN bit pattern) instead of the mask bits.
+    if (mask_buf[i]) {
+      dst_buf[k++] = a_buf[i];
+    }
+  }
+  return _mm256_load_ps(dst_buf);
+# endif
+}
+// AVX-512 style "expand" for eight packed floats: consecutive low lanes of
+// `a` are distributed, in order, to the lanes whose mask lane is set.
+// Callers are expected to blend the unselected lanes themselves.
+VEC_INLINE inline __m256 _mm256_expand_ps(__m256 mask, __m256 a) {
+# ifdef __AVX2__
+  uint64_t expanded_mask = _pdep_u64(_mm256_movemask_ps(mask),
+                                     0x0101010101010101);
+  // unpack each mask bit to a full byte
+  expanded_mask *= 0xFF;
+  // identity shuffle, one index per byte, deposited at the selected lanes
+  const uint64_t identity_indices = 0x0706050403020100;
+  uint64_t wanted_indices = _pdep_u64(identity_indices, expanded_mask);
+  __m128i bytevec = _mm_cvtsi64_si128(wanted_indices);
+  __m256i shufmask = _mm256_cvtepu8_epi32(bytevec);
+  return _mm256_permutevar8x32_ps(a, shufmask);
+# else
+  int mask_buf[8] __attribute__((aligned(32)));
+  float a_buf[8] __attribute__((aligned(32)));
+  float dst_buf[8] __attribute__((aligned(32))) = {0};
+  _mm256_store_si256((__m256i*) mask_buf, _mm256_castps_si256(mask));
+  _mm256_store_ps(a_buf, a);
+  int k = 0;
+  for (int i = 0; i < 8; i++) {
+    // Test the stored integer mask word. Subscripting the __m256 directly
+    // is a compiler extension and would test a float (an all-ones mask
+    // lane is a NaN bit pattern) instead of the mask bits.
+    if (mask_buf[i]) {
+      dst_buf[i] = a_buf[k++];
+    }
+  }
+  return _mm256_load_ps(dst_buf);
+# endif
+}
+
+// Double-precision compress, implemented by reinterpreting mask and data as
+// eight floats and calling _mm256_compress_ps.
+VEC_INLINE inline __m256d _mm256_compress_pd(__m256d mask, __m256d a) {
+  return _mm256_castps_pd(_mm256_compress_ps(_mm256_castpd_ps(mask), 
+					     _mm256_castpd_ps(a)));
+}
+// Double-precision expand, implemented by reinterpreting mask and data as
+// eight floats and calling _mm256_expand_ps.
+VEC_INLINE inline __m256d _mm256_expand_pd(__m256d mask, __m256d a) {
+  return _mm256_castps_pd(_mm256_expand_ps(_mm256_castpd_ps(mask), 
+                                           _mm256_castpd_ps(a)));
+}
+#endif
+
+
+class FVEC_NAME;
+class IVEC_NAME;
+class AVEC_NAME;
+// Lane mask for the 256-bit back end. AVX has no mask registers, so the mask
+// is kept as a full-width floating-point vector (FVEC_MASK_T) in which the
+// lookup tables and integer compares below set a lane to all-ones or
+// all-zeros.
+class BVEC_NAME {
+  friend class FVEC_NAME;
+  friend class IVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==8
+  friend class avec8pd;
+# endif
+  FVEC_MASK_T val_;
+  VEC_INLINE BVEC_NAME(const FVEC_MASK_T &v) : val_(v) {}
+  VEC_INLINE BVEC_NAME(const __m256i &v)
+    : val_(FVEC_SUFFIX(_mm256_castsi256_)(v)) {}
+public:
+  VEC_INLINE BVEC_NAME() {}
+  // a & b
+  VEC_INLINE static BVEC_NAME kand(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_and_)(a.val_, b.val_);
+  }
+  // (~a) & b
+  VEC_INLINE static BVEC_NAME kandn(const BVEC_NAME &a, const BVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_andnot_)(a.val_, b.val_);
+  }
+  // Packs the lanes of `a` selected by `mask` into the low lanes.
+  VEC_INLINE static BVEC_NAME masku_compress(const BVEC_NAME &mask,
+                                             const BVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, a.val_);
+  }
+  // Expands the low lanes of `a` into the lanes selected by `mask`; all
+  // other lanes are taken from `src`.
+  VEC_INLINE static BVEC_NAME mask_expand(const BVEC_NAME &src,
+                                          const BVEC_NAME &mask,
+                                          const BVEC_NAME &a) {
+    FVEC_MASK_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, a.val_);
+    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
+    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
+                                  (mask.val_, src.val_));
+    return ret;
+  }
+  // All lanes set (every bit one).
+  VEC_INLINE static BVEC_NAME full() {
+    __m256i a = _mm256_undefined_si256();
+    return FVEC_SUFFIX(_mm256_castsi256_)(_mm256_cmpeq_epi32(a, a));
+  }
+  // No lanes set.
+  VEC_INLINE static BVEC_NAME empty() {
+    return FVEC_SUFFIX(_mm256_setzero_)();
+  }
+  // Mask with the first `n` lanes set; requires 0 <= n <= FVEC_LEN.
+  // The tables are read with an aligned 256-bit load, so they are forced
+  // onto a 32-byte boundary; each row is exactly 32 bytes, which keeps
+  // every row aligned as well.
+  VEC_INLINE static BVEC_NAME only(int n) {
+    assert(n >= 0 && n <= FVEC_LEN);
+    static const unsigned int FULL_ps = (unsigned int) -1;
+    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
+      {0, 0, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, 0, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, 0, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, 0},
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+    };
+    static const unsigned long long FULL_pd = (unsigned long long) -1;
+    static const unsigned long long LUT_pd[5][4]
+      __attribute__((aligned(32))) = {
+      {0, 0, 0, 0},
+      {FULL_pd, 0, 0, 0},
+      {FULL_pd, FULL_pd, 0, 0},
+      {FULL_pd, FULL_pd, FULL_pd, 0},
+      {FULL_pd, FULL_pd, FULL_pd, FULL_pd},
+    };
+    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]);
+  }
+  // Mask with the first `n` lanes clear and the rest set; requires
+  // 0 <= n <= FVEC_LEN. Tables are 32-byte aligned for the aligned load.
+  VEC_INLINE static BVEC_NAME after(int n) {
+    assert(n >= 0 && n <= FVEC_LEN);
+    static const unsigned int FULL_ps = (unsigned int) -1;
+    static const unsigned int LUT_ps[9][8] __attribute__((aligned(32))) = {
+      {FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, FULL_ps, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, 0, FULL_ps, FULL_ps},
+      {0, 0, 0, 0, 0, 0, 0, FULL_ps},
+      {0, 0, 0, 0, 0, 0, 0, 0},
+    };
+    static const unsigned long long FULL_pd = (unsigned long long) -1;
+    static const unsigned long long LUT_pd[5][4]
+      __attribute__((aligned(32))) = {
+      {FULL_pd, FULL_pd, FULL_pd, FULL_pd},
+      {0, FULL_pd, FULL_pd, FULL_pd},
+      {0, 0, FULL_pd, FULL_pd},
+      {0, 0, 0, FULL_pd},
+      {0, 0, 0, 0},
+    };
+    return FVEC_SUFFIX(_mm256_load_)((const FVEC_SCAL_T*) FVEC_SUFFIX(LUT_)[n]);
+  }
+  // Mask with `only_` lanes set starting at lane `after_`; requires
+  // after_ + only_ <= FVEC_LEN.
+  VEC_INLINE static BVEC_NAME onlyafter(int only_, int after_) {
+    return kand(after(after_), only(after_ + only_));
+  }
+  // Number of set lanes.
+  VEC_INLINE static int popcnt(const BVEC_NAME &a) {
+    return _popcnt32(FVEC_SUFFIX(_mm256_movemask_)(a.val_));
+  }
+  VEC_INLINE static bool test_all_unset(const BVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_testz_)(a.val_, a.val_);
+  }
+  VEC_INLINE static bool test_any_set(const BVEC_NAME &a) {
+    return ! test_all_unset(a);
+  }
+  // True when lane `i` is set (the movemask bit for lane `i`).
+  VEC_INLINE static bool test_at(const BVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);
+    return FVEC_SUFFIX(_mm256_movemask_)(a.val_) & (1 << i);
+  }
+  VEC_INLINE BVEC_NAME operator &(const BVEC_NAME &b) const {
+    return FVEC_SUFFIX(_mm256_and_)(val_, b.val_);
+  }
+  VEC_INLINE BVEC_NAME operator |(const BVEC_NAME &b) const {
+    return FVEC_SUFFIX(_mm256_or_)(val_, b.val_);
+  }
+  // Complement, computed as (~val_) & full().
+  VEC_INLINE BVEC_NAME operator ~() const {
+    return FVEC_SUFFIX(_mm256_andnot_)(val_, full().val_);
+  }
+};
+
+class IVEC_NAME {
+  friend class FVEC_NAME;
+  friend class AVEC_NAME;
+# if FVEC_LEN==8
+  friend class avec8pd;
+# endif
+  __m256i val_;
+  // Private: wraps a raw 256-bit integer register.
+  VEC_INLINE IVEC_NAME(const __m256i &v) : val_(v) {}
+  // Bit cast from the floating-point register type to __m256i (no value
+  // conversion).
+  VEC_INLINE static __m256i to(const FVEC_VEC_T &a) {
+#   if FVEC_LEN==4
+    return _mm256_castpd_si256(a);
+#   else
+    return _mm256_castps_si256(a);
+#   endif
+  }
+  // Bit cast from __m256i to the floating-point register type (inverse of
+  // to()).
+  VEC_INLINE static FVEC_VEC_T from(const __m256i &a) {
+    return FVEC_SUFFIX(_mm256_castsi256_)(a);
+  }
+public:
+  static const int VL = 8;
+  VEC_INLINE IVEC_NAME() {}
+
+  // Generates a static integer comparison `the_name(a, b)` returning a
+  // BVEC_NAME, plus `mask_<name>(mask, a, b)` which ANDs the comparison
+  // result with `mask`.
+  #define IVEC_MASK_BINFN_B(the_name)                                \
+    VEC_INLINE static BVEC_NAME the_name(const IVEC_NAME &a,         \
+                                         const IVEC_NAME &b) {	     \
+      return _mm256_##the_name##_epi32(a.val_, b.val_);              \
+    }                                                                \
+    VEC_INLINE static BVEC_NAME mask_##the_name(                     \
+        const BVEC_NAME &mask,                                       \
+        const IVEC_NAME &a, const IVEC_NAME &b                       \
+    ) {                                                              \
+      BVEC_NAME ret = _mm256_##the_name##_epi32(                     \
+        a.val_, b.val_);                                             \
+      return mask & ret;                                             \
+    }
+  IVEC_MASK_BINFN_B(cmpeq)
+  IVEC_MASK_BINFN_B(cmpgt)
+
+  // Signed per-lane a < b. AVX2 only provides a greater-than compare, and
+  // a < b is exactly b > a, so swapping the operands is sufficient; no
+  // separate equality mask is needed because the comparison is already
+  // strict.
+  VEC_INLINE static __m256i _mm256_cmplt_epi32(__m256i a, __m256i b) {
+    return _mm256_cmpgt_epi32(b, a);
+  }
+
+  // Per-lane a != b: the complement of the equality mask.
+  VEC_INLINE static __m256i _mm256_cmpneq_epi32(__m256i a, __m256i b) {
+    __m256i eq = _mm256_cmpeq_epi32(a, b);
+    // Comparing any register with itself yields all-ones, whatever it holds.
+    __m256i t = _mm256_undefined_si256();
+    __m256i f = _mm256_cmpeq_epi32(t, t);
+    // (~eq) & all-ones
+    return _mm256_andnot_si256(eq, f);
+  }
+
+  IVEC_MASK_BINFN_B(cmplt)
+  IVEC_MASK_BINFN_B(cmpneq)
+  #undef IVEC_MASK_BINFN_B
+
+  // Per-lane select via the floating-point blendv: lanes where `mask` is
+  // set come from `b`, the others from `a`. The integer registers are bit
+  // cast to the floating-point type and back.
+  VEC_INLINE static IVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const IVEC_NAME &a, const IVEC_NAME &b
+  ) {
+    return to(FVEC_SUFFIX(_mm256_blendv_)(from(a.val_), from(b.val_), 
+              mask.val_));
+  }
+  // Generates `mask_<name>(src, mask, a, b)`: the integer operation is
+  // computed for all lanes, then blended with `src` so that only the lanes
+  // selected by `mask` take the new value.
+  #define IVEC_MASK_BINFN_I(the_name)                                \
+    VEC_INLINE static IVEC_NAME mask_##the_name(                     \
+        const IVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const IVEC_NAME &a, const IVEC_NAME &b                       \
+    ) {                                                              \
+      IVEC_NAME ret = _mm256_##the_name##_epi32(                     \
+						a.val_, b.val_);     \
+	return mask_blend(mask, src, ret);			     \
+    }
+  IVEC_MASK_BINFN_I(add)
+  #undef IVEC_MASK_BINFN_I
+
+  // Generates a static binary integer function `the_name(a, b)` forwarding
+  // to _mm256_<name>_epi32 (or its emulation when AVX2 is unavailable).
+  #define IVEC_BINFN_I(the_name)                                     \
+    VEC_INLINE static IVEC_NAME the_name(const IVEC_NAME &a,         \
+					 const IVEC_NAME &b) {	     \
+      return _mm256_##the_name##_epi32(a.val_, b.val_);              \
+    }
+  IVEC_BINFN_I(mullo)
+  IVEC_BINFN_I(srlv)
+  #undef IVEC_BINFN_I
+  // Bitwise AND of the two registers.
+  VEC_INLINE static IVEC_NAME the_and(const IVEC_NAME &a, const IVEC_NAME &b) {
+    return _mm256_and_si256(a.val_, b.val_);
+  }
+
+  // Packs the lanes of `b` selected by `mask` into the low lanes, using the
+  // floating-point compress on the bit-cast register.
+  VEC_INLINE static IVEC_NAME masku_compress(const BVEC_NAME &mask, 
+					     const IVEC_NAME &b) {
+    return to(FVEC_SUFFIX(_mm256_compress_)(mask.val_, from(b.val_)));
+  }
+  // Expands the low lanes of `b` into the lanes selected by `mask`; all
+  // other lanes are taken from `src`.
+  VEC_INLINE static IVEC_NAME mask_expand(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &b
+  ) {
+    FVEC_VEC_T ret = FVEC_SUFFIX(_mm256_expand_)(mask.val_, from(b.val_));
+    // Keep the expanded value only where the mask is set ...
+    ret = FVEC_SUFFIX(_mm256_and_)(mask.val_, ret);
+    // ... and fill the remaining lanes from src.
+    ret = FVEC_SUFFIX(_mm256_or_)(ret, FVEC_SUFFIX(_mm256_andnot_)
+				    (mask.val_, from(src.val_)));
+    return to(ret);
+  }
+
+  // Aligned store of the register to `dest`, which must be 32-byte aligned
+  // and have room for 8 ints. With FVEC_LEN==4 the even-numbered ints
+  // (0, 2, 4, 6) are then moved down into dest[0..3].
+  VEC_INLINE static void store(int * dest, const IVEC_NAME &src) {
+    _mm256_store_si256((__m256i*)dest, src.val_);
+#   if FVEC_LEN==4
+    dest[1] = dest[2];
+    dest[2] = dest[4];
+    dest[3] = dest[6];
+#   endif
+  }
+
+  VEC_INLINE static int at(const IVEC_NAME &a, int b) {
+    int lanes[8] __attribute__((aligned(32)));  // store() does an aligned write
+    store(lanes, a);   // spill every lane to the stack
+    return lanes[b];   // b is not range-checked here
+  }
+
+  // Debug helper: writes "str:" followed by the FVEC_LEN lane values and a
+  // newline to stdout.
+  VEC_INLINE static void print(const char * str, const IVEC_NAME &a) {
+    int lanes[8] __attribute__((aligned(32)));
+    store(lanes, a);
+    printf("%s:", str);
+    for (int lane = 0; lane < FVEC_LEN; ++lane) printf(" %d", lanes[lane]);
+    printf("\n");
+  }
+
+  VEC_INLINE static IVEC_NAME maskz_loadu(const BVEC_NAME &mask, // masked load:
+					  const int * src) {  // cleared lanes give 0
+    FVEC_VEC_T mask_val = mask.val_;
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0};  // low dword of each 64-bit mask lane
+    __m256 m = _mm256_castpd_ps(mask_val);  // view the mask as eight dwords
+    m = _mm256_permutevar8x32_ps(m, _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i ret = _mm_maskload_epi32(src, // loads at most four ints from src
+       _mm256_castsi256_si128(_mm256_castps_si256(m)));
+    static const unsigned int load_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 1, 1, 2, 2, 3, 3};  // copy each int into both dwords of its lane
+    return _mm256_permutevar8x32_epi32(_mm256_castsi128_si256(ret), 
+      _mm256_load_si256((__m256i*)load_shuffle));
+#    else
+    int dest[8] __attribute__((aligned(32))) = {0};  // masked-off lanes stay 0
+    int mask_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd((double*) mask_buf, mask.val_);
+    for (int i = 0; i < 4; i++) {
+      if (mask_buf[2*i]) {  // tests the low dword of 64-bit mask lane i
+        int val = src[i];
+        dest[2*i+0] = val;
+        dest[2*i+1] = val;
+      }
+    }
+    return _mm256_load_si256((__m256i*) dest);
+#    endif
+#   else
+    return _mm256_maskload_epi32(src, to(mask_val));  // one mask lane per int
+#   endif
+  }
+
+  // Masked int gather (scale must be sizeof(int)): set lanes load, rest keep src.
+  VEC_INLINE static IVEC_NAME mask_gather(
+      const IVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const int * mem, const int scale) {
+    assert(scale == sizeof(int));
+    return _mm256_mask_i32gather_epi32(src.val_, mem, idx.val_,
+                                       to(mask.val_), sizeof(int));
+  }
+
+  // Writes the lanes of src selected by mask contiguously to dest, in lane
+  // order; nothing is written for unselected lanes.
+  VEC_INLINE static void mask_compressstore(const BVEC_NAME &mask, int * dest,
+                                            const IVEC_NAME &src) {
+    int lanes[8] __attribute__((aligned(64)));
+    _mm256_store_si256((__m256i*)lanes, src.val_);
+    // With four logical lanes each value occupies two 32-bit slots.
+    const int step = FVEC_LEN==4 ? 2 : 1;
+    const int selected = FVEC_SUFFIX(_mm256_movemask_)(mask.val_);
+    #pragma unroll
+    for (int lane = 0; lane < FVEC_LEN; lane++)
+      if ((selected >> lane) & 1) *dest++ = lanes[step * lane];
+  }
+
+  VEC_INLINE static IVEC_NAME set1(int i) {
+    return _mm256_set1_epi32(i);  // broadcast i to all eight 32-bit slots
+  }
+  VEC_INLINE static IVEC_NAME setzero() {
+    return _mm256_setzero_si256();  // all bits cleared
+  }
+  VEC_INLINE static IVEC_NAME undefined() {
+    return _mm256_undefined_si256();  // contents unspecified; overwrite before use
+  }
+
+  VEC_INLINE IVEC_NAME operator +(const IVEC_NAME &b) const {
+    return _mm256_add_epi32(this->val_, b.val_);  // lane-wise 32-bit add
+  }
+};
+
+class FVEC_NAME {
+  friend class AVEC_NAME;
+#if FVEC_LEN==8
+  friend class avec8pd;
+#endif
+  FVEC_VEC_T val_;
+  VEC_INLINE FVEC_NAME(const FVEC_VEC_T &v) : val_(v) {}
+public:
+  static const int VL = FVEC_LEN;
+# if defined(__AVX2__) || defined(__MIC__) || defined(__AVX512F__)
+  VEC_INLINE static bool fast_compress() { return true; }  // build-time flag
+# else
+  VEC_INLINE static bool fast_compress() { return false; }  // plain AVX build
+# endif
+  VEC_INLINE FVEC_NAME() {}
+  VEC_INLINE static FVEC_SCAL_T at(const FVEC_NAME &a, int i) {
+    assert(i < FVEC_LEN);  // only the upper bound is checked
+    FVEC_SCAL_T lanes[FVEC_LEN] __attribute__((aligned(64)));
+    FVEC_SUFFIX(_mm256_store_)(lanes, a.val_);  // aligned spill of all lanes
+    return lanes[i];
+  }
+
+  // Generates a lane-wise comparison returning a BVEC_NAME, plus a mask_
+  // variant whose result is additionally ANDed with the given mask.
+  #define FVEC_MASK_BINFN_B(the_name, the_imm)                          \
+    VEC_INLINE static BVEC_NAME the_name(const FVEC_NAME &a,            \
+                                         const FVEC_NAME &b) {          \
+      return FVEC_SUFFIX(_mm256_cmp_)(a.val_, b.val_, the_imm);         \
+    }                                                                   \
+    VEC_INLINE static BVEC_NAME mask_##the_name(                        \
+        const BVEC_NAME &mask,                                          \
+        const FVEC_NAME &a, const FVEC_NAME &b                          \
+    ) {                                                                 \
+      return mask & the_name(a, b);                                     \
+    }
+  FVEC_MASK_BINFN_B(cmple, _CMP_LE_OS)    // a <= b; false if either is NaN
+  FVEC_MASK_BINFN_B(cmplt, _CMP_LT_OS)    // a <  b; false if either is NaN
+  FVEC_MASK_BINFN_B(cmpneq, _CMP_NEQ_UQ)  // a != b; true if either is NaN
+  FVEC_MASK_BINFN_B(cmpnle, _CMP_NLE_US)  // !(a <= b); true if either is NaN
+  FVEC_MASK_BINFN_B(cmpnlt, _CMP_NLT_US)  // !(a <  b); true if either is NaN
+  #undef FVEC_MASK_BINFN_B
+
+  // Full-precision reciprocal: computes 1.0 / a with a real division.
+  VEC_INLINE static __m256d _mm256_recip_pd(__m256d a) {
+    return _mm256_div_pd(_mm256_set1_pd(1), a);
+  }
+  VEC_INLINE static __m256 _mm256_recip_ps(__m256 a) {
+    return _mm256_rcp_ps(a);  // hardware approximation, not an exact 1.0f / a
+  }
+  // |a| per double lane: clears only the sign bit (bit 63) of every lane.
+  // The mask is materialized in a register; no aligned memory load is needed.
+  VEC_INLINE static __m256d _mm256_abs_pd(__m256d a) {
+    const __m256d abs_full =
+        _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL));
+    return _mm256_and_pd(abs_full, a);
+  }
+  // |a| per float lane: clears only the sign bit (bit 31) of every lane.
+  // The mask is replicated at 32-bit granularity so that all eight lanes are
+  // covered (a 64-bit 0x7FFFFFFF pattern would wipe every odd lane), and it is
+  // built in a register, so no aligned memory load is needed.
+  VEC_INLINE static __m256 _mm256_abs_ps(__m256 a) {
+    const __m256 abs_full = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+    return _mm256_and_ps(abs_full, a);
+  }
+
+  #define FVEC_UNFN_F(the_name)                                      \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a) {       \
+      return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_);              \
+    }
+  FVEC_UNFN_F(abs)      // forwards to the _mm256_abs_ helper of this class
+  FVEC_UNFN_F(exp)      // _mm256_exp_ is not defined in this part of the file
+  FVEC_UNFN_F(invsqrt)  // _mm256_invsqrt_ is not defined in this part either
+  FVEC_UNFN_F(recip)    // forwards to the _mm256_recip_ helper of this class
+  FVEC_UNFN_F(sqrt)     // lane-wise square root
+  #undef FVEC_UNFN_F
+
+  // Per-lane select: lanes set in mask come from b, the remaining lanes from a.
+  VEC_INLINE static FVEC_NAME mask_blend(
+      const BVEC_NAME &mask, const FVEC_NAME &a, const FVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_blendv_)(a.val_, b.val_, mask.val_);
+  }
+  // Generator for masked unary math functions: lanes selected by mask receive
+  // fn(a), the remaining lanes are copied from src.
+  #define FVEC_MASK_UNFN_F(the_name)                                 \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a) {                                        \
+      const FVEC_NAME full = FVEC_SUFFIX(_mm256_##the_name##_)(a.val_); \
+      return mask_blend(mask, src, full);                            \
+    }
+  FVEC_MASK_UNFN_F(cos)    // defines mask_cos
+  FVEC_MASK_UNFN_F(recip)  // defines mask_recip
+  FVEC_MASK_UNFN_F(sqrt)   // defines mask_sqrt
+  #undef FVEC_MASK_UNFN_F
+
+  // Generator for unmasked lane-wise binary floating-point functions.
+  #define FVEC_BINFN_F(the_name)                                     \
+    VEC_INLINE static FVEC_NAME the_name(const FVEC_NAME &a,         \
+                                         const FVEC_NAME &b)         \
+    { return FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_); }
+  FVEC_BINFN_F(max)  // lane-wise maximum
+  FVEC_BINFN_F(min)  // lane-wise minimum
+  #undef FVEC_BINFN_F
+
+  // Generator for masked arithmetic: lanes selected by mask receive a <op> b,
+  // the remaining lanes are copied from src (the op runs on every lane first).
+  #define FVEC_MASK_BINFN_F(the_name)                                \
+    VEC_INLINE static FVEC_NAME mask_##the_name(                     \
+        const FVEC_NAME &src, const BVEC_NAME &mask,                 \
+        const FVEC_NAME &a, const FVEC_NAME &b) {                    \
+      const FVEC_NAME full = FVEC_SUFFIX(_mm256_##the_name##_)(a.val_, b.val_); \
+      return mask_blend(mask, src, full);                            \
+    }
+  FVEC_MASK_BINFN_F(add)  // defines mask_add
+  FVEC_MASK_BINFN_F(div)  // defines mask_div
+  FVEC_MASK_BINFN_F(mul)  // defines mask_mul
+  FVEC_MASK_BINFN_F(sub)  // defines mask_sub
+  #undef FVEC_MASK_BINFN_F
+
+  // Runs b through the _mm256_expand_ helper, then merges bitwise with src:
+  // result = (mask & expanded) | (~mask & src).
+  VEC_INLINE static FVEC_NAME mask_expand(
+      const FVEC_NAME &src, const BVEC_NAME &mask, const FVEC_NAME &b) {
+    const FVEC_VEC_T spread = FVEC_SUFFIX(_mm256_expand_)(mask.val_, b.val_);
+    const FVEC_VEC_T picked = FVEC_SUFFIX(_mm256_and_)(mask.val_, spread);
+    const FVEC_VEC_T kept = FVEC_SUFFIX(_mm256_andnot_)(mask.val_, src.val_);
+    return FVEC_SUFFIX(_mm256_or_)(picked, kept);
+  }
+  // Thin wrapper around the _mm256_compress_ helper for this vector type.
+  VEC_INLINE static FVEC_NAME masku_compress(const BVEC_NAME &mask,
+                                             const FVEC_NAME &b) {
+    return FVEC_SUFFIX(_mm256_compress_)(mask.val_, b.val_);
+  }
+
+  VEC_INLINE static FVEC_NAME set1(const FVEC_SCAL_T &a) {
+    return FVEC_SUFFIX(_mm256_set1_)(a);  // broadcast a to every lane
+  }
+  VEC_INLINE static FVEC_NAME setzero() {
+    return FVEC_SUFFIX(_mm256_setzero_)();  // every lane is +0.0
+  }
+  VEC_INLINE static FVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm256_undefined_)();  // contents unspecified
+  }
+
+  VEC_INLINE static FVEC_NAME load(const FVEC_SCAL_T *mem) {
+    return FVEC_SUFFIX(_mm256_load_)(mem);  // aligned load: mem must be 32-byte aligned
+  }
+  VEC_INLINE static void store(FVEC_SCAL_T * dest, const FVEC_NAME &a) {
+    FVEC_SUFFIX(_mm256_store_)(dest, a.val_);  // aligned store: dest must be 32-byte aligned
+  }
+
+
+  VEC_INLINE static FVEC_NAME gather(const IVEC_NAME &idx, // lane l = mem[idx[l]]
+    const FVEC_SCAL_T * mem, const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T));  // only element-sized strides are supported
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0};  // even slots hold the four logical indices
+    __m256i m = _mm256_permutevar8x32_epi32(idx.val_, 
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i idx_short = _mm256_castsi256_si128(m);  // four packed 32-bit indices
+    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx_short, sizeof(FVEC_SCAL_T));
+#    else
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    double dest[4] __attribute__((aligned(32)));
+    for (int i = 0; i < 4; i++) {
+      dest[i] = mem[idx_buf[2*i]];  // scalar fallback, one element per lane
+    }
+    return _mm256_load_pd(dest);
+#    endif
+#   else
+    return FVEC_SUFFIX(_mm256_i32gather_)(mem, idx.val_, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+  VEC_INLINE static FVEC_NAME mask_gather(
+      const FVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale
+  ) {  // lanes set in mask load mem[idx[l]]; the other lanes keep src
+    assert(scale == sizeof(FVEC_SCAL_T));  // only element-sized strides are supported
+#   if FVEC_LEN==4
+#    ifdef __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 2, 4, 6, 0, 0, 0, 0};  // even slots hold the four logical indices
+    __m256i m = _mm256_permutevar8x32_epi32(idx.val_, 
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    __m128i idx_short = _mm256_castsi256_si128(m);  // four packed 32-bit indices
+    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx_short, 
+      mask.val_, sizeof(FVEC_SCAL_T));
+#    else
+    int idx_buf[8] __attribute__((aligned(32)));
+    int mask_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    _mm256_store_pd((double*) mask_buf, mask.val_);
+    double dest[4] __attribute__((aligned(32)));
+    _mm256_store_pd((double*) dest, src.val_);  // start from src, then overwrite
+    for (int i = 0; i < 4; i++) {
+      if (mask_buf[2*i])  // tests the low dword of 64-bit mask lane i
+        dest[i] = mem[idx_buf[2*i]];
+    }
+    return _mm256_load_pd(dest);
+#    endif
+#   else
+    return FVEC_SUFFIX(_mm256_mask_i32gather_)(src.val_, mem, idx.val_, 
+      mask.val_, sizeof(FVEC_SCAL_T));
+#   endif
+  }
+
+  VEC_INLINE static void gather_4_adjacent(const IVEC_NAME &idx, 
+      const FVEC_SCAL_T * mem, const int scale, FVEC_NAME * out_0, 
+      FVEC_NAME * out_1, FVEC_NAME * out_2, FVEC_NAME * out_3) {
+    assert(scale == sizeof(FVEC_SCAL_T));  // idx counts elements of mem
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+#   if FVEC_LEN==4
+    __m256d a0 = _mm256_load_pd(&mem[idx_buf[0]]);  // aligned load of 4 doubles
+    __m256d a1 = _mm256_load_pd(&mem[idx_buf[2]]);  // (even slots hold the
+    __m256d a2 = _mm256_load_pd(&mem[idx_buf[4]]);  //  four logical indices)
+    __m256d a3 = _mm256_load_pd(&mem[idx_buf[6]]);
+    __m256d b0 = _mm256_unpacklo_pd(a0, a1);  // 4x4 transpose, step 1
+    __m256d b1 = _mm256_unpackhi_pd(a0, a1);
+    __m256d b2 = _mm256_unpacklo_pd(a2, a3);
+    __m256d b3 = _mm256_unpackhi_pd(a2, a3);
+    *out_0 = _mm256_permute2f128_pd(b0, b2, 0x20);  // element 0 of every lane
+    *out_1 = _mm256_permute2f128_pd(b1, b3, 0x20);  // element 1 of every lane
+    *out_2 = _mm256_permute2f128_pd(b0, b2, 0x31);  // element 2 of every lane
+    *out_3 = _mm256_permute2f128_pd(b1, b3, 0x31);  // element 3 of every lane
+#   else
+    const float *e0 = &mem[idx_buf[0]];  // start of the 4 floats for lane 0
+    const float *e1 = &mem[idx_buf[1]];
+    const float *e2 = &mem[idx_buf[2]];
+    const float *e3 = &mem[idx_buf[3]];
+    const float *e4 = &mem[idx_buf[4]];
+    const float *e5 = &mem[idx_buf[5]];
+    const float *e6 = &mem[idx_buf[6]];
+    const float *e7 = &mem[idx_buf[7]];
+    __m256 a0 = _mm256_loadu2_m128(e4, e0);  // low half = lane 0, high = lane 4
+    __m256 a1 = _mm256_loadu2_m128(e5, e1);
+    __m256 b0 = _mm256_unpacklo_ps(a0, a1);
+    __m256 b1 = _mm256_unpackhi_ps(a0, a1);
+    __m256 a2 = _mm256_loadu2_m128(e6, e2);
+    __m256 a3 = _mm256_loadu2_m128(e7, e3);
+    __m256 b2 = _mm256_unpacklo_ps(a2, a3);
+    __m256 b3 = _mm256_unpackhi_ps(a2, a3);
+    *out_0 = _mm256_shuffle_ps(b0, b2, 0x44);  // element 0 of every lane
+    *out_1 = _mm256_shuffle_ps(b0, b2, 0xEE);  // element 1 of every lane
+    *out_2 = _mm256_shuffle_ps(b1, b3, 0x44);  // element 2 of every lane
+    *out_3 = _mm256_shuffle_ps(b1, b3, 0xEE);  // element 3 of every lane
+#   endif
+  }
+  // Same as gather_4_adjacent but only the first three outputs are returned;
+  // the fourth gathered vector is discarded.
+  VEC_INLINE static void gather_3_adjacent(
+      const IVEC_NAME &idx, const FVEC_SCAL_T * mem, const int scale,
+      FVEC_NAME * out_0, FVEC_NAME * out_1, FVEC_NAME * out_2) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    // Four scalars per lane are still read from memory.
+    FVEC_NAME unused_3;
+    gather_4_adjacent(idx, mem, scale, out_0, out_1, out_2, &unused_3);
+  }
+
+  VEC_INLINE static double _mm256_reduce_add_pd(__m256d a) {
+    const __m256d pairs = _mm256_hadd_pd(a, a);  // [a0+a1, a0+a1, a2+a3, a2+a3]
+    const __m128d upper = _mm256_extractf128_pd(pairs, 1);  // a2+a3
+    const __m128d lower = _mm256_castpd256_pd128(pairs);    // a0+a1
+    return _mm_cvtsd_f64(_mm_add_pd(upper, lower));  // lane 0 holds the total
+  }
+
+  VEC_INLINE static float _mm256_reduce_add_ps(__m256 a) {
+    const __m256 pairs = _mm256_hadd_ps(a, a);  // pair sums within each half
+    const __m128 upper = _mm256_extractf128_ps(pairs, 1);
+    const __m128 lower = _mm256_castps256_ps128(pairs);
+    const __m128 quads = _mm_add_ps(upper, lower);  // lanes 0/1: two partial sums
+    const __m128 flipped = _mm_permute_ps(quads, 0x1B); // 0x1B = reverse
+    return _mm_cvtss_f32(_mm_add_ps(quads, flipped));  // lane 0 = sum of all 8
+  }
+
+  VEC_INLINE static FVEC_SCAL_T reduce_add(const FVEC_NAME &a) {
+    return FVEC_SUFFIX(_mm256_reduce_add_)(a.val_);  // horizontal sum of all lanes
+  }
+  VEC_INLINE static FVEC_SCAL_T mask_reduce_add(
+      const BVEC_NAME &mask, const FVEC_NAME &a) {  // sums the lanes set in mask
+    return reduce_add(FVEC_SUFFIX(_mm256_and_)(mask.val_, a.val_));
+  }
+
+  VEC_INLINE static IVEC_NAME unpackloepi32(const FVEC_NAME &a) {
+#   if FVEC_LEN==4
+#    if __AVX2__
+    static const unsigned int mask_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 2, 2, 4, 4, 6, 6};  // copy the low dword of each double over the high one
+    __m256 m = _mm256_permutevar8x32_ps(_mm256_castpd_ps(a.val_),
+      _mm256_load_si256((__m256i*)mask_shuffle));
+    return _mm256_castps_si256(m);  // bit pattern reinterpreted, no conversion
+#    else
+    __m128i a_lo = _mm256_castsi256_si128(_mm256_castpd_si256(a.val_));
+    __m128i a_hi = _mm256_extractf128_si256(_mm256_castpd_si256(a.val_), 1);
+    __m128i c_lo = _mm_shuffle_epi32(a_lo, 0xA0); /*1010 0000*/
+    __m128i c_hi = _mm_shuffle_epi32(a_hi, 0xA0);  // same {0,0,2,2} pattern per half
+    __m256i ret = _mm256_setr_m128i(c_lo, c_hi);
+    return ret;
+#    endif
+#   else
+    return _mm256_castps_si256(a.val_);  // eight floats viewed as eight ints
+#   endif
+  }
+
+  // Returns sin(arg) blended over src_a; *cos gets cos(arg) blended over src_b.
+  VEC_INLINE static FVEC_NAME mask_sincos(
+      FVEC_NAME * cos, const FVEC_NAME &src_a, const FVEC_NAME &src_b,
+      const BVEC_NAME &mask, const FVEC_NAME &arg) {
+    FVEC_VEC_T cos_all, sin_all = FVEC_SUFFIX(_mm256_sincos_)(&cos_all, arg.val_);
+    *cos = mask_blend(mask, src_b, cos_all);
+    return mask_blend(mask, src_a, sin_all);
+  }
+
+  // Lane-wise arithmetic operators between two vectors of this type.
+  #define FVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline FVEC_NAME operator the_sym(const FVEC_NAME &b) const   \
+    { return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_); }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+  #undef FVEC_BINOP
+
+  VEC_INLINE static void gather_prefetch0(const IVEC_NAME &a, void * mem) {
+    /* NOP: this backend issues no prefetch; both arguments are ignored */
+  }
+};
+
+// Accumulator counterpart of FVEC_NAME with the same lane type and count.
+class AVEC_NAME {
+  friend class avec8pd;
+  FVEC_VEC_T val_;
+  VEC_INLINE AVEC_NAME(const FVEC_VEC_T &a) : val_(a) {}
+public:
+  VEC_INLINE AVEC_NAME(const FVEC_NAME &a) : val_(a.val_) {}
+  VEC_INLINE static AVEC_NAME undefined() {
+    return FVEC_SUFFIX(_mm256_undefined_)();  // contents unspecified
+  }
+  // Masked gather; forwards to FVEC_NAME::mask_gather after checking scale.
+  VEC_INLINE static AVEC_NAME mask_gather(
+      const AVEC_NAME &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const FVEC_SCAL_T * mem, const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    return FVEC_NAME::mask_gather(src.val_, mask, idx, mem, scale);
+  }
+  // Lane-by-lane scatter: mem[idx[lane]] = a[lane] for the lanes set in mask.
+  VEC_INLINE static void mask_i32loscatter(
+      FVEC_SCAL_T * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const AVEC_NAME &a, const int scale) {
+    assert(scale == sizeof(FVEC_SCAL_T));
+    for (int lane = 0; lane < FVEC_NAME::VL; ++lane) {
+      if (!BVEC_NAME::test_at(mask, lane)) continue;
+      mem[IVEC_NAME::at(idx, lane)] = FVEC_NAME::at(a.val_, lane);
+    }
+  }
+  #define AVEC_BINOP(the_sym, the_name)                                      \
+    VEC_INLINE inline AVEC_NAME operator the_sym(const AVEC_NAME &b) const { \
+    return FVEC_SUFFIX(_mm256_##the_name##_)(this->val_, b.val_);            \
+  }
+  AVEC_BINOP(-, sub)  // lane-wise subtraction
+  #undef AVEC_BINOP
+};
+
+#if FVEC_LEN==8
+// Double-precision accumulator for the eight float lanes of FVEC_NAME, held
+// as two __m256d halves: lo_ carries lanes 0-3 and hi_ carries lanes 4-7.
+class avec8pd {
+  __m256d lo_, hi_;
+  VEC_INLINE avec8pd(const __m256d &lo, const __m256d &hi) : lo_(lo), hi_(hi) {}
+  VEC_INLINE static __m128 get_ps_hi(__m256 a) {
+    return _mm256_extractf128_ps(a, 1);  // upper four floats
+  }
+  VEC_INLINE static __m128 get_ps_lo(__m256 a) {
+    return _mm256_castps256_ps128(a);  // lower four floats
+  }
+  VEC_INLINE static __m128i get_si_hi(__m256i a) {
+    return _mm_castps_si128(get_ps_hi(_mm256_castsi256_ps(a)));
+  }
+  VEC_INLINE static __m128i get_si_lo(__m256i a) {
+    return _mm_castps_si128(get_ps_lo(_mm256_castsi256_ps(a)));
+  }
+public:
+  VEC_INLINE avec8pd(const FVEC_NAME &a) {  // widens each float to double
+    lo_ = _mm256_cvtps_pd(get_ps_lo(a.val_));
+    hi_ = _mm256_cvtps_pd(get_ps_hi(a.val_));
+  }
+  VEC_INLINE static avec8pd undefined() {
+    return avec8pd(_mm256_undefined_pd(), _mm256_undefined_pd());
+  }
+  // Masked gather of doubles: lanes set in mask load mem[idx[lane]], the
+  // remaining lanes keep the value from src.
+  VEC_INLINE static avec8pd mask_gather(
+      const avec8pd &src, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const double * mem, const int scale) {
+    assert(scale == sizeof(double));  // checked for both code paths
+#   ifndef __AVX2__
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*) idx_buf, idx.val_);
+    int mask_val = _mm256_movemask_ps(mask.val_);
+    double ret_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd(&ret_buf[0], src.lo_);
+    _mm256_store_pd(&ret_buf[4], src.hi_);
+    for (int i = 0; i < 8; i++) {
+      if (mask_val & (1 << i)) ret_buf[i] = mem[idx_buf[i]];
+    }
+    __m256d lo = _mm256_load_pd(&ret_buf[0]);
+    __m256d hi = _mm256_load_pd(&ret_buf[4]);
+#   else
+    // Duplicate each 32-bit mask lane so it covers a whole 64-bit lane.
+    static const unsigned int lo_shuffle[8] __attribute__((aligned(32))) =
+      {0, 0, 1, 1, 2, 2, 3, 3};
+    static const unsigned int hi_shuffle[8] __attribute__((aligned(32))) =
+      {4, 4, 5, 5, 6, 6, 7, 7};
+    __m256d lo_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_,
+      _mm256_load_si256((__m256i*) lo_shuffle)));
+    __m256d hi_mask = _mm256_castps_pd(_mm256_permutevar8x32_ps(mask.val_,
+      _mm256_load_si256((__m256i*) hi_shuffle)));
+    __m256d lo = _mm256_mask_i32gather_pd(src.lo_, mem, get_si_lo(idx.val_),
+                                          lo_mask, sizeof(double));
+    __m256d hi = _mm256_mask_i32gather_pd(src.hi_, mem, get_si_hi(idx.val_),
+                                          hi_mask, sizeof(double));
+#   endif
+    return avec8pd(lo, hi);
+  }
+  VEC_INLINE static void mask_i32loscatter(
+      double * mem, const BVEC_NAME &mask, const IVEC_NAME &idx,
+      const avec8pd &a, const int scale) {
+    assert(scale == sizeof(double));
+    double a_buf[8] __attribute__((aligned(32)));
+    _mm256_store_pd(a_buf, a.lo_);
+    _mm256_store_pd(&a_buf[4], a.hi_);
+    int idx_buf[8] __attribute__((aligned(32)));
+    _mm256_store_si256((__m256i*)idx_buf, idx.val_);
+    int mask_val = _mm256_movemask_ps(mask.val_);
+    for (int i = 0; i < 8; i++) {  // scalar scatter of the lanes set in mask
+      if (mask_val & (1 << i)) mem[idx_buf[i]] = a_buf[i];
+    }
+  }
+  #define AVEC2_BINOP(the_sym, the_name)                                    \
+    VEC_INLINE inline avec8pd operator the_sym(const avec8pd &b) const {    \
+    __m256d lo = _mm256_##the_name##_pd(this->lo_, b.lo_);                  \
+    __m256d hi = _mm256_##the_name##_pd(this->hi_, b.hi_);                  \
+    return avec8pd(lo, hi);                                                 \
+  }
+  AVEC2_BINOP(-, sub)
+  #undef AVEC2_BINOP
+};
+#endif
+
+}
+
+
+#ifdef FVEC_FIRST_PASS
+
+template<typename flt_t, typename acc_t>
+struct intr_types;  // primary template: declared only, see the specializations
+
+template<>
+struct intr_types<double,double> {  // flt_t = double, acc_t = double
+  typedef mm256::fvec4pd fvec;
+  typedef mm256::ivec4 ivec;
+  typedef mm256::bvec4 bvec;
+  typedef mm256::avec4pd avec;
+};
+
+template<>
+struct intr_types<float,float> {  // flt_t = float, acc_t = float
+  typedef mm256::fvec8ps fvec;
+  typedef mm256::ivec8 ivec;
+  typedef mm256::bvec8 bvec;
+  typedef mm256::avec8ps avec;
+};
+
+template<>
+struct intr_types<float,double> {  // flt_t = float, acc_t = double
+  typedef mm256::fvec8ps fvec;
+  typedef mm256::ivec8 ivec;
+  typedef mm256::bvec8 bvec;
+  typedef mm256::avec8pd avec;  // the two-register double accumulator class
+};
+
+#endif
+
+#ifndef FVEC_FIRST_PASS
+#  define FVEC_FIRST_PASS
+#  include "intel_intrinsics_airebo.h"
+#endif
+
+#endif
+
+#ifdef LMP_INTEL_AIREBO_SCALAR
+
+#include <cassert>
+#include <cmath>
+#include <immintrin.h>
+
+#define VEC_INLINE __attribute__((always_inline))
+
+template<typename flt_t, typename acc_t>
+struct intr_types {
+
+class fvec;
+class ivec;
+class avec;
+class bvec {  // one-lane mask used by the scalar fallback
+  friend class fvec;
+  friend class ivec;
+  friend class avec;
+  bool val_;  // state of the single lane
+  VEC_INLINE bvec(const bool &v) : val_(v) {}
+public:
+  VEC_INLINE bvec() {}  // leaves val_ uninitialized
+  VEC_INLINE static bvec kand(const bvec &a, const bvec &b) {
+    return a.val_ && b.val_;  // a AND b
+  }
+  VEC_INLINE static bvec kandn(const bvec &a, const bvec &b) {
+    return !a.val_ && b.val_;  // (NOT a) AND b
+  }
+  VEC_INLINE static bvec knot(const bvec &a) {
+    return !a.val_;
+  }
+  VEC_INLINE static int kortestz(const bvec &a, const bvec &b) {
+    return !(a.val_ || b.val_);  // 1 only when neither mask is set
+  }
+  VEC_INLINE static bvec masku_compress(const bvec &mask, const bvec &a) {
+    return mask.val_ && a.val_;  // false whenever mask is clear
+  }
+  VEC_INLINE static bvec mask_expand(const bvec &src, const bvec &mask,
+                                     const bvec &a) {
+    return mask.val_ ? a.val_ : src.val_;
+  }
+  VEC_INLINE static bvec full() {
+    return true;
+  }
+  VEC_INLINE static bvec empty() {
+    return false;
+  }
+  VEC_INLINE static bvec only(int n) {
+    return n == 1;
+  }
+  VEC_INLINE static bvec after(int n) {
+    return n == 0;
+  }
+  VEC_INLINE static bvec onlyafter(int only, int after) {
+    return only == 1 && after == 0;
+  }
+  VEC_INLINE static int popcnt(const bvec &a) {
+    return a.val_ ? 1 : 0;
+  }
+  VEC_INLINE static bool test_all_unset(const bvec &a) {
+    return !a.val_;
+  }
+  VEC_INLINE static bool test_any_set(const bvec &a) {
+    return a.val_;
+  }
+  VEC_INLINE static bool test_at(const bvec &a, int i) {
+    assert(i < 1);  // lane 0 is the only lane
+    return a.val_;
+  }
+  VEC_INLINE bvec operator &(const bvec &b) const {
+    return val_ && b.val_;
+  }
+  VEC_INLINE bvec operator |(const bvec &b) const {
+    return val_ || b.val_;
+  }
+  VEC_INLINE bvec operator ~() const {
+    return !val_;
+  }
+};
+
+class ivec {  // one-lane integer "vector" used by the scalar fallback
+  friend class fvec;
+  friend class avec;
+  int val_;
+  VEC_INLINE ivec(const int &v) : val_(v) {}
+public:
+  static const int VL = 1;
+  VEC_INLINE ivec() {}
+
+  // Generates the comparison a <op> b and a mask_ form that additionally
+  // requires mask to be set.
+  #define IVEC_MASK_BINFN_B(the_name, the_op)                        \
+    VEC_INLINE static bvec the_name(const ivec &a, const ivec &b) {  \
+      return a.val_ the_op b.val_;                                   \
+    }                                                                \
+    VEC_INLINE static bvec mask_##the_name(                          \
+        const bvec &mask,                                            \
+        const ivec &a, const ivec &b) {                              \
+      return mask.val_ && (a.val_ the_op b.val_);                    \
+    }
+  IVEC_MASK_BINFN_B(cmpeq, ==)
+  IVEC_MASK_BINFN_B(cmplt, <)
+  IVEC_MASK_BINFN_B(cmpneq, !=)
+  IVEC_MASK_BINFN_B(cmpgt, >)
+
+  // Masked binary op: a <op> b when mask is set, otherwise src.
+  #define IVEC_MASK_BINFN_I(the_name, the_op)                        \
+    VEC_INLINE static ivec mask_##the_name(                          \
+        const ivec &src, const bvec &mask,                           \
+        const ivec &a, const ivec &b) {                              \
+      return mask.val_ ? a.val_ the_op b.val_ : src.val_;            \
+    }
+  IVEC_MASK_BINFN_I(add, +)
+  // Returns b when mask is set and a otherwise.
+  VEC_INLINE static ivec mask_blend(
+      const bvec &mask, const ivec &a, const ivec &b) {
+    return mask.val_ ? b.val_ : a.val_;
+  }
+
+  #define IVEC_BINFN_I(the_name, the_op)                             \
+    VEC_INLINE static ivec the_name(const ivec &a, const ivec &b) {  \
+      return a.val_ the_op b.val_;                                   \
+    }
+  IVEC_BINFN_I(mullo, *)
+  IVEC_BINFN_I(srlv, >>)  // NOTE(review): >> on a signed int; confirm inputs are >= 0
+  VEC_INLINE static ivec the_and(const ivec &a, const ivec &b) {
+    return a.val_ & b.val_;
+  }
+
+  // Single lane: expanding is a select between b and src.
+  VEC_INLINE static ivec mask_expand(
+      const ivec &src, const bvec &a, const ivec &b) {
+    return a.val_ ? b.val_ : src.val_;
+  }
+  // Single lane: compressing yields b when the mask is set, else 0.
+  VEC_INLINE static ivec masku_compress(
+      const bvec &a, const ivec &b) {
+    return a.val_ ? b.val_ : 0;
+  }
+
+  VEC_INLINE static int at(const ivec &a, int b) {
+    assert(b == 0);  // lane 0 is the only lane
+    return a.val_;
+  }
+
+  VEC_INLINE static ivec load(const int * src) {
+    return *src;
+  }
+  VEC_INLINE static ivec mask_loadu(const bvec &mask, const int * src) {
+    return mask.val_ ? *src : 0xDEAD;  // sentinel value when masked off
+  }
+  VEC_INLINE static ivec maskz_loadu(const bvec &mask, const int * src) {
+    return mask.val_ ? *src : 0;
+  }
+  VEC_INLINE static void mask_storeu(const bvec &mask, int * dest,
+                                     const ivec &src) {
+    if (mask.val_) *dest = src.val_;
+  }
+  VEC_INLINE static void store(int * dest, const ivec &src) {
+    *dest = src.val_;
+  }
+
+  VEC_INLINE static ivec mask_gather(
+      const ivec &src, const bvec &mask, const ivec &idx, const int * mem,
+      const int scale) {
+    if (!mask.val_) return src.val_;  // masked off: memory is not read
+    const char * base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const int *>(base + scale * idx.val_);
+  }
+  VEC_INLINE static void mask_i32scatter(
+      int * mem, const bvec &mask, const ivec &idx, const ivec &a,
+      const int scale) {
+    if (!mask.val_) return;  // masked off: memory is not written
+    char * base = reinterpret_cast<char*>(mem);
+    *reinterpret_cast<int *>(base + scale * idx.val_) = a.val_;
+  }
+
+  VEC_INLINE static void mask_compressstore(const bvec &mask, int * dest,
+                                            const ivec &src) {
+    if (mask.val_) *dest = src.val_;
+  }
+
+  VEC_INLINE static ivec set(
+      int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+      int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0
+  ) {
+    return i0;  // only the last argument is kept; the others are ignored
+  }
+  VEC_INLINE static ivec set1(int i) {
+    return i;
+  }
+  VEC_INLINE static ivec setzero() {
+    return 0;
+  }
+  VEC_INLINE static ivec undefined() {
+    return 0xDEAD;  // recognizable placeholder value
+  }
+
+  VEC_INLINE ivec operator +(const ivec &b) const {
+    return val_ + b.val_;
+  }
+};
+
+class fvec {  // one-lane floating-point "vector" used by the scalar fallback
+  friend class avec;
+  flt_t val_;
+  VEC_INLINE fvec(const flt_t &v) : val_(v) {}
+public:
+  static const int VL = 1;
+  VEC_INLINE fvec() {}
+  VEC_INLINE static flt_t at(const fvec &a, int i) {
+    assert(i < 1);  // lane 0 is the only lane
+    return a.val_;
+  }
+  VEC_INLINE static bool fast_compress() { return false; }
+
+  // Generates the comparison a <op> b and a mask_ form that additionally
+  // requires mask to be set.
+  #define FVEC_MASK_BINFN_B(the_name, the_op)                        \
+    VEC_INLINE static bvec the_name(const fvec &a, const fvec &b) {  \
+      return a.val_ the_op b.val_;                                   \
+    }                                                                \
+    VEC_INLINE static bvec mask_##the_name(                          \
+        const bvec &mask, const fvec &a, const fvec &b) {            \
+      return mask.val_ && (a.val_ the_op b.val_);                    \
+    }
+  FVEC_MASK_BINFN_B(cmple, <=)
+  FVEC_MASK_BINFN_B(cmplt, <)
+  FVEC_MASK_BINFN_B(cmpneq, !=)
+  FVEC_MASK_BINFN_B(cmpnle, >)
+  FVEC_MASK_BINFN_B(cmpnlt, >=)
+
+  #define FVEC_UNFN_F(the_name, the_fn)                              \
+    VEC_INLINE static fvec the_name(const fvec &a) {                 \
+      return the_fn(a.val_);                                         \
+    }
+  FVEC_UNFN_F(abs, fabs)
+  FVEC_UNFN_F(exp, ::exp)
+  FVEC_UNFN_F(invsqrt, 1/std::sqrt)  // expands to 1/std::sqrt(a.val_)
+  FVEC_UNFN_F(recip, 1/)             // expands to 1/(a.val_)
+  FVEC_UNFN_F(sqrt, std::sqrt)
+
+  // Masked unary function: fn(a) when mask is set, otherwise src.
+  #define FVEC_MASK_UNFN_F(the_name, the_fn)                         \
+    VEC_INLINE static fvec mask_##the_name(                          \
+        const fvec &src, const bvec &mask,                           \
+        const fvec &a) {                                             \
+      return mask.val_ ? the_fn(a.val_) : src.val_;                  \
+    }
+  FVEC_MASK_UNFN_F(cos, std::cos)
+  FVEC_MASK_UNFN_F(recip, 1/)
+  FVEC_MASK_UNFN_F(sqrt, std::sqrt)
+
+  #define FVEC_BINFN_F(the_name, the_fn)                             \
+    VEC_INLINE static fvec the_name(const fvec &a, const fvec &b) {  \
+      return the_fn(a.val_, b.val_);                                 \
+    }
+  FVEC_BINFN_F(max, ::fmax)
+  FVEC_BINFN_F(min, ::fmin)
+
+  // Masked binary op: a <op> b when mask is set, otherwise src.
+  #define FVEC_MASK_BINFN_F(the_name, the_op)                        \
+    VEC_INLINE static fvec mask_##the_name(                          \
+        const fvec &src, const bvec &mask,                           \
+        const fvec &a, const fvec &b) {                              \
+      return mask.val_ ? a.val_ the_op b.val_ : src.val_;            \
+    }
+  FVEC_MASK_BINFN_F(add, +)
+  FVEC_MASK_BINFN_F(div, /)
+  FVEC_MASK_BINFN_F(mul, *)
+  FVEC_MASK_BINFN_F(sub, -)
+  // Returns b when mask is set and a otherwise.
+  VEC_INLINE static fvec mask_blend(
+      const bvec &mask, const fvec &a, const fvec &b) {
+    return mask.val_ ? b.val_ : a.val_;
+  }
+
+  // Single lane: expanding is a select between b and src.
+  VEC_INLINE static fvec mask_expand(
+      const fvec &src, const bvec &a, const fvec &b) {
+    return a.val_ ? b.val_ : src.val_;
+  }
+  // Single lane: compressing yields b when the mask is set, else 0.
+  VEC_INLINE static fvec masku_compress(
+      const bvec &a, const fvec &b) {
+    return a.val_ ? b.val_ : 0;
+  }
+
+  VEC_INLINE static fvec set1(const flt_t &a) {
+    return a;
+  }
+  VEC_INLINE static fvec setzero() {
+    return 0;
+  }
+  VEC_INLINE static fvec undefined() {
+    return 1337.1337;  // recognizable placeholder value
+  }
+
+  VEC_INLINE static fvec load(const flt_t *mem) {
+    return *mem;
+  }
+  VEC_INLINE static void mask_storeu(const bvec &mask, flt_t * dest,
+                                     const fvec &a) {
+    if (mask.val_) *dest = a.val_;
+  }
+  VEC_INLINE static void store(flt_t * dest, const fvec &a) {
+    *dest = a.val_;
+  }
+
+  VEC_INLINE static fvec gather(const ivec &idx, const flt_t * mem,
+                                const int scale) {
+    const char * base = reinterpret_cast<const char*>(mem);  // scale is in bytes
+    return *reinterpret_cast<const flt_t*>(base + scale * idx.val_);
+  }
+  VEC_INLINE static fvec mask_gather(
+      const fvec &src, const bvec &mask, const ivec &idx,
+      const flt_t * mem, const int scale) {
+    if (!mask.val_) return src.val_;  // masked off: memory is not read
+    const char * base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const flt_t*>(base + scale * idx.val_);
+  }
+
+  VEC_INLINE static void gather_3_adjacent(const ivec &idx, const flt_t * mem,
+                                           const int scale, fvec * out_0,
+                                           fvec * out_1, fvec * out_2) {
+    assert(scale == sizeof(flt_t));
+    *out_0 = gather(idx, mem + 0, scale);  // element idx
+    *out_1 = gather(idx, mem + 1, scale);  // element idx + 1
+    *out_2 = gather(idx, mem + 2, scale);  // element idx + 2
+  }
+  VEC_INLINE static void gather_4_adjacent(const ivec &idx, const flt_t * mem,
+                                           const int scale, fvec * out_0,
+                                           fvec * out_1, fvec * out_2,
+                                           fvec * out_3) {
+    assert(scale == sizeof(flt_t));
+    *out_0 = gather(idx, mem + 0, scale);  // element idx
+    *out_1 = gather(idx, mem + 1, scale);  // element idx + 1
+    *out_2 = gather(idx, mem + 2, scale);  // element idx + 2
+    *out_3 = gather(idx, mem + 3, scale);  // element idx + 3
+  }
+
+  VEC_INLINE static flt_t mask_reduce_add(const bvec &mask, const fvec &a) {
+    return mask.val_ ? a.val_ : 0;
+  }
+  VEC_INLINE static flt_t reduce_add(const fvec &a) {
+    return a.val_;
+  }
+
+  VEC_INLINE static ivec unpackloepi32(const fvec &a) {
+    return reinterpret_cast<const int*>(&a.val_)[0];  // first 4 bytes read as int
+  }
+
+  VEC_INLINE static fvec mask_sincos(
+      fvec * cos_out, const fvec &src_a, const fvec &src_b,
+      const bvec &mask, const fvec &arg) {
+    // Cosine is delivered through cos_out, sine through the return value.
+    cos_out->val_ = mask.val_ ? ::cos(arg.val_) : src_b.val_;
+    return mask.val_ ? ::sin(arg.val_) : src_a.val_;
+  }
+
+  #define FVEC_BINOP(the_sym, the_name)                              \
+    VEC_INLINE inline fvec operator the_sym(const fvec &b) const {   \
+    return this->val_ the_sym b.val_;                                \
+  }
+  FVEC_BINOP(+, add)
+  FVEC_BINOP(-, sub)
+  FVEC_BINOP(*, mul)
+  FVEC_BINOP(/, div)
+
+  VEC_INLINE static void gather_prefetch0(const ivec &idx, const void * mem) {}
+};
+
+class avec {  // one-lane accumulator of type acc_t for the scalar fallback
+  acc_t val_;
+  VEC_INLINE avec(const acc_t &a) : val_(a) {}
+public:
+  VEC_INLINE avec(const fvec &a) : val_(a.val_) {}
+  VEC_INLINE static avec undefined() {
+    return 1337.1337;  // recognizable placeholder value
+  }
+  VEC_INLINE static avec mask_gather(const avec &src, const bvec &mask,
+      const ivec &idx, const acc_t * mem, const int scale) {
+    if (!mask.val_) return src.val_;  // masked off: memory is not read
+    const char * base = reinterpret_cast<const char*>(mem);
+    return *reinterpret_cast<const acc_t*>(base + scale * idx.val_);
+  }
+  VEC_INLINE static void mask_i32loscatter(acc_t * mem, const bvec &mask,
+      const ivec &idx, const avec &a, const int scale) {
+    if (!mask.val_) return;  // masked off: memory is not written
+    char * base = reinterpret_cast<char*>(mem);
+    *reinterpret_cast<acc_t*>(base + idx.val_ * scale) = a.val_;
+  }
+
+  #define AVEC_BINOP(the_sym, the_name)                              \
+    VEC_INLINE inline avec operator the_sym(const avec &b) const {   \
+    return this->val_ the_sym b.val_;                                \
+  }
+  AVEC_BINOP(-, sub)
+};
+
+};
+
+#endif
diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp
index c5574a78c7a59703fee558dd2e8a910f7e02048f..3a36ead499ebe34a1dfcce5350abbd1f13fb8d6b 100644
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@@ -211,6 +211,8 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
     for (i = nall-1; i >= nlocal; i--) {
       if (mask[i] & bitmask) {
         ibin = coord2bin(atom->x[i]);
+	// Bin of a ghost atom is only needed when neighbor lists are built for ghosts
+	atombin[i] = ibin;
         bins[i] = binhead[ibin];
         binhead[ibin] = i;
       }
@@ -222,14 +224,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
       binhead[ibin] = i;
     }
   } else {
-    for (i = nall-1; i >= nlocal; i--) {
-      ibin = coord2bin(atom->x[i]);
-      bins[i] = binhead[ibin];
-      binhead[ibin] = i;
-    }
-    for (i = nlocal-1; i >= 0; i--) {
+    for (i = nall-1; i >= 0; i--) {
       ibin = coord2bin(atom->x[i]);
-      atombin[i]=ibin;
+      // For ghost atoms this is only needed when neighbor lists are built for ghosts
+      atombin[i] = ibin;
       bins[i] = binhead[ibin];
       binhead[ibin] = i;
     }
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..12101712f11ec833b6501b88959844ce9085854b
--- /dev/null
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -0,0 +1,593 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "npair_full_bin_ghost_intel.h"
+#include "neighbor.h"
+#include "nstencil.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "molecule.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* -- constructor: only forwards lmp to the NPairIntel base class ----------- */
+
+NPairFullBinGhostIntel::NPairFullBinGhostIntel(LAMMPS *lmp) : NPairIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   full binned neighbor list build: every pair is stored for both i and j
+   lists are also built for ghost atoms, but no "special neighbors" for ghosts
+   rejects unsupported options, then dispatches on the fix precision mode
+------------------------------------------------------------------------- */
+
+void NPairFullBinGhostIntel::build(NeighList *list)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_fix->offload_noghost()) // offload builds only: the "ghost no" setting is rejected
+    error->all(FLERR,
+      "The 'ghost no' option cannot be used with this USER-INTEL pair style.");
+  #endif
+
+  if (nstencil > INTEL_MAX_STENCIL_CHECK) // stencil larger than the package limit
+    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (exclude) // offload builds only: exclusions are rejected
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (_fix->precision() == FixIntel::PREC_MODE_MIXED) // pick the buffer set matching the precision mode
+    fbi(list, _fix->get_mixed_buffers());
+  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    fbi(list, _fix->get_double_buffers());
+  else
+    fbi(list, _fix->get_single_buffers());
+
+  _fix->stop_watch(TIME_HOST_NEIGHBOR); // NOTE(review): the matching start_watch is not in this function
+}
+
+/* -- per-precision driver: sizes the list, then runs offload and host passes -- */
+
+template<class flt_t, class acc_t>
+void NPairFullBinGhostIntel::fbi(NeighList * list, 
+				 IntelBuffers<flt_t,acc_t> * buffers) 
+{
+  const int nlocal = atom->nlocal;
+  const int nall = atom->nlocal + atom->nghost;
+  list->inum = atom->nlocal; // list covers all local atoms ...
+  list->gnum = atom->nghost; // ... and all ghost atoms
+
+  int host_start = _fix->host_start_neighbor();
+  const int off_end = _fix->offload_end_neighbor();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_end) grow_stencil();
+  if (_fix->full_host_list()) host_start = 0;
+  int offload_noghost = _fix->offload_noghost(); // NOTE(review): not read again in this function
+  #endif
+
+  // only uses offload_end_neighbor to check whether we are doing offloading
+  // at all, no need to correct this later
+  buffers->grow_list(list, nall, comm->nthreads, off_end,
+		     _fix->nbor_pack_width());
+
+  int need_ic = 0;
+  if (atom->molecular) // image check is only evaluated for molecular systems, with cutneighmax in all three arguments
+    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
+			 neighbor->cutneighmax);
+
+  if (need_ic) { // need_ic becomes a template argument of the kernel
+    fbi<flt_t,acc_t,1>(1, list, buffers, 0, off_end); // offload pass; the kernel returns at once if off_end == 0
+    fbi<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); // host pass
+  } else {
+    fbi<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
+    fbi<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
+  }
+}
+
+/* -- list-building kernel; need_ic enables the minimum image check for locals -- */
+
+template<class flt_t, class acc_t, int need_ic>
+void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, 
+				 IntelBuffers<flt_t,acc_t> * buffers,
+				 const int pstart, const int pend) {
+  if (pend-pstart == 0) return; // empty range: nothing to build (pstart/pend are not read again below)
+
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1; // NOTE(review): appears unused in this function
+  int nall_t = nall;
+  const int aend = nall; // atoms [0, aend) get a list: locals and ghosts
+
+  const int pack_width = _fix->nbor_pack_width(); // NOTE(review): appears unused in this function
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL;
+  tagint *s = NULL;
+  int tag_size = 0, special_size; // sizes are used as lengths in the offload clauses below
+  if (buffers->need_tag()) tag_size = e_nall;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder; // non-molecular: point at the buffers' holder members, size 0
+    ns = &buffers->_nspecial_holder;
+    special_size = 0;
+  }
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const tagint * _noalias const tag = atom->tag;
+
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = this->nstencil;
+  const int * _noalias const stencil = this->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
+  const flt_t * _noalias const cutneighghostsq = 
+    buffers->get_cutneighghostsq()[0];
+  const int ntypes = atom->ntypes + 1; // row stride of the cutoff tables, indexed below as [itype * ntypes + jtype]
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask; // only needed by the exclusion filter, which is compiled under the same guard
+  tagint * const molecule = atom->molecule;
+  #endif
+
+  int *molindex = atom->molindex; // NOTE(review): molindex, molatom and onemols appear unused in this function
+  int *molatom = atom->molatom;
+  Molecule **onemols = atom->avec->onemols;
+  int moltemplate;
+  if (molecular == 2) moltemplate = 1; // molecular == 2 is rejected with an error just below
+  else moltemplate = 0;
+  if (moltemplate) 
+    error->all(FLERR, 
+	       "Can't use moltemplate with npair style full/bin/ghost/intel.");
+
+  int tnum;
+  int *overflow;
+  double *timer_compute; // only assigned and dereferenced in offload builds
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    timer_compute = _fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = _fix->get_off_overflow_flag();
+    _fix->stop_watch(TIME_HOST_NEIGHBOR);
+    _fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else
+  #endif
+  {
+    tnum = comm->nthreads; // host pass: thread count and overflow flag of the host
+    overflow = _fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+  int * _noalias const atombin = buffers->get_atombin(); // read as bin index per atom, then overwritten per atom in the loop below
+  const int * _noalias const binpacked = buffers->get_binpacked();
+
+  const int xperiodic = domain->xperiodic;
+  const int yperiodic = domain->yperiodic;
+  const int zperiodic = domain->zperiodic;
+  const flt_t xprd_half = domain->xprd_half;
+  const flt_t yprd_half = domain->yprd_half;
+  const flt_t zprd_half = domain->zprd_half;
+
+  flt_t * _noalias const ncachex = buffers->get_ncachex(); // per-thread candidate caches, sliced by ncache_stride below
+  flt_t * _noalias const ncachey = buffers->get_ncachey();
+  flt_t * _noalias const ncachez = buffers->get_ncachez();
+  int * _noalias const ncachej = buffers->get_ncachej();
+  int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
+  const int ncache_stride = buffers->ncache_stride();
+
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * const stencilxyz = &this->stencilxyz[0][0]; // flattened (x,y,z) bin offsets, 3 ints per stencil entry
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const bins = this->bins;
+  const int cop = _fix->coprocessor_number();
+  const int separate_buffers = _fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(atombin:length(aend) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
+    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \
+    in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
+    in(stencilxyz:length(3*nstencil)) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(tag)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = 0;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    int nstencilp = 0; // number of merged stencil ranges built below
+    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
+    for (int k = 0; k < nstencil; k++) { // merge runs of consecutive stencil offsets into [binstart, binend) ranges
+      binstart[nstencilp] = stencil[k];
+      int end = stencil[k] + 1;
+      for (int kk = k + 1; kk < nstencil; kk++) {
+        if (stencil[kk-1]+1 == stencil[kk]) { // next offset is contiguous: extend the current range
+          end++;
+          k++;
+        } else break;
+      }
+      binend[nstencilp] = end;
+      nstencilp++;
+    }
+
+    const int mbinyx = mbiny * mbinx; // bins per z-layer, used to decompose a flat bin index
+
+    #if defined(_OPENMP)
+    #pragma omp parallel
+    #endif
+    {
+      const int num = aend;
+      int tid, ifrom, ito;
+
+      const double balance_factor = 2.0; // each local atom counts this many work units, each ghost counts one
+      const double ibalance_factor = 1.0 / balance_factor;
+      const int gnum = num - nlocal;
+      const int wlocal = static_cast<int>(ceil(balance_factor * nlocal));
+      const int snum = wlocal + gnum; // total weighted work units split across threads
+      IP_PRE_omp_range_id(ifrom, ito, tid, snum, nthreads); // macro defined elsewhere; presumably sets tid and this thread's [ifrom, ito) over snum units
+      if (ifrom < wlocal) ifrom = static_cast<int>(ibalance_factor * ifrom); // map weighted positions back to atom indices
+      else ifrom -= wlocal - nlocal;
+      if (ito < wlocal) ito = static_cast<int>(ibalance_factor * ito);
+      else ito -= wlocal - nlocal;
+
+      int e_ito = ito;
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors; // end of the firstneigh region this thread may fill
+
+      int which;
+
+      int pack_offset = maxnbors; // start of the first staging area relative to neighptr
+      int ct = (ifrom + tid * 2) * maxnbors; // this thread's first slot in firstneigh
+      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2; // space one atom may touch while staging (3 * maxnbors)
+
+      const int toffs = tid * ncache_stride; // this thread's slice of the candidate caches
+      flt_t * _noalias const tx = ncachex + toffs;
+      flt_t * _noalias const ty = ncachey + toffs;
+      flt_t * _noalias const tz = ncachez + toffs;
+      int * _noalias const tj = ncachej + toffs;
+      int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;
+
+      // for each atom of this thread: gather candidates from stencil bins, store every pair
+      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; // oldbin sentinel forces a cache fill for the first atom; NOTE(review): istart, icount, lane, max_chunk appear unused
+      for (int i = ifrom; i < ito; i++) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+        const tagint itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        const int ibin = atombin[i];
+        if (ibin != oldbin) { // refill the candidate cache only when the bin changes
+          oldbin = ibin;
+          ncount = 0;
+	  if (i < nlocal) { // local atom: walk the merged stencil ranges
+	    for (int k = 0; k < nstencilp; k++) {
+	      const int bstart = binhead[ibin + binstart[k]];
+	      const int bend = binhead[ibin + binend[k]];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } else { // ghost atom: visit stencil entries one by one with a grid bounds check
+	    const int zbin = ibin / mbinyx;
+	    const int zrem = ibin % mbinyx;
+	    const int ybin = zrem / mbinx;
+	    const int xbin = zrem % mbinx;
+	    for (int k = 0; k < nstencil; k++) {
+	      const int xbin2 = xbin + stencilxyz[3 * k + 0];
+	      const int ybin2 = ybin + stencilxyz[3 * k + 1];
+	      const int zbin2 = zbin + stencilxyz[3 * k + 2];
+	      if (xbin2 < 0 || xbin2 >= mbinx || // skip stencil bins that fall outside the bin grid
+                  ybin2 < 0 || ybin2 >= mbiny ||
+                  zbin2 < 0 || zbin2 >= mbinz) continue;
+
+	      const int bstart = binhead[ibin + stencil[k]];
+	      const int bend = binhead[ibin + stencil[k] + 1];
+              #if defined(LMP_SIMD_COMPILER)
+              #pragma vector aligned
+              #pragma simd
+              #endif
+              for (int jj = bstart; jj < bend; jj++)
+                tj[ncount++] = binpacked[jj];
+	    }
+	  } // if i < nlocal
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int u = 0; u < ncount; u++) { // cache position, type and tag of every candidate
+            const int j = tj[u];
+            tx[u] = x[j].x;
+            ty[u] = x[j].y;
+            tz[u] = x[j].z;
+            tjtype[u] = x[j].w;
+	    ttag[u] = tag[j];
+          }
+	} // if ibin != oldbin
+
+        // ---------------------- Test cached candidates against the cutoff
+
+        int n = maxnbors; // write cursor of the first staging area
+        int n2 = n * 2; // write cursor of the second staging area
+	int *neighptr2 = neighptr; // same base pointer; the areas differ only by cursor
+	const flt_t * _noalias cutsq;
+	if (i < nlocal) cutsq = cutneighsq; // ghost atoms use the separate ghost cutoff table
+	else cutsq = cutneighghostsq;
+
+	const int icp = i;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int u = 0; u < ncount; u++) {
+          int addme = 1;
+          int j = tj[u];
+
+	  if (i == j) addme = 0; // never list an atom as its own neighbor
+
+          // Cutoff Check
+          const flt_t delx = xtmp - tx[u];
+          const flt_t dely = ytmp - ty[u];
+          const flt_t delz = ztmp - tz[u];
+          const int jtype = tjtype[u];
+	  const int jtag = ttag[u];
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutsq[ioffset + jtype]) addme = 0;
+
+          if (need_ic && icp < nlocal) { // image check applies to local atoms only
+            int no_special;
+	    ominimum_image_check(no_special, delx, dely, delz);
+            if (no_special)
+              j = -j - 1; // flag the entry by storing it negative; decoded in the molecular pass below
+          }
+
+	  int flist = 0; // picks the staging area from tag order and parity; coordinates break ties on equal tags
+	  if (itag > jtag) {
+	    if (((itag+jtag) & 1) == 0) flist = 1;
+	  } else if (itag < jtag) {
+	    if (((itag+jtag) & 1) == 1) flist = 1;
+	  } else {
+	    if (tz[u] < ztmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
+	    else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp)
+	      flist = 1;
+	  }
+	  if (addme) {
+	    if (flist)
+	      neighptr2[n2++] = j; // second staging area, starts at 2 * maxnbors
+	    else
+	      neighptr[n++] = j; // first staging area, starts at maxnbors
+	  }
+        } // for u
+
+        #ifndef _LMP_INTEL_OFFLOAD
+        if (exclude) { // drop excluded pairs, re-packing each staging area in place
+          int alln = n;
+          n = maxnbors;
+          for (int u = pack_offset; u < alln; u++) {
+            const int j = neighptr[u];
+            int pj = j;
+            if (need_ic)
+              if (pj < 0) pj = -j - 1; // undo the negative flag to get the atom index
+            const int jtype = x[pj].w;
+            if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+            neighptr[n++] = j;
+          }
+	  alln = n2;
+	  n2 = maxnbors * 2;
+	  for (int u = n2; u < alln; u++) {
+	    const int j = neighptr[u];
+	    int pj = j;
+	    if (need_ic)
+	      if (pj < 0) pj = -j - 1;
+	    const int jtype = x[pj].w;
+	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+	    neighptr[n2++] = j;
+	  }
+        }
+        #endif
+        int ns = n - maxnbors; // entries in the first staging area
+	int alln = n;
+	atombin[i] = ns; // atombin[i] now holds the first-area count instead of the bin index
+	n = 0;
+	for (int u = maxnbors; u < alln; u++) // compact: first area moves to the front of neighptr ...
+          neighptr[n++] = neighptr[u];
+	ns += n2 - maxnbors * 2;
+	for (int u = maxnbors * 2; u < n2; u++) // ... followed directly by the second area
+          neighptr[n++] = neighptr[u];
+	if (ns > maxnbors) *overflow = 1; // more neighbors than maxnbors: flag overflow
+
+        ilist[i] = i;
+        cnumneigh[i] = ct; // offset of this atom's list inside firstneigh
+        numneigh[i] = ns;
+
+	ct += ns;
+	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	const int edge = ct & (alignb - 1); // mask form of ct % alignb; assumes alignb is a power of two
+	if (edge) ct += alignb - edge; // round ct up to the next multiple of alignb
+	neighptr = firstneigh + ct;
+	if (ct + obound > list_size) { // next atom could run past this thread's region
+	  if (i < ito - 1) {
+	    *overflow = 1;
+	    ct = (ifrom + tid * 2) * maxnbors; // restart at the thread's first slot; results are discarded below
+	  }
+	}
+      }
+
+      if (*overflow == 1) // overflow flagged: zero the counts for this thread's atoms
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          #if __INTEL_COMPILER+0 > 1499
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) { // NOTE(review): j is computed but never used; this loop has no visible effect
+            int j = jlist[jj];
+            if (need_ic && j < 0) j = -j - 1;
+          }
+        }
+
+	overflow[LMP_LOCAL_MIN] = 0;
+	overflow[LMP_LOCAL_MAX] = nlocal - 1;
+	overflow[LMP_GHOST_MIN] = nlocal;
+	overflow[LMP_GHOST_MAX] = e_nall - 1;
+
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      } // if separate_buffers
+      #endif
+
+      if (molecular) {
+	int ito_m = ito;
+	if (ito >= nlocal) ito_m = nlocal; // special-bond encoding is applied to local atoms only
+        for (int i = ifrom; i < ito_m; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            if (need_ic && j < 0) { // entry flagged by the image check: decode it, no special lookup
+              which = 0;
+              jlist[jj] = -j - 1;
+            } else
+              ofind_special(which, special, nspecial, i, tag[j]);
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j >= nlocal) {
+              if (j == e_nall)
+                jlist[jj] = nall_offset;
+              else if (which)
+                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+              else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+            if (which) jlist[jj] = j ^ (which << SBBITS); // store `which` in the bits above SBBITS
+          }
+        } // for i
+      } // if molecular
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          int jj = 0;
+          #pragma vector aligned
+          #pragma simd
+          for (jj = 0; jj < jnum; jj++) {
+            if (jlist[jj] >= nlocal) {
+              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+              else jlist[jj] -= ghost_offset;
+            }
+          }
+        }
+      }
+      #endif
+    } // end omp
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    _fix->start_watch(TIME_HOST_NEIGHBOR);
+    for (int n = 0; n < aend; n++) { // offload pass: host-side ilist is identity and counts are zeroed
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+  } else {
+    for (int i = 0; i < aend; i++) // host pass: publish per-atom list pointers
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    if (separate_buffers) {
+      _fix->start_watch(TIME_PACK);
+      _fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(_fix->host_min_local(),
+                                    _fix->host_used_local(),
+                                    _fix->host_min_ghost(),
+                                    _fix->host_used_ghost());
+      _fix->stop_watch(TIME_PACK);
+    }
+  }
+  #else
+  #pragma vector aligned
+  #pragma simd
+  for (int i = 0; i < aend; i++) // NOTE(review): the two pragmas above lack the LMP_SIMD_COMPILER guard used earlier in this function
+    list->firstneigh[i] = firstneigh + cnumneigh[i];
+  #endif
+}
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.h b/src/USER-INTEL/npair_full_bin_ghost_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4449dfa1e1b0b6958cb20b907f9800277d051e04
--- /dev/null
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.h
@@ -0,0 +1,55 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+NPairStyle(full/bin/ghost/intel,
+           NPairFullBinGhostIntel,
+           NP_FULL | NP_BIN | NP_GHOST | NP_NEWTON | NP_NEWTOFF | 
+           NP_ORTHO | NP_TRI | NP_INTEL)
+
+#else
+
+#ifndef LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+#define LMP_NPAIR_FULL_BIN_GHOST_INTEL_H
+
+#include "npair_intel.h"
+
+namespace LAMMPS_NS {
+
+class NPairFullBinGhostIntel : public NPairIntel { // neighbor-pair style registered above as full/bin/ghost/intel
+ public:
+  NPairFullBinGhostIntel(class LAMMPS *);
+  ~NPairFullBinGhostIntel() {}
+  void build(class NeighList *); // entry point; defined in npair_full_bin_ghost_intel.cpp
+ private:
+  template<class flt_t, class acc_t>
+  void fbi(NeighList * list, IntelBuffers<flt_t,acc_t> * buffers); // driver, templated on the buffer precision types
+  template<class flt_t, class acc_t, int need_ic>
+  void fbi(const int offload, NeighList * list, 
+	   IntelBuffers<flt_t,acc_t> * buffers, 
+           const int astart, const int aend); // kernel; NOTE(review): the definition names these parameters pstart/pend
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index b20b1dcd08d95cc3afed1cbe50d6b0153d3eeb32..79dc75366e80d73db0c70ef7cb46d45dce54c1f0 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -143,6 +143,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
   flt_t * _noalias const ncachez = buffers->get_ncachez();
   int * _noalias const ncachej = buffers->get_ncachej();
   int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  int * _noalias const ncachetag = buffers->get_ncachetag();
   const int ncache_stride = buffers->ncache_stride();
 
   #ifdef _LMP_INTEL_OFFLOAD
@@ -165,7 +166,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
     in(atombin:length(aend) alloc_if(0) free_if(0)) \
     in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
     in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
-    in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \
     in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
     in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
     in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
@@ -222,7 +223,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       ito += astart;
       int e_ito = ito;
       if (THREE && ito == num) {
-        int imod = ito % pack_width;
+        int imod = ito & (pack_width - 1);
         if (imod) e_ito += pack_width - imod;
       }
       const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
@@ -241,6 +242,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
       flt_t * _noalias const tz = ncachez + toffs;
       int * _noalias const tj = ncachej + toffs;
       int * _noalias const tjtype = ncachejtype + toffs;
+      int * _noalias const ttag = ncachetag + toffs;
 
       flt_t * _noalias itx;
       flt_t * _noalias ity;
@@ -287,13 +289,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             ty[u] = x[j].y;
             tz[u] = x[j].z;
             tjtype[u] = x[j].w;
+	    if (THREE) ttag[u] = tag[j];
           }
 
           if (FULL == 0 || TRI == 1) {
             icount = 0;
             istart = ncount;
             const int alignb = INTEL_DATA_ALIGN / sizeof(int);
-            int nedge = istart % alignb;
+            int nedge = istart & (alignb - 1);
             if (nedge) istart + (alignb - nedge);
             itx = tx + istart;
             ity = ty + istart;
@@ -343,7 +346,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
 
             // i bin (half) check and offload ghost check
             if (j < nlocal) {
-              const int ijmod = (i + j) % 2;
+              const int ijmod = (i + j) & 1;
               if (i > j) {
                 if (ijmod == 0) addme = 0;
               } else if (i < j) {
@@ -424,8 +427,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           }
           #endif
 
-          int pj;
-          if (THREE) pj = j;
           if (need_ic) {
             int no_special;
             ominimum_image_check(no_special, delx, dely, delz);
@@ -434,12 +435,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           }
 
           if (THREE) {
-            const int jtag = tag[pj];
+            const int jtag = ttag[u];
             int flist = 0;
             if (itag > jtag) {
-              if ((itag+jtag) % 2 == 0) flist = 1;
+	      if (((itag+jtag) & 1) == 0) flist = 1;
             } else if (itag < jtag) {
-              if ((itag+jtag) % 2 == 1) flist = 1;
+	      if (((itag+jtag) & 1) == 1) flist = 1;
             } else {
               if (tz[u] < ztmp) flist = 1;
               else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
@@ -512,7 +513,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           cnumneigh[i] += lane;
           numneigh[i] = ns;
         } else {
-          int edge = (n % pad_width);
+          int edge = n & (pad_width - 1);
           if (edge) {
             const int pad_end = n + (pad_width - edge);
             #if defined(LMP_SIMD_COMPILER)
@@ -532,7 +533,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
           if (lane == pack_width) {
             ct += max_chunk * pack_width;
             const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-            const int edge = (ct % alignb);
+            const int edge = ct & (alignb - 1);
             if (edge) ct += alignb - edge;
             neighptr = firstneigh + ct;
             max_chunk = 0;
@@ -548,7 +549,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
         } else {
           ct += n;
           const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = (ct % alignb);
+          const int edge = ct & (alignb - 1);
           if (edge) ct += alignb - edge;
           neighptr = firstneigh + ct;
           if (ct + obound > list_size) {
diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad3c97c9df0e61b0f998e078eb1ea4fa2cfead61
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_intel.cpp
@@ -0,0 +1,4891 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+#include <unistd.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <stddef.h>
+#include "lmptype.h"
+#include "intel_preprocess.h"
+#include "intel_intrinsics_airebo.h"
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+#include <omp.h>
+#include <string.h>
+#include "pair_airebo_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "force.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "group.h"
+#include "kspace.h"
+#include "modify.h"
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+// Potential constants in kernel precision; get_param() fills every field from this pair style
+template<typename flt_t, typename acc_t>
+struct LAMMPS_NS::PairAIREBOIntelParam {
+  flt_t cutlj, cutljrebosq, cut3rebo;
+  flt_t sigmin, sigcut;
+  flt_t cutljsq[2][2];  // this and lj1-lj4 are copied from the row pointer a[0] in get_param()
+  flt_t lj1[2][2], lj2[2][2], lj3[2][2], lj4[2][2];
+  // rcmin/rcmax are read as [itype][jtype] in frebo_pij(), with CARBON = 0 and HYDROGEN = 1
+  flt_t smin, Nmin, Nmax, NCmin, NCmax, thmin, thmax;
+  flt_t rcmin[2][2], rcmax[2][2], rcmaxsq[2][2], rcmaxp[2][2];
+  flt_t Q[2][2], alpha[2][2], A[2][2], rho[2][2], BIJc[2][2][3],
+      Beta[2][2][3];
+  flt_t rcLJmin[2][2], rcLJmax[2][2], rcLJmaxsq[2][2], bLJmin[2][2],
+      bLJmax[2][2];
+  flt_t epsilon[2][2], sigma[2][2], epsilonT[2][2];
+
+  // spline coefficients
+  // (domains are flt_t; the piCC/piCH/piHH/Tijc coefficient tables are acc_t)
+  flt_t gCdom[5], gC1[4][6], gC2[4][6], gHdom[4], gH[3][6];
+  flt_t gDom[5+4];  // gCdom followed by gHdom, packed at the end of get_param()
+  flt_t gVal[(4+4+3)*6];  // rows of gC1, then gC2, then gH (6 coefficients per row)
+  flt_t pCCdom[2][2], pCHdom[2][2], pCC[4][4][16], pCH[4][4][16];
+  flt_t piCCdom[3][2], piCHdom[3][2], piHHdom[3][2];
+  acc_t piCC[4][4][9][64], piCH[4][4][9][64], piHH[4][4][9][64];
+  flt_t Tijdom[3][2];
+  acc_t Tijc[4][4][9][64];
+
+  // spline knot values
+  // (f and its dfdx/dfdy/dfdz tables; the *Spline() helpers return them at integer arguments)
+  flt_t PCCf[5][5], PCCdfdx[5][5], PCCdfdy[5][5], PCHf[5][5];
+  flt_t PCHdfdx[5][5], PCHdfdy[5][5];
+  flt_t piCCf[5][5][11], piCCdfdx[5][5][11];
+  flt_t piCCdfdy[5][5][11], piCCdfdz[5][5][11];
+  flt_t piCHf[5][5][11], piCHdfdx[5][5][11];
+  flt_t piCHdfdy[5][5][11], piCHdfdz[5][5][11];
+  flt_t piHHf[5][5][11], piHHdfdx[5][5][11];
+  flt_t piHHdfdy[5][5][11], piHHdfdz[5][5][11];
+  flt_t Tf[5][5][10], Tdfdx[5][5][10], Tdfdy[5][5][10], Tdfdz[5][5][10];
+};
+
+namespace {
+// Flat neighbor list; frebo_pij() reads atom i as entries[offset[i]] .. entries[offset[i]+num[i]-1]
+struct NeighListAIREBO {
+  int * num; /* num_all; neighbor count of each atom */
+  int * num_half; /* num_all */
+  int * offset; /* num_all; start of each atom's neighbors inside entries */
+  int * entries; /* num_all * num_neighs_per_atom; neighbor atom indices */
+};
+// One packed atom; eval() casts its ATOM_T buffer to this, and map[x[i].w] gives the element
+template<typename flt_t>
+struct AtomAIREBOT {
+  flt_t x, y, z;
+  int w;
+};
+// One force-buffer entry; eval() casts its FORCE_T buffer to this layout
+template<typename acc_t>
+struct ResultForceT {
+  acc_t x, y, z, w;
+};
+// Argument bundle handed to the aut_* kernels; eval() fills one instance per thread
+template<typename flt_t, typename acc_t>
+struct KernelArgsAIREBOT {
+  int num_local;
+  int num_all;
+  int num_neighs_per_atom;
+  int num_types;
+  int frebo_from_atom, frebo_to_atom;  // per-thread atom range, set in eval()
+  int neigh_from_atom, neigh_to_atom;  // per-thread range over num_all, set in eval()
+  int rebuild_flag;  // eval() sets this to (neighbor->ago == 0)
+  flt_t skin;
+  struct NeighListAIREBO neigh_lmp;  // views of the LAMMPS neighbor list arrays
+  struct NeighListAIREBO neigh_rebo;  // views of the REBO_* arrays owned by the pair style
+  PairAIREBOIntelParam<flt_t,acc_t> params;  // held by value, so each instance carries a full copy
+  struct AtomAIREBOT<flt_t> * x; /* num_all */
+  int * tag; /* num_all */
+  flt_t * nC, * nH; /* num_all */
+  int * map; /* num_types+1 */
+  struct ResultForceT<acc_t> * result_f; /* num_all */
+  acc_t result_eng;  // zeroed in eval() before the kernels run, then added to oevdwl
+};
+// Kernel entry points, declared here and called from eval()
+template<typename flt_t, typename acc_t>
+void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag);
+template<typename flt_t, typename acc_t>
+void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka);
+template<typename flt_t, typename acc_t>
+void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag);
+
+}
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+/* ---------------------------------------------------------------------- */
+// Construct on top of PairAIREBO, tag the style with the INTEL suffix, null the extra pointers
+PairAIREBOIntel::PairAIREBOIntel(LAMMPS *lmp) : PairAIREBO(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  REBO_cnumneigh = NULL;  // these three arrays are allocated in compute() once nmax is known
+  REBO_num_skin = NULL;
+  REBO_list_data = NULL;
+  fix = NULL;  // looked up in init_style()
+}
+
+/* ---------------------------------------------------------------------- */
+// Release the three list arrays this class adds; the other REBO arrays are not freed here
+PairAIREBOIntel::~PairAIREBOIntel()
+{
+  memory->destroy(REBO_cnumneigh);
+  memory->destroy(REBO_num_skin);
+  memory->destroy(REBO_list_data);
+}
+
+/* ---------------------------------------------------------------------- */
+// Run the base-class setup, then bind to the 'package intel' fix and pack cutoffs per precision
+void PairAIREBOIntel::init_style()
+{
+  PairAIREBO::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;  // flags the most recent neighbor request
+  // NOTE(review): presumably that request is the one PairAIREBO::init_style() just added - confirm
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+  // pick the buffer set matching the configured precision and request atom tags in it
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    pack_force_const(fix->get_mixed_buffers());
+    fix->get_mixed_buffers()->need_tag(1);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    pack_force_const(fix->get_double_buffers());
+    fix->get_double_buffers()->need_tag(1);
+  } else {
+    pack_force_const(fix->get_single_buffers());
+    fix->get_single_buffers()->need_tag(1);
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_noghost())
+    error->all(FLERR,"The 'ghost no' option cannot be used with airebo/intel.");
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+// Typed calloc wrapper: zero-filled storage for `size` objects of T (result of calloc unchecked)
+template<typename T>
+T * calloc_it(size_t size) {
+  return static_cast<T*>(calloc(size, sizeof(T)));
+}
+// Pair-style entry point: forward to the templated compute() matching the fix's precision mode
+void PairAIREBOIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision()==FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers());
+  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers());
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers());
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;  // cleared after the templated compute() has read it
+}
+
+/* ---------------------------------------------------------------------- */
+// Build a PairAIREBOIntelParam by copying this style's double-precision constants element-wise
+template<class flt_t, class acc_t>
+PairAIREBOIntelParam<flt_t,acc_t> PairAIREBOIntel::get_param()
+{
+  PairAIREBOIntelParam<flt_t,acc_t> fc;
+  // A: copy member a as a flat double array into flt_t; A0: same but read from a[0]; B: into acc_t
+#define A(a)                                                           \
+  for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) {           \
+    reinterpret_cast<flt_t*>(&fc.a)[i] =			       \
+      reinterpret_cast<double*>(&this->a)[i];			       \
+  }
+#define A0(a)								\
+  for (int i = 0; i < sizeof(fc.a)/sizeof(flt_t); i++) {		\
+    reinterpret_cast<flt_t*>(&fc.a)[i] =				\
+      reinterpret_cast<double*>(this->a[0])[i];				\
+  }
+#define B(a)								\
+  for (int i = 0; i < sizeof(this->a)/sizeof(double); i++) {		\
+    reinterpret_cast<acc_t*>(&fc.a)[i] =				\
+      reinterpret_cast<double*>(&this->a)[i];				\
+  }
+  // every macro use below expands to one complete copy loop; the ';' separators are empty statements
+  A(cutlj) A(cutljrebosq) A(cut3rebo) A(sigmin);
+  A(sigcut) A0(cutljsq) A0(lj1) A0(lj2) A0(lj3);  // A0 assumes a[0] points at contiguous doubles
+  A0(lj4) A(smin) A(Nmin) A(Nmax) A(NCmin) A(NCmax) A(thmin) A(thmax);
+  A(rcmin) A(rcmax) A(rcmaxsq) A(rcmaxp) A(Q) A(alpha) A(A) A(rho) A(BIJc);
+  A(Beta) A(rcLJmin) A(rcLJmax) A(rcLJmaxsq) A(bLJmin) A(bLJmax) A(epsilon);
+  A(sigma) A(epsilonT) A(gCdom) A(gC1) A(gC2) A(gHdom) A(gH) A(pCCdom);
+  A(pCHdom) A(pCC) A(pCH) A(piCCdom) A(piCHdom) A(piHHdom) B(piCC);
+  B(piCH) B(piHH) A(Tijdom) B(Tijc) A(PCCf) A(PCCdfdx) A(PCCdfdy) A(PCHf);
+  A(PCHdfdx) A(PCHdfdy) A(piCCf) A(piCCdfdx) A(piCCdfdy) A(piCCdfdz);
+  A(piCHf) A(piCHdfdx) A(piCHdfdy) A(piCHdfdz) A(piHHf) A(piHHdfdx);
+  A(piHHdfdy) A(piHHdfdz) A(Tf) A(Tdfdx) A(Tdfdy) A(Tdfdz);
+
+#undef A
+#undef A0
+#undef B
+  for (int i = 0; i < 5; i++) fc.gDom[i] = fc.gCdom[i];  // gDom = gCdom ...
+  for (int i = 0; i < 4; i++) fc.gDom[5+i] = fc.gHdom[i];  // ... followed by gHdom
+  for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) 
+				fc.gVal[6*i+j] = fc.gC1[i][j];  // gVal rows 0-3: gC1
+  for (int i = 0; i < 4; i++) for (int j = 0; j < 6; j++) 
+				fc.gVal[4*6+6*i+j] = fc.gC2[i][j];  // gVal rows 4-7: gC2
+  for (int i = 0; i < 3; i++) for (int j = 0; j < 6; j++) 
+				fc.gVal[8*6+6*i+j] = fc.gH[i][j];  // gVal rows 8-10: gH
+
+  return fc;
+}
+
+/* ---------------------------------------------------------------------- */
+// Per-precision driver: repack atoms if needed, grow the REBO work arrays, then run eval() passes
+template<class flt_t, class acc_t>
+void PairAIREBOIntel::compute(
+    int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers
+) {
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = vflag_atom = 0;
+  pvector[0] = pvector[1] = pvector[2] = 0.0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    // pack with all threads only when more than INTEL_HTHREADS are available, else serially
+    int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (atom->nmax > maxlocal) {  // per-atom work arrays are too small: free and re-create them
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (maxlocal > 0 && _cop >= 0) {  // same guard macro as every other _cop use in this file
+      int * const REBO_numneigh = this->REBO_numneigh;
+      int * const REBO_num_skin = this->REBO_num_skin;
+      int * const REBO_cnumneigh = this->REBO_cnumneigh;
+      int * const REBO_list_data = this->REBO_list_data;
+      double * const nC = this->nC;
+      double * const nH = this->nH;
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(REBO_numneigh: alloc_if(0) free_if(1)) \
+        nocopy(REBO_cnumneigh: alloc_if(0) free_if(1)) \
+        nocopy(REBO_num_skin: alloc_if(0) free_if(1)) \
+        nocopy(REBO_list_data: alloc_if(0) free_if(1)) \
+        nocopy(nH: alloc_if(0) free_if(1)) \
+        nocopy(nC: alloc_if(0) free_if(1))
+    }
+    #endif
+    maxlocal = atom->nmax;
+    memory->destroy(REBO_numneigh);
+    memory->destroy(REBO_cnumneigh);
+    memory->destroy(REBO_num_skin);  // was missing: the previous array leaked on every regrow
+    memory->destroy(REBO_list_data);
+    memory->sfree(REBO_firstneigh);
+    memory->destroy(nC);
+    memory->destroy(nH);
+    memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh");
+    memory->create(REBO_cnumneigh,maxlocal,"AIREBO:cnumneigh");
+    memory->create(REBO_num_skin,maxlocal,"AIREBO:num_skin");
+    int max_nbors = buffers->get_max_nbors();
+    memory->create(REBO_list_data,maxlocal * max_nbors,"AIREBO:list_data");
+    REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *),
+                                               "AIREBO:firstneigh");
+    memory->create(nC,maxlocal,"AIREBO:nC");
+    memory->create(nH,maxlocal,"AIREBO:nH");
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_cop >= 0) {  // allocate matching storage on the coprocessor without copying data
+      int * const REBO_numneigh = this->REBO_numneigh;
+      int * const REBO_num_skin = this->REBO_num_skin;
+      int * const REBO_cnumneigh = this->REBO_cnumneigh;
+      int * const REBO_list_data = this->REBO_list_data;
+      double * const nC = this->nC;
+      double * const nH = this->nH;
+      const int mnml = max_nbors * maxlocal;
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(REBO_numneigh: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_cnumneigh:length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_num_skin: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(REBO_list_data:length(mnml) alloc_if(1) free_if(0)) \
+        nocopy(nH: length(maxlocal) alloc_if(1) free_if(0)) \
+        nocopy(nC: length(maxlocal) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+
+  if (evflag || vflag_fdotr) {
+    int ovflag = 0;  // virial mode passed to eval(): 2 for fdotr, 1 for plain vflag, else 0
+    if (vflag_fdotr) ovflag = 2;
+    else if (vflag) ovflag = 1;
+    if (eflag) {
+      eval<1,1>(1, ovflag, buffers, 0, offload_end);  // offload pass over [0, offload_end)
+      eval<1,1>(0, ovflag, buffers, host_start, inum);  // host pass over [host_start, inum)
+    } else {
+      eval<1,0>(1, ovflag, buffers, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, host_start, inum);
+    }
+  } else {
+    eval<0,0>(1, 0, buffers, 0, offload_end);
+    eval<0,0>(0, 0, buffers, host_start, inum);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// Run the AIREBO kernels for atoms [astart, aend); offload != 0 selects the coprocessor pass
+template<int EVFLAG, int EFLAG, class flt_t, class acc_t>
+void PairAIREBOIntel::eval(
+    const int offload, const int vflag,
+    IntelBuffers<flt_t,acc_t> * buffers,
+    const int astart, const int aend
+) {
+  const int inum = aend - astart;
+  if (inum == 0) {
+    return;  // no atoms assigned to this pass
+  }
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+  // raw views of the packed atoms and of the LAMMPS neighbor list storage
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const numneighhalf = buffers->get_atombin();
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  int * const tag = atom->tag;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, 1 /*NEWTON_PAIR*/, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  const double skin = neighbor->skin;
+  const int max_nbor = buffers->get_max_nbors();
+  const PairAIREBOIntelParam<flt_t,acc_t> param = get_param<flt_t,acc_t>();
+  // param is rebuilt from the class members on every call
+  // offload here
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  int * const REBO_numneigh = this->REBO_numneigh;
+  int * const REBO_num_skin = this->REBO_num_skin;
+  int * const REBO_cnumneigh = this->REBO_cnumneigh;
+  int * const REBO_list_data = this->REBO_list_data;
+  double * const nC = this->nC;
+  double * const nH = this->nH;
+  const int torflag = this->torflag;
+  const int ljflag = this->ljflag;
+  const int morseflag = this->morseflag;
+  int * const map = this->map;
+  // in offload builds these locals shadow the members; otherwise the names below are the members
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+
+  #pragma offload target(mic:_cop) if(offload) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \
+    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    in(param,skin,max_nbor) \
+    in(tag: length(0) alloc_if(0) free_if(0)) \
+    in(torflag, ljflag, morseflag, ago) \
+    in(nC: length(0) alloc_if(0) free_if(0)) \
+    in(nH: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_numneigh: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_cnumneigh: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_num_skin: length(0) alloc_if(0) free_if(0)) \
+    in(REBO_list_data: length(0) alloc_if(0) free_if(0)) \
+    in(map: length(0) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(1 /*NEWTON_PAIR*/, separate_flag, nlocal, nall,
+			      f_stride, x, 0/*q*/);
+
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+    // NOTE(review): the accumulators stay unset when EVFLAG is 0 but still appear in the reduction
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel \
+      shared(f_start,f_stride,nlocal,nall,minlocal)	\
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+      // second per-thread range, over all nall atoms, used for the REBO neighbor-list pass
+      int neigh_iifrom, neigh_iito;
+      IP_PRE_omp_range(neigh_iifrom, neigh_iito, tid, nall, nthreads);
+      // this thread's slice of the force buffer, zeroed before the kernels accumulate into it
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      // bundle everything the kernels need; ranges are set to defaults first, per-thread below
+      KernelArgsAIREBOT<flt_t,acc_t> args;
+      args.num_local = nlocal;
+      args.num_all = nall;
+      args.num_neighs_per_atom = max_nbor;
+      args.num_types = ntypes;
+      args.frebo_from_atom = 0;
+      args.frebo_to_atom = args.num_local;
+      args.neigh_from_atom = 0;
+      args.neigh_to_atom = args.num_all;
+      args.rebuild_flag = ago == 0;
+      args.skin = skin;
+      args.neigh_lmp.num = const_cast<int*>(numneigh);
+      args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
+      args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
+      args.neigh_lmp.entries = const_cast<int*>(firstneigh);
+      args.neigh_rebo.num = REBO_numneigh;
+      args.neigh_rebo.num_half = REBO_num_skin;
+      args.neigh_rebo.offset = REBO_cnumneigh;
+      args.neigh_rebo.entries = REBO_list_data;
+      args.params = param;  // by-value copy of the whole parameter struct into this thread's args
+      args.tag = tag;
+      args.nC = reinterpret_cast<flt_t*>(nC);  // the nC/nH storage is reinterpreted as flt_t
+      args.nH = reinterpret_cast<flt_t*>(nH);
+      args.map = map;
+      args.result_eng = 0;
+      args.x = (AtomAIREBOT<flt_t>*) x;
+      // replace the default ranges with this thread's share of the work
+      args.result_f = (ResultForceT<acc_t> *) f;
+      args.neigh_from_atom = neigh_iifrom;
+      args.neigh_to_atom = neigh_iito;
+      args.frebo_from_atom = iifrom;
+      args.frebo_to_atom = iito;
+
+      aut_rebo_neigh(&args);
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      aut_frebo(&args, torflag);  // runs only after every thread has finished aut_rebo_neigh
+      if (ljflag) aut_lennard_jones(&args, morseflag);
+
+      oevdwl += args.result_eng;
+
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, x,
+                              offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
+    } // end of omp parallel region
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = oecoul;  // never accumulated in this function, so it stays 0
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+  // hand the force buffer (and the energy/virial totals when EVFLAG is set) to the fix
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+// Fill the buffers' neighbor cutoff tables for every type pair; offload builds also transfer them
+template<class flt_t, class acc_t>
+void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
+  int tp1 = atom->ntypes + 1;  // table dimension: types are indexed from 1 in the loops below
+
+  buffers->set_ntypes(tp1,1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+  flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i, j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+        cut = cutghost[i][j] + neighbor->skin;  // cut is reused here for the ghost cutoff
+        cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
+      }
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * ocutneighsq = cutneighsq[0];
+  size_t VL = 512 / 8 / sizeof(flt_t);  // NOTE(review): VL is not referenced again in this function
+  int ntypes = tp1;  // NOTE(review): not referenced again in this function either
+  int tp1sq = tp1 * tp1;
+  // TODO the lifecycle of "map" is currently not 100% correct
+  // it might not be freed if this method is called more than once
+  int * map = this->map;
+  #pragma offload_transfer target(mic:_cop) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(map: length(tp1) alloc_if(1) free_if(0))
+  #endif
+
+}
+
+/* ----------------------------------------------------------------------
+    Implementation
+   ---------------------------------------------------------------------- */
+
+namespace {
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(push, target(mic))
+#endif
+// float/double overloads that forward to the matching C math function (sqrt vs sqrtf, ...)
+namespace overloaded {
+  double sqrt(double a) { return ::sqrt(a); }
+  float sqrt(float a) { return ::sqrtf(a); }
+  double sin(double a) { return ::sin(a); }
+  float sin(float a) { return ::sinf(a); }
+  double cos(double a) { return ::cos(a); }
+  float cos(float a) { return ::cosf(a); }
+  double exp(double a) { return ::exp(a); }
+  float exp(float a) { return ::expf(a); }
+  double pow(double a, double b) { return ::pow(a, b); }
+  float pow(float a, float b) { return ::powf(a, b); }
+}
+
+/* ----------------------------------------------------------------------
+    Scalar AIREBO implementation, standalone, with massive code reuse
+    compared to original code.
+   ---------------------------------------------------------------------- */
+
+#define M_PI           3.14159265358979323846  /* pi */
+
+#define CARBON 0
+#define HYDROGEN 1
+#define TOL 1.0e-9
+// Smaller of a and b by a single comparison; returns b whenever a < b is false
+template<typename T>
+inline T fmin_nonan(T a, T b) {
+  return a < b ? a : b;
+}
+template<typename T>
+inline T fmax_nonan(T a, T b) {
+  return a > b ? a : b;  // larger of a and b; returns b whenever a > b is false
+}
+// Cosine switch on t = (r-lo)/(hi-lo): 1 for t <= 0, 0 for t >= 1; *del (if non-NULL) gets d/dr
+template<typename flt_t>
+inline flt_t Sp(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    t *= static_cast<flt_t>(M_PI);  // from here on t holds pi * (r-lo)/(hi-lo)
+    if (del) *del = static_cast<flt_t>(-0.5 * M_PI)
+                  * overloaded::sin(t) / (hi - lo);
+    return static_cast<flt_t>(0.5) * (1 + overloaded::cos(t));
+  }
+}
+// Cubic switch 1 - t^2 (3 - 2t) on t = (r-lo)/(hi-lo), clamped like Sp(); *del gets d/dr
+template<typename flt_t>
+inline flt_t Sp2(flt_t r, flt_t lo, flt_t hi, flt_t * del) {
+  flt_t t = (r - lo) / (hi - lo);
+  if (t <= 0) {
+    if (del) *del = 0;
+    return 1;
+  } else if (t >= 1) {
+    if (del) *del = 0;
+    return 0;
+  } else {
+    if (del) *del = 6 * (t * t - t) / (hi - lo);  // d/dt of the cubic, divided by (hi - lo)
+    return 1 - t * t * (3 - 2 * t);
+  }
+}
+// Horner evaluation of sum_i coeffs[i] * x^i (i < n); *deriv receives d/dx; callers use n = 4 or 6
+template<typename flt_t>
+inline flt_t eval_poly_lin(int n, flt_t * coeffs, flt_t x, flt_t * deriv) {
+  flt_t result = coeffs[n - 1];
+  *deriv = coeffs[n - 1] * (n - 1);
+  for (int i = n - 2; i > 0; i--) {
+    result = coeffs[i] + x * result;
+    *deriv = coeffs[i] * i + x * (*deriv);
+  }
+  result = coeffs[0] + x * result;  // the constant term contributes to the value only
+  return result;
+}
+// Piecewise polynomial g(cos, N) for element itype; writes dg/dcos to *dgdc and dg/dN to *dgdN
+template<typename flt_t, typename acc_t>
+inline flt_t gSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, flt_t cos, flt_t N, flt_t * dgdc, flt_t * dgdN) {
+  flt_t NCmin = ka->params.NCmin;
+  flt_t NCmax = ka->params.NCmax;
+  int index = 0;
+  flt_t * gDom = NULL;
+  int nDom = 0;
+  int offs = 0;  // start in gVal: 0 selects the gC1 rows, 4*6 the gC2 rows, 8*6 the gH rows
+  if (itype == 0) {
+    nDom = 4;
+    gDom = &ka->params.gCdom[0];
+    if (N > NCmin) offs = 4 * 6;
+  } else {
+    nDom = 3;
+    gDom = &ka->params.gHdom[0];
+    offs = 8 * 6;
+  }
+  cos = fmax_nonan(gDom[0], fmin_nonan(gDom[nDom], cos));  // clamp into [gDom[0], gDom[nDom]]
+  int i;
+  for (i = 0; i < nDom; i++) {
+    if (cos >= gDom[i] && cos <= gDom[i + 1]) {
+      index = i;  // no break: on a shared interval boundary the later interval wins
+    }
+  }
+  flt_t g = eval_poly_lin(6, &ka->params.gVal[offs+index*6], cos, dgdc);
+  *dgdN = 0;
+  if (itype == 0 && N > NCmin && N < NCmax) {
+    flt_t dg1;  // between NCmin and NCmax: blend the gC2 value (g) with the gC1 value (g1)
+    flt_t g1 = eval_poly_lin(6, &ka->params.gVal[index*6], cos, &dg1);
+    flt_t dS;
+    flt_t cut = Sp(N, NCmin, NCmax, &dS);
+    *dgdN = dS * (g1 - g);
+    g = g + cut * (g1 - g);
+    *dgdc = *dgdc + cut * (dg1 - *dgdc);
+  }
+  return g;
+}
+// Evaluate sum coeffs[n*i + j] * x^i * y^j (i, j < n); deriv[0] = d/dx, deriv[1] = d/dy
+template<typename flt_t>
+inline flt_t eval_poly_bi(int n, flt_t * coeffs, flt_t x, flt_t y, 
+			  flt_t * deriv) {
+  flt_t dy;
+  flt_t vy = eval_poly_lin(n, &coeffs[n * (n - 1)], y, &dy);
+  flt_t result = vy;
+  deriv[0] = vy * (n - 1);
+  deriv[1] = dy;
+  for (int i = n - 2; i > 0; i--) {  // Horner in x; each coefficient is a polynomial in y
+    vy = eval_poly_lin(n, &coeffs[n * i], y, &dy);
+    result = vy + x * result;
+    deriv[0] = vy * i + x * deriv[0];
+    deriv[1] = dy + x * deriv[1];
+  }
+  result = eval_poly_lin(n, &coeffs[0], y, &dy) + x * result;
+  deriv[1] = dy + x * deriv[1];
+  return result;
+}
+// Evaluate sum coeffs[n*n*i + n*j + k] * x^i y^j z^k (i, j, k < n); deriv[0..2] = d/dx, d/dy, d/dz
+template<typename flt_t>
+inline flt_t eval_poly_tri(int n, flt_t * coeffs, flt_t x, flt_t y, flt_t z, 
+			   flt_t * deriv) {
+  flt_t dyz[2];
+  flt_t vyz = eval_poly_bi(n, &coeffs[n * n * (n - 1)], y, z, &dyz[0]);
+  flt_t result = vyz;
+  deriv[0] = vyz * (n - 1);
+  deriv[1] = dyz[0];
+  deriv[2] = dyz[1];
+  for (int i = n - 2; i > 0; i--) {  // Horner in x; each coefficient is a polynomial in (y, z)
+    vyz = eval_poly_bi(n, &coeffs[n * n * i], y, z, &dyz[0]);
+    result = vyz + x * result;
+    deriv[0] = vyz * i + x * deriv[0];
+    deriv[1] = dyz[0] + x * deriv[1];
+    deriv[2] = dyz[1] + x * deriv[2];
+  }
+  result = eval_poly_bi(n, &coeffs[0], y, z, &dyz[0]) + x * result;
+  deriv[1] = dyz[0] + x * deriv[1];
+  deriv[2] = dyz[1] + x * deriv[2];
+  return result;
+}
+// P_ij(NC, NH) with derivatives in dN[0..1]; identically zero when atom i is hydrogen
+template<typename flt_t, typename acc_t>
+inline flt_t PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, 
+		       int jtype, flt_t NC, flt_t NH, flt_t * dN) {
+  dN[0] = 0.0;
+  dN[1] = 0.0;
+  if (itype == HYDROGEN) return 0;
+  flt_t *pCJdom = jtype == CARBON ? &ka->params.pCCdom[0][0] : 
+    &ka->params.pCHdom[0][0];
+  NC = fmax_nonan(pCJdom[0], fmin_nonan(pCJdom[1], NC));  // clamp NC to [pCJdom[0], pCJdom[1]]
+  NH = fmax_nonan(pCJdom[2], fmin_nonan(pCJdom[3], NH));  // clamp NH to [pCJdom[2], pCJdom[3]]
+  int nC = floor(NC);
+  int nH = floor(NH);
+  #define PijSelect(a, b) (jtype == CARBON ? ka->params.a : ka->params.b)
+  if (fabs(NC - nC) < TOL && fabs(NH - nH) < TOL) {  // on a knot: return the tabulated values
+    dN[0] = PijSelect(PCCdfdx, PCHdfdx)[nC][nH];
+    dN[1] = PijSelect(PCCdfdy, PCHdfdy)[nC][nH];
+    return PijSelect(PCCf, PCHf)[nC][nH];
+  }
+  if (NC == pCJdom[1]) nC -= 1;  // at the upper domain edge use the last cell
+  if (NH == pCJdom[3]) nH -= 1;
+  return eval_poly_bi(4, &PijSelect(pCC, pCH)[nC][nH][0], NC, NH, dN);
+  #undef PijSelect
+}
+// T_ij(Nij, Nji, Nijconj) with the three partial derivatives in dN3; arguments clamped to Tijdom
+template<typename flt_t, typename acc_t>
+inline flt_t TijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t Nij, 
+    flt_t Nji, flt_t Nijconj, acc_t * dN3) {
+  flt_t * Tijdom = &ka->params.Tijdom[0][0];  // flat view: {lo, hi} pairs for the three arguments
+  Nij = fmax_nonan(Tijdom[0], fmin_nonan(Tijdom[1], Nij));
+  Nji = fmax_nonan(Tijdom[2], fmin_nonan(Tijdom[3], Nji));
+  Nijconj = fmax_nonan(Tijdom[4], fmin_nonan(Tijdom[5], Nijconj));
+  int nij = floor(Nij);
+  int nji = floor(Nji);
+  int nijconj = floor(Nijconj);
+  if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < 
+			  TOL && fabs(Nijconj - nijconj) < TOL) {
+    dN3[0] = ka->params.Tdfdx[nij][nji][nijconj];  // on a knot: return the tabulated values
+    dN3[1] = ka->params.Tdfdy[nij][nji][nijconj];
+    dN3[2] = ka->params.Tdfdz[nij][nji][nijconj];
+    return ka->params.Tf[nij][nji][nijconj];
+  }
+  if (Nij == Tijdom[1]) nij -= 1;  // at an upper domain edge use the last cell
+  if (Nji == Tijdom[3]) nji -= 1;
+  if (Nijconj == Tijdom[5]) nijconj -= 1;
+  return eval_poly_tri<acc_t>(4, &ka->params.Tijc[nij][nji][nijconj][0], Nij, 
+    Nji, Nijconj, dN3);
+}
+// pi^rc(Nij, Nji, Nijconj) for the CC, CH or HH pair given by itype + jtype; derivatives in dN3
+template<typename flt_t, typename acc_t>
+inline flt_t piRCSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, 
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, acc_t * dN3) {
+  const int HH = 2;
+  const int CH = 1;
+  /* const int CC = 0; */
+  int select = itype + jtype;  // 0 = CC, 1 = CH, 2 = HH, since CARBON is 0 and HYDROGEN is 1
+  #define piRCSelect(a, b, c) (select == HH ? ka->params.a : select == CH ? \
+			       ka->params.b : ka->params.c)
+  flt_t * piIJdom = &piRCSelect(piHHdom, piCHdom, piCCdom)[0][0];
+  if (select == HH) {  // HH only: any argument outside the domain resets all three to 0
+    if (Nij < piIJdom[0] || Nij > piIJdom[1] || Nji < piIJdom[2] || 
+	Nji > piIJdom[3] || Nijconj < piIJdom[4] || Nijconj > piIJdom[5]) {
+      Nij = 0;
+      Nji = 0;
+      Nijconj = 0;
+    }
+  }
+  Nij = fmax_nonan(piIJdom[0], fmin_nonan(piIJdom[1], Nij));  // clamp each argument to its domain
+  Nji = fmax_nonan(piIJdom[2], fmin_nonan(piIJdom[3], Nji));
+  Nijconj = fmax_nonan(piIJdom[4], fmin_nonan(piIJdom[5], Nijconj));
+  int nij = floor(Nij);
+  int nji = floor(Nji);
+  int nijconj = floor(Nijconj);
+  if (fabs(Nij - nij) < TOL && fabs(Nji - nji) < 
+			  TOL && fabs(Nijconj - nijconj) < TOL) {
+    dN3[0] = piRCSelect(piHHdfdx, piCHdfdx, piCCdfdx)[nij][nji][nijconj];
+    dN3[1] = piRCSelect(piHHdfdy, piCHdfdy, piCCdfdy)[nij][nji][nijconj];
+    dN3[2] = piRCSelect(piHHdfdz, piCHdfdz, piCCdfdz)[nij][nji][nijconj];
+    return piRCSelect(piHHf, piCHf, piCCf)[nij][nji][nijconj];  // on a knot: tabulated value
+  }
+  if (Nij == piIJdom[1]) nij -= 1;  // at an upper domain edge use the last cell
+  if (Nji == piIJdom[3]) nji -= 1;
+  if (Nijconj == piIJdom[5]) nijconj -= 1;
+  return eval_poly_tri<acc_t>(4, 
+    &piRCSelect(piHH, piCH, piCC)[nij][nji][nijconj][0], Nij, Nji, Nijconj, 
+    dN3);
+  #undef piRCSelect
+}
+
+/*
+ * Implements the p_ij term in airebo, which occurs on 4 different occasions
+ * in the original lammps code.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_pij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j, 
+    flt_t rijx, flt_t rijy, flt_t rijz, flt_t rijmag, flt_t wij, flt_t VA, 
+    flt_t * sum_N, acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  flt_t * nC = ka->nC;
+  flt_t * nH = ka->nH;
+  flt_t x_i = x[i].x;
+  flt_t y_i = x[i].y;
+  flt_t z_i = x[i].z;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  flt_t invrijm = 1 / rijmag;
+  flt_t invrijm2 = invrijm * invrijm;
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  flt_t Nmin = ka->params.Nmin;
+  flt_t Nmax = ka->params.Nmax;
+  flt_t Nij = nC[i] + nH[i] - wij;
+  flt_t NijC = nC[i] - wij * (1 - jtype);
+  flt_t NijH = nH[i] - wij * jtype;
+  flt_t sum_pij = 0;
+  flt_t sum_dpij_dN = 0;
+  flt_t dN2[2] = {0};
+  flt_t pij = 0;
+  *sum_N = 0;
+  int * neighs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int pass;
+  for (pass = 0; pass < 2; pass++) {
+    int kk;
+    int knum = ka->neigh_rebo.num[i];
+    for (kk = 0; kk < knum; kk++) {
+      int k = neighs[kk];
+      if (k == j) continue;
+      flt_t rikx = x_i - x[k].x;
+      flt_t riky = y_i - x[k].y;
+      flt_t rikz = z_i - x[k].z;
+      int ktype = map[x[k].w];
+      flt_t rikmag = overloaded::sqrt(rikx * rikx + riky * riky + rikz * rikz);
+      flt_t rho_k = ka->params.rho[ktype][1];
+      flt_t rho_j = ka->params.rho[jtype][1];
+      flt_t lamdajik = 4 * itype * ((rho_k - rikmag) - (rho_j - rijmag));
+      flt_t ex_lam = exp(lamdajik);
+      flt_t rcminik = ka->params.rcmin[itype][ktype];
+      flt_t rcmaxik = ka->params.rcmax[itype][ktype];
+      flt_t dwik;
+      flt_t wik = Sp(rikmag, rcminik, rcmaxik, &dwik);
+      flt_t Nki = nC[k] + nH[k] - wik;
+      flt_t cosjik = (rijx * rikx + rijy * riky + rijz * rikz) / 
+	(rijmag * rikmag);
+      cosjik = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cosjik));
+      flt_t dgdc, dgdN;
+      flt_t g = gSpline(ka, itype, cosjik, Nij, &dgdc, &dgdN);
+      if (pass == 0) {
+        sum_pij += wik * g * ex_lam;
+        sum_dpij_dN += wik * dgdN * ex_lam;
+        flt_t cutN = Sp<flt_t>(Nki, Nmin, Nmax, NULL);
+        *sum_N += (1 - ktype) * wik * cutN;
+      } else {
+        flt_t tmp = -0.5 * pij * pij * pij;
+        flt_t invrikm = 1 / rikmag;
+        flt_t rjkx = rikx - rijx;
+        flt_t rjky = riky - rijy;
+        flt_t rjkz = rikz - rijz;
+        flt_t rjkmag = sqrt(rjkx * rjkx + rjky * rjky + rjkz * rjkz);
+        flt_t rijrik = 2 * rijmag * rikmag;
+        flt_t rr = rijmag * rijmag - rikmag * rikmag;
+        flt_t dctdjk = -2 / rijrik;
+        flt_t dctdik = (-rr + rjkmag * rjkmag) / (rijrik * rikmag * rikmag);
+        flt_t dctdij = (rr + rjkmag * rjkmag) / (rijrik * rijmag * rijmag);
+
+        acc_t fi[3], fj[3], fk[3];
+        flt_t pref = 0.5 * VA * tmp;
+        flt_t tmp20 = pref * wik * dgdc * ex_lam;
+        fj[0] = fj[1] = fj[2] = 0;
+        fi[0] = -tmp20 * dctdik * rikx;
+        fi[1] = -tmp20 * dctdik * riky;
+        fi[2] = -tmp20 * dctdik * rikz;
+        fk[0] =  tmp20 * dctdik * rikx;
+        fk[1] =  tmp20 * dctdik * riky;
+        fk[2] =  tmp20 * dctdik * rikz;
+
+        fij[0] += -tmp20 * dctdij * rijx;
+        fij[1] += -tmp20 * dctdij * rijy;
+        fij[2] += -tmp20 * dctdij * rijz;
+
+        fi[0] += -tmp20 * dctdjk * rjkx;
+        fi[1] += -tmp20 * dctdjk * rjky;
+        fi[2] += -tmp20 * dctdjk * rjkz;
+        fk[0] +=  tmp20 * dctdjk * rjkx;
+        fk[1] +=  tmp20 * dctdjk * rjky;
+        fk[2] +=  tmp20 * dctdjk * rjkz;
+        fij[0] -= -tmp20 * dctdjk * rjkx;
+        fij[1] -= -tmp20 * dctdjk * rjky;
+        fij[2] -= -tmp20 * dctdjk * rjkz;
+
+        flt_t tmp21 = pref * (wik * g * ex_lam * 4 * itype);
+        fij[0] -= 1 * tmp21 * rijx * invrijm;
+        fij[1] -= 1 * tmp21 * rijy * invrijm;
+        fij[2] -= 1 * tmp21 * rijz * invrijm;
+        fi[0] -= tmp21 * (-rikx * invrikm);
+        fi[1] -= tmp21 * (-riky * invrikm);
+        fi[2] -= tmp21 * (-rikz * invrikm);
+        fk[0] -= tmp21 * (rikx * invrikm);
+        fk[1] -= tmp21 * (riky * invrikm);
+        fk[2] -= tmp21 * (rikz * invrikm);
+
+        // coordination forces
+
+        // dwik forces
+        flt_t tmp22 = pref * dwik * g * ex_lam * invrikm;
+        fi[0] -= tmp22 * rikx;
+        fi[1] -= tmp22 * riky;
+        fi[2] -= tmp22 * rikz;
+        fk[0] += tmp22 * rikx;
+        fk[1] += tmp22 * riky;
+        fk[2] += tmp22 * rikz;
+
+        // PIJ forces
+        flt_t tmp23 = pref * dN2[ktype] * dwik * invrikm;
+        fi[0] -= tmp23 * rikx;
+        fi[1] -= tmp23 * riky;
+        fi[2] -= tmp23 * rikz;
+        fk[0] += tmp23 * rikx;
+        fk[1] += tmp23 * riky;
+        fk[2] += tmp23 * rikz;
+
+        // dgdN forces
+        flt_t tmp24 = pref * sum_dpij_dN * dwik * invrikm;
+        fi[0] -= tmp24 * rikx;
+        fi[1] -= tmp24 * riky;
+        fi[2] -= tmp24 * rikz;
+        fk[0] += tmp24 * rikx;
+        fk[1] += tmp24 * riky;
+        fk[2] += tmp24 * rikz;
+
+        result_f[i].x += fi[0];
+        result_f[i].y += fi[1];
+        result_f[i].z += fi[2];
+        result_f[j].x += fj[0];
+        result_f[j].y += fj[1];
+        result_f[j].z += fj[2];
+        result_f[k].x += fk[0];
+        result_f[k].y += fk[1];
+        result_f[k].z += fk[2];
+      }
+    }
+    if (pass == 0) {
+      flt_t PijS = PijSpline(ka, itype, jtype, NijC, NijH, dN2);
+      pij = 1 / overloaded::sqrt(1 + sum_pij + PijS);
+    }
+  }
+  return pij;
+}
+
+// Calls piRCSpline and copies its three acc_t derivative outputs into the
+// flt_t array dN3; the spline value itself is returned unchanged.
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_pi_rc(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
+  acc_t grad[3] = {0};
+  flt_t value = piRCSpline(ka, itype, jtype, Nij, Nji, Nijconj, grad);
+  for (int d = 0; d < 3; d++) dN3[d] = grad[d];
+  return value;
+}
+
+/*
+ * Wraps TijSpline. If either atom type is HYDROGEN the spline is not
+ * evaluated: the return value and all three entries of dN3 are zero.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+    int jtype, flt_t Nij, flt_t Nji, flt_t Nijconj, flt_t * dN3) {
+  acc_t grad[3] = {0};
+  flt_t value = 0;
+  const bool has_hydrogen = (itype == HYDROGEN) || (jtype == HYDROGEN);
+  if (!has_hydrogen) value = TijSpline(ka, Nij, Nji, Nijconj, grad);
+  for (int d = 0; d < 3; d++) dN3[d] = grad[d];
+  return value;
+}
+
+/*
+ * Scalar evaluation of the dihedral sum used in pi^dh_ij. Over all REBO
+ * neighbors k of i (k != j) and l of j (l != i, l != k) it accumulates
+ *   (1 - cos^2(omega_kijl)) * w_ik * w_jl * (1 - tsp_jik) * (1 - tsp_ijl)
+ * and returns the sum. When VA != 0 the VA-scaled forces are also added to
+ * ka->result_f and fij. Occurs in both bondorder and bondorderLJ.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t frebo_sum_omega(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int j,
+ flt_t r23x, flt_t r23y, flt_t r23z, flt_t r23mag, flt_t VA, acc_t fij[3]) {
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  acc_t sum_omega = 0;
+  int a2 = i;  // atoms along the dihedral are numbered 1-2-3-4 = k-i-j-l
+  int a3 = j;
+  flt_t r32x = - r23x;
+  flt_t r32y = - r23y;
+  flt_t r32z = - r23z;
+  int * map = ka->map;
+  AtomAIREBOT<flt_t> * x = ka->x;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  int * neighs_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int * neighs_j = ka->neigh_rebo.entries + ka->neigh_rebo.offset[j];
+  int num_i = ka->neigh_rebo.num[i];
+  int num_j = ka->neigh_rebo.num[j];
+  int kk;
+  for (kk = 0; kk < num_i; kk++) {
+    int k = neighs_i[kk];
+    if (k == j) continue;
+    int a1 = k;
+    int ktype = map[x[k].w];
+    flt_t r21x = x[a2].x - x[a1].x;
+    flt_t r21y = x[a2].y - x[a1].y;
+    flt_t r21z = x[a2].z - x[a1].z;
+    flt_t r21mag = overloaded::sqrt(r21x * r21x + r21y * r21y + r21z * r21z);
+    flt_t cos321 = (r23x * r21x + r23y * r21y + r23z * r21z) /
+      (r23mag * r21mag);
+    cos321 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos321));
+    flt_t sin321 = overloaded::sqrt(1 - cos321 * cos321);
+    if (sin321 == 0) continue;  // bonds 2-1 and 2-3 are collinear
+    flt_t sink2i = 1 / (sin321 * sin321);
+    flt_t rik2i = 1 / (r21mag * r21mag);
+    flt_t rr = r23mag * r23mag - r21mag * r21mag;
+    flt_t r31x = r21x - r23x;
+    flt_t r31y = r21y - r23y;
+    flt_t r31z = r21z - r23z;
+    flt_t r31mag2 = r31x * r31x + r31y * r31y + r31z * r31z;
+    flt_t rijrik = 2 * r23mag * r21mag;
+    flt_t r21mag2 = r21mag * r21mag;
+    flt_t dctik = (-rr + r31mag2) / (rijrik * r21mag2);
+    flt_t dctij = (rr + r31mag2) / (rijrik * r23mag * r23mag);
+    flt_t dctjk = -2 / rijrik;
+    flt_t rcmin21  = ka->params.rcmin [itype][ktype];
+    flt_t rcmaxp21 = ka->params.rcmaxp[itype][ktype];
+    flt_t dw21;
+    flt_t w21 = Sp(r21mag, rcmin21, rcmaxp21, &dw21);
+    // Why does this additional cutoff in the cosine exist? The original code
+    // by Stuart answers this: it avoids issues when bonds in the dihedral are
+    // linear by switching the dihedral off beforehand. This is the reason for
+    // both the sin == 0 checks and the tspjik = Sp2(..) calls. It is not stated
+    // exactly in the original paper; it might be similar in purpose to the
+    // H(sin - s^min) term there, but cannot be found in original REBO papers.
+    flt_t dtsjik;
+    flt_t tspjik = Sp2(cos321, thmin, thmax, &dtsjik);
+    dtsjik = - dtsjik;
+    int ll;
+    for (ll = 0; ll < num_j; ll++) {
+      int l = neighs_j[ll];
+      if (l == i || l == k) continue;
+      int ltype = map[x[l].w];
+      int a4 = l;
+      flt_t r34x = x[a3].x - x[a4].x;
+      flt_t r34y = x[a3].y - x[a4].y;
+      flt_t r34z = x[a3].z - x[a4].z;
+      flt_t r34mag = overloaded::sqrt(r34x * r34x + r34y * r34y + r34z * r34z);
+      flt_t cos234 = (r32x * r34x + r32y * r34y + r32z * r34z) /
+        (r23mag * r34mag);
+      cos234 = fmin_nonan<flt_t>(1, fmax_nonan<flt_t>(-1, cos234));
+      flt_t sin234 = overloaded::sqrt(1 - cos234 * cos234);
+      if (sin234 == 0) continue;  // bonds 3-2 and 3-4 are collinear
+      flt_t sinl2i = 1 / (sin234 * sin234);
+      flt_t rjl2i = 1 / (r34mag * r34mag);
+
+      flt_t rcminjl = ka->params.rcmin[jtype][ltype];
+      flt_t rcmaxpjl = ka->params.rcmaxp[jtype][ltype];
+      flt_t dw34;
+      flt_t w34 = Sp(r34mag, rcminjl, rcmaxpjl, &dw34);
+      flt_t rr = (r23mag * r23mag) - (r34mag * r34mag);
+      flt_t r24x = r23x + r34x;
+      flt_t r24y = r23y + r34y;
+      flt_t r24z = r23z + r34z;
+      flt_t r242 =
+          (r24x * r24x) + (r24y * r24y) + (r24z * r24z);
+      flt_t rijrjl = 2 * r23mag * r34mag;
+      flt_t rjl2 = r34mag * r34mag;
+      flt_t dctjl = (-rr + r242) / (rijrjl * rjl2);
+      flt_t dctji = (rr + r242) / (rijrjl * r23mag * r23mag);
+      flt_t dctil = -2 / rijrjl;
+      flt_t dtsijl;
+      flt_t tspijl = Sp2(cos234, thmin, thmax, &dtsijl);
+      dtsijl = -dtsijl; // need minus sign
+      flt_t prefactor = VA;
+      // Normals of the planes spanned by bonds (3-2, 2-1) and (2-3, 3-4).
+      flt_t cross321x = (r32y * r21z) - (r32z * r21y);
+      flt_t cross321y = (r32z * r21x) - (r32x * r21z);
+      flt_t cross321z = (r32x * r21y) - (r32y * r21x);
+      flt_t cross234x = (r23y * r34z) - (r23z * r34y);
+      flt_t cross234y = (r23z * r34x) - (r23x * r34z);
+      flt_t cross234z = (r23x * r34y) - (r23y * r34x);
+      // cos(omega): dot product of the normals over the product of lengths.
+      flt_t cwnum = (cross321x * cross234x) +
+              (cross321y * cross234y) +
+              (cross321z * cross234z);
+      flt_t cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
+      flt_t om1234 = cwnum / cwnom;
+      flt_t cw = om1234;
+      sum_omega += ((1 - (om1234 * om1234)) * w21 * w34) *
+              (1 - tspjik) * (1 - tspijl);
+      if (VA == static_cast<flt_t>(0.0)) continue;  // value only, no forces
+
+      flt_t dt1dik = (rik2i) - (dctik * sink2i * cos321);
+      flt_t dt1djk = (-dctjk * sink2i * cos321);
+      flt_t dt1djl = (rjl2i) - (dctjl * sinl2i * cos234);
+      flt_t dt1dil = (-dctil * sinl2i * cos234);
+      flt_t dt1dij = (2 / (r23mag * r23mag)) -
+               (dctij * sink2i * cos321) -
+               (dctji * sinl2i * cos234);
+
+      flt_t dt2dikx = (-r23z * cross234y) + (r23y * cross234z);
+      flt_t dt2diky = (-r23x * cross234z) + (r23z * cross234x);
+      flt_t dt2dikz = (-r23y * cross234x) + (r23x * cross234y);
+
+      flt_t dt2djlx = (-r23y * cross321z) + (r23z * cross321y);
+      flt_t dt2djly = (-r23z * cross321x) + (r23x * cross321z);
+      flt_t dt2djlz = (-r23x * cross321y) + (r23y * cross321x);
+
+      flt_t dt2dijx = (r21z * cross234y) - (r34z * cross321y) -
+                 (r21y * cross234z) + (r34y * cross321z);
+      flt_t dt2dijy = (r21x * cross234z) - (r34x * cross321z) -
+                 (r21z * cross234x) + (r34z * cross321x);
+      flt_t dt2dijz = (r21y * cross234x) - (r34y * cross321x) -
+                 (r21x * cross234y) + (r34x * cross321y);
+
+      flt_t aa = (prefactor * 2 * cw / cwnom) * w21 * w34 *
+           (1 - tspjik) * (1 - tspijl);
+      flt_t aaa1 = -prefactor * (1 - (om1234 * om1234)) *
+             (1 - tspjik) * (1 - tspijl);
+      flt_t aaa2 = -prefactor * (1 - (om1234 * om1234)) * w21 * w34;
+      flt_t at2 = aa * cwnum;
+
+      flt_t fcijpc = (-dt1dij * at2) +
+              (aaa2 * dtsjik * dctij * (1 - tspijl)) +
+              (aaa2 * dtsijl * dctji * (1 - tspjik));
+      flt_t fcikpc = (-dt1dik * at2) +
+              (aaa2 * dtsjik * dctik * (1 - tspijl));
+      flt_t fcjlpc = (-dt1djl * at2) +
+              (aaa2 * dtsijl * dctjl * (1 - tspjik));
+      flt_t fcjkpc = (-dt1djk * at2) +
+              (aaa2 * dtsjik * dctjk * (1 - tspijl));
+      flt_t fcilpc = (-dt1dil * at2) +
+              (aaa2 * dtsijl * dctil * (1 - tspjik));
+
+      flt_t F23x = (fcijpc * r23x) + (aa * dt2dijx);
+      flt_t F23y = (fcijpc * r23y) + (aa * dt2dijy);
+      flt_t F23z = (fcijpc * r23z) + (aa * dt2dijz);
+
+      flt_t F12x = (fcikpc * r21x) + (aa * dt2dikx);
+      flt_t F12y = (fcikpc * r21y) + (aa * dt2diky);
+      flt_t F12z = (fcikpc * r21z) + (aa * dt2dikz);
+
+      flt_t F34x = (fcjlpc * r34x) + (aa * dt2djlx);
+      flt_t F34y = (fcjlpc * r34y) + (aa * dt2djly);
+      flt_t F34z = (fcjlpc * r34z) + (aa * dt2djlz);
+
+      flt_t F31x = (fcjkpc * r31x);
+      flt_t F31y = (fcjkpc * r31y);
+      flt_t F31z = (fcjkpc * r31z);
+
+      flt_t F24x = (fcilpc * r24x);
+      flt_t F24y = (fcilpc * r24y);
+      flt_t F24z = (fcilpc * r24z);
+
+      flt_t f1x = -F12x - F31x;
+      flt_t f1y = -F12y - F31y;
+      flt_t f1z = -F12z - F31z;
+      flt_t f2x = F12x + F31x;
+      flt_t f2y = F12y + F31y;
+      flt_t f2z = F12z + F31z;
+      flt_t f3x = F34x + F24x;
+      flt_t f3y = F34y + F24y;
+      flt_t f3z = F34z + F24z;
+      flt_t f4x = -F34x - F24x;
+      flt_t f4y = -F34y - F24y;
+      flt_t f4z = -F34z - F24z;
+
+      fij[0] += F23x + F24x - F31x;
+      fij[1] += F23y + F24y - F31y;
+      fij[2] += F23z + F24z - F31z;
+
+      // coordination forces
+      // (these use the Sp out-parameters dw21 and dw34 of the bond weights)
+      flt_t tmp20 = VA * ((1 - (om1234 * om1234))) *
+             (1 - tspjik) * (1 - tspijl) * dw21 * w34 / r21mag;
+      f2x -= tmp20 * r21x;
+      f2y -= tmp20 * r21y;
+      f2z -= tmp20 * r21z;
+      f1x += tmp20 * r21x;
+      f1y += tmp20 * r21y;
+      f1z += tmp20 * r21z;
+
+      flt_t tmp21 = VA * ((1 - (om1234 * om1234))) *
+             (1 - tspjik) * (1 - tspijl) * w21 * dw34 / r34mag;
+      f3x -= tmp21 * r34x;
+      f3y -= tmp21 * r34y;
+      f3z -= tmp21 * r34z;
+      f4x += tmp21 * r34x;
+      f4y += tmp21 * r34y;
+      f4z += tmp21 * r34z;
+
+      result_f[a1].x += f1x;
+      result_f[a1].y += f1y;
+      result_f[a1].z += f1z;
+      result_f[a2].x += f2x;
+      result_f[a2].y += f2y;
+      result_f[a2].z += f2z;
+      result_f[a3].x += f3x;
+      result_f[a3].y += f3y;
+      result_f[a3].z += f3z;
+      result_f[a4].x += f4x;
+      result_f[a4].y += f4y;
+      result_f[a4].z += f4z;
+    }
+  }
+  return sum_omega;
+}
+
+/*
+ * Scalar force update for how a spline term depends on the coordination
+ * numbers around atom i; neighbor j itself is skipped.
+ * It is used for both pi^rc_ij and T_ij.
+ * Occurs four times in each bondorder and bondorderLJ.
+ */
+template<typename flt_t, typename acc_t>
+inline void frebo_N_spline_force(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+    int j, flt_t VA, flt_t dN, flt_t dNconj, flt_t Nconj) {
+  ResultForceT<acc_t> * forces = ka->result_f;
+  AtomAIREBOT<flt_t> * pos = ka->x;
+  int * type_of = ka->map;
+  flt_t n_lo = ka->params.Nmin;
+  flt_t n_hi = ka->params.Nmax;
+  int type_i = type_of[pos[i].w];
+  int * list_i = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  int count_i = ka->neigh_rebo.num[i];
+  for (int a = 0; a < count_i; a++) {
+    int k = list_i[a];
+    if (k == j) continue;
+    int type_k = type_of[pos[k].w];
+    flt_t dx = pos[i].x - pos[k].x;
+    flt_t dy = pos[i].y - pos[k].y;
+    flt_t dz = pos[i].z - pos[k].z;
+    flt_t dist = overloaded::sqrt(dx * dx + dy * dy + dz * dz);
+    flt_t cut_lo = ka->params.rcmin[type_i][type_k];
+    flt_t cut_hi = ka->params.rcmax[type_i][type_k];
+    flt_t dw_ik;
+    flt_t w_ik = Sp(dist, cut_lo, cut_hi, &dw_ik);
+    // nC[k] + nH[k] minus w_ik, passed through the [Nmin, Nmax] switch.
+    flt_t n_ki = ka->nC[k] + ka->nH[k] - w_ik;
+    flt_t dcut_ki;
+    flt_t cut_ki = Sp(n_ki, n_lo, n_hi, &dcut_ki);
+    // The dN term acts on every neighbor, the dNconj term only on type 0.
+    flt_t pair_scale = VA * dN * dw_ik / dist;
+    flt_t conj_scale = VA * dNconj * 2 * Nconj * dw_ik * cut_ki / dist;
+    if (type_k == 0) pair_scale += conj_scale;
+    flt_t fx = pair_scale * dx;
+    flt_t fy = pair_scale * dy;
+    flt_t fz = pair_scale * dz;
+    forces[i].x -= fx;
+    forces[i].y -= fy;
+    forces[i].z -= fz;
+    forces[k].x += fx;
+    forces[k].y += fy;
+    forces[k].z += fz;
+    // Second shell: only for type-0 k whose switch derivative exceeds TOL.
+    if (type_k != 0) continue;
+    if (fabs(dcut_ki) <= TOL) continue;
+    int first_k = ka->neigh_rebo.offset[k];
+    int count_k = ka->neigh_rebo.num[k];
+    for (int b = 0; b < count_k; b++) {
+      int n = ka->neigh_rebo.entries[first_k + b];
+      if (n == i) continue;
+      int type_n = type_of[pos[n].w];
+      flt_t ex = pos[k].x - pos[n].x;
+      flt_t ey = pos[k].y - pos[n].y;
+      flt_t ez = pos[k].z - pos[n].z;
+      flt_t len = overloaded::sqrt(ex * ex + ey * ey + ez * ez);
+      flt_t lo_kn = ka->params.rcmin[type_k][type_n];
+      flt_t hi_kn = ka->params.rcmax[type_k][type_n];
+      flt_t dw_kn;
+      Sp(len, lo_kn, hi_kn, &dw_kn);
+      flt_t shell = VA * dNconj * 2 * Nconj * w_ik * dcut_ki * dw_kn / len;
+      forces[k].x -= shell * ex;
+      forces[k].y -= shell * ey;
+      forces[k].z -= shell * ez;
+      forces[n].x += shell * ex;
+      forces[n].y += shell * ey;
+      forces[n].z += shell * ez;
+    }
+  }
+}
+
+/*
+ * This data structure holds the result of a search through neighbor lists: a
+ * path of num atoms. It is used to calculate C_ij and the force updates.
+ */
+template<typename flt_t>
+struct LennardJonesPathAIREBOT {
+  AtomAIREBOT<flt_t> del[3]; // per segment s: x[idx[s+1]] - x[idx[s]]
+  int num;                   // number of atoms on the path (callers use 2..4)
+  flt_t w[3];                // per segment: value returned by Sp
+  flt_t dw[3];               // per segment: value Sp wrote to its out-parameter
+  flt_t r[3];                // per segment: length of del
+  int idx[4];                // atom indices along the path, idx[0..num-1]
+};
+
+/*
+ * Scores candidate path idxs[0..num-1], last segment first. Returns best and
+ * leaves *path untouched once a segment reaches rcmaxsq, has zero weight, or
+ * the weight product is <= best; else stores it in *path, returns the product.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_test_path_single(
+ KernelArgsAIREBOT<flt_t,acc_t> * ka, flt_t best, int num, int * idxs,
+ LennardJonesPathAIREBOT<flt_t> * path) {
+  AtomAIREBOT<flt_t> * pos = ka->x;
+  int * type_of = ka->map;
+  LennardJonesPathAIREBOT<flt_t> candidate;
+  candidate.num = num;
+  candidate.idx[num - 1] = idxs[num - 1];
+  flt_t product = 1;
+  for (int seg = num - 1; seg-- > 0; ) {
+    int from = idxs[seg], to = idxs[seg + 1];
+    int type_from = type_of[pos[from].w], type_to = type_of[pos[to].w];
+    flt_t dx = pos[to].x - pos[from].x;
+    flt_t dy = pos[to].y - pos[from].y;
+    flt_t dz = pos[to].z - pos[from].z;
+    flt_t dist_sq = dx * dx + dy * dy + dz * dz;
+    if (dist_sq >= ka->params.rcmaxsq[type_from][type_to]) return best;
+    flt_t dist = overloaded::sqrt(dist_sq);
+    flt_t dweight;
+    flt_t weight = Sp<flt_t>(dist, ka->params.rcmin[type_from][type_to],
+                             ka->params.rcmax[type_from][type_to], &dweight);
+    if (weight == 0) return best;
+    product *= weight;
+    if (product <= best) return best;
+    candidate.idx[seg] = from;
+    candidate.del[seg].x = dx;
+    candidate.del[seg].y = dy;
+    candidate.del[seg].z = dz;
+    candidate.r[seg] = dist;
+    candidate.w[seg] = weight;
+    candidate.dw[seg] = dweight;
+  }
+  *path = candidate;
+  return product;
+}
+
+/*
+ * Tries the paths i-j (only if rij <= rcmax), i-k-j and i-k-m-j and keeps the
+ * best one in *path. Returns 1 - best weight, or 0 once a path has weight 1.
+ * Uses the same iteration ordering as FLJ() does. Note that an optimization
+ * would use the j neighlist instead in the inner loop.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    int i, int j, flt_t rij, flt_t rcmax,
+    LennardJonesPathAIREBOT<flt_t> * path) {
+  int chain[4] = {i, j};
+  flt_t best = 0;
+  if (rij <= rcmax) {
+    best = ref_lennard_jones_test_path_single(ka, best, 2, chain, path);
+    if (best == static_cast<flt_t>(1.0)) return 0;
+  }
+  int * first_shell = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  for (int a = 0; a < ka->neigh_rebo.num[i]; a++) {
+    int k = first_shell[a];
+    if (k == j) continue;
+    chain[1] = k;
+    chain[2] = j;
+    best = ref_lennard_jones_test_path_single(ka, best, 3, chain, path);
+    if (best == static_cast<flt_t>(1.0)) return 0;
+    int * second_shell = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k];
+    chain[3] = j;
+    for (int b = 0; b < ka->neigh_rebo.num[k]; b++) {
+      int m = second_shell[b];
+      if (m == i || m == j) continue;
+      chain[2] = m;
+      best = ref_lennard_jones_test_path_single(ka, best, 4, chain, path);
+      if (best == static_cast<flt_t>(1.0)) return 0;
+    }
+  }
+  return 1 - best;
+}
+
+/*
+ * Conducts the force update due to C_ij, given the active path. Each segment
+ * gets dC * dw / r times the w of all other segments, applied along del.
+ */
+template<typename flt_t, typename acc_t>
+inline void ref_lennard_jones_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    flt_t dC, LennardJonesPathAIREBOT<flt_t> * path) {
+  const int nseg = path->num - 1;
+  for (int s = 0; s < nseg; s++) {
+    flt_t scale = dC * path->dw[s] / path->r[s];
+    for (int t = 0; t < nseg; t++) if (t != s) scale *= path->w[t];
+    ResultForceT<acc_t> & tail = ka->result_f[path->idx[s]];
+    ResultForceT<acc_t> & head = ka->result_f[path->idx[s + 1]];
+    tail.x -= scale * path->del[s].x;
+    tail.y -= scale * path->del[s].y;
+    tail.z -= scale * path->del[s].z;
+    head.x += scale * path->del[s].x;
+    head.y += scale * path->del[s].y;
+    head.z += scale * path->del[s].z;
+  }
+}
+
+/*
+ * Calculate the bondorderLJ term: the Sp2 switch of the bond order b_ij,
+ * evaluated with the i-j distance rescaled to rcmin[itype][jtype].
+ * Returns the switch value. If its derivative dStb is nonzero, the helpers are
+ * rerun with VA * dStb to accumulate forces, and fij (asserted zero) is set.
+ */
+template<typename flt_t, typename acc_t>
+inline flt_t ref_lennard_jones_bondorder(KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    int i, int j, flt_t VA, acc_t fij[3]) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+  flt_t rij = overloaded::sqrt(rsq);
+  // wij is the bond weight at the true distance rij.
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  flt_t dwij;
+  flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
+  // All bond-order terms below see the i-j vector scaled to length the_r.
+  flt_t the_r = ka->params.rcmin[itype][jtype];
+  flt_t scale = the_r / rij;
+  flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
+  flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
+  flt_t NconjtmpI;
+  acc_t fijc[3] = {0}, fjic[3] = {0};
+  flt_t pij = frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale, dely * scale,
+    delz * scale, the_r, wij, 0.0, &NconjtmpI, fijc);
+  flt_t NconjtmpJ;
+  flt_t pji = frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale, -dely * scale,
+    -delz * scale, the_r, wij, 0.0, &NconjtmpJ, fjic);
+  flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
+  flt_t dN3_pi_rc[3];
+  flt_t pi_rc = frebo_pi_rc<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj,
+    dN3_pi_rc);
+  flt_t dN3_Tij[3];
+  flt_t Tij = frebo_Tij<flt_t,acc_t>(ka, itype, jtype, Nij, Nji, Nijconj,
+    dN3_Tij);
+  flt_t sum_omega = 0;
+  if (fabs(Tij) > TOL) {
+    sum_omega = frebo_sum_omega<flt_t,acc_t>(ka, i, j, delx * scale, dely *
+                                             scale, delz * scale, the_r, 0.0,
+                                             fijc);
+  }
+  flt_t pi_dh = Tij * sum_omega;
+  flt_t bij = 0.5 * (pij + pji) + pi_rc + pi_dh;
+  flt_t dStb;
+  flt_t Stb = Sp2<flt_t>(bij, ka->params.bLJmin[itype][jtype],
+    ka->params.bLJmax[itype][jtype], &dStb);
+  if (dStb != 0) {  // second pass: same terms with force prefactor VA * dStb
+    frebo_pij<flt_t,acc_t>(ka, i, j, delx * scale,
+      dely * scale, delz * scale, the_r, wij, VA * dStb, &NconjtmpI, fijc);
+    frebo_pij<flt_t,acc_t>(ka, j, i, -delx * scale,
+      -dely * scale, -delz * scale, the_r, wij, VA * dStb, &NconjtmpJ, fjic);
+    fijc[0] -= fjic[0];
+    fijc[1] -= fjic[1];
+    fijc[2] -= fjic[2];
+    frebo_N_spline_force<flt_t,acc_t>(ka, i, j, VA * dStb, dN3_pi_rc[0],
+      dN3_pi_rc[2], NconjtmpI);
+    frebo_N_spline_force<flt_t,acc_t>(ka, j, i, VA * dStb, dN3_pi_rc[1],
+      dN3_pi_rc[2], NconjtmpJ);
+    if (fabs(Tij) > TOL) {
+      frebo_sum_omega<flt_t,acc_t>(ka, i, j,
+        delx * scale, dely * scale, delz * scale, the_r, VA * dStb * Tij, fijc);
+      frebo_N_spline_force(ka, i, j, VA * dStb * sum_omega, dN3_Tij[0],
+        dN3_Tij[2], NconjtmpI);
+      frebo_N_spline_force(ka, j, i, VA * dStb * sum_omega, dN3_Tij[1],
+        dN3_Tij[2], NconjtmpJ);
+    }
+    assert(fij[0] == 0);  // fij is overwritten below, not accumulated
+    assert(fij[1] == 0);
+    assert(fij[2] == 0);
+    fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx *
+                                 fijc[1] + delz * delx * fijc[2]) / rsq);
+    fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely *
+                                 fijc[1] + delz * dely * fijc[2]) / rsq);
+    fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz *
+                                 fijc[1] + delz * delz * fijc[2]) / rsq);
+  }
+  return Stb;
+}
+
+/*
+ * Scalar reference implementation of neighbor routine. For each atom in
+ * [neigh_from_atom, neigh_to_atom) it copies the neigh_lmp entries closer
+ * than rcmaxsq into neigh_rebo and sums their Sp weights into nC or nH.
+ */
+template<typename flt_t, typename acc_t>
+void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  int write_pos = ka->neigh_from_atom * ka->num_neighs_per_atom;
+  for (int i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) {
+    ka->neigh_rebo.offset[i] = write_pos;
+    const int itype = ka->map[ka->x[i].w];
+    int kept = 0;
+    ka->nC[i] = 0;
+    ka->nH[i] = 0;
+    for (int c = 0; c < ka->neigh_lmp.num[i]; c++) {
+      const int cand = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + c];
+      const int ctype = ka->map[ka->x[cand].w];
+      const flt_t dx = ka->x[i].x - ka->x[cand].x;
+      const flt_t dy = ka->x[i].y - ka->x[cand].y;
+      const flt_t dz = ka->x[i].z - ka->x[cand].z;
+      const flt_t dist_sq = dx * dx + dy * dy + dz * dz;
+      if (!(dist_sq < ka->params.rcmaxsq[itype][ctype])) continue;
+      ka->neigh_rebo.entries[write_pos + kept] = cand;
+      kept++;
+      flt_t lo = ka->params.rcmin[itype][ctype];
+      flt_t hi = ka->params.rcmax[itype][ctype];
+      flt_t * tally = (ctype == CARBON) ? ka->nC : ka->nH;
+      tally[i] += Sp<flt_t>(overloaded::sqrt(dist_sq), lo, hi, NULL);
+    }
+    ka->neigh_rebo.num[i] = kept;
+    write_pos += kept;
+  }
+}
+
+template<typename flt_t, typename acc_t>
+void ref_torsion_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+                                    int j) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * f = ka->result_f;
+  flt_t (*rcmin)[2] = ka->params.rcmin;
+  flt_t (*rcmax)[2] = ka->params.rcmax;
+  flt_t (*epsilonT)[2] = ka->params.epsilonT;
+  flt_t thmin = ka->params.thmin;
+  flt_t thmax = ka->params.thmax;
+  int itype = map[x[i].w];
+  flt_t xtmp = x[i].x;
+  flt_t ytmp = x[i].y;
+  flt_t ztmp = x[i].z;
+  int * REBO_neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+  int jnum = ka->neigh_rebo.num[i];
+  int jtype = map[x[j].w];
+
+  flt_t del32x = x[j].x-x[i].x;
+  flt_t del32y = x[j].y-x[i].y;
+  flt_t del32z = x[j].z-x[i].z;
+  flt_t rsq = del32x*del32x + del32y*del32y + del32z*del32z;
+  flt_t r32 = overloaded::sqrt(rsq);
+  flt_t del23x = -del32x;
+  flt_t del23y = -del32y;
+  flt_t del23z = -del32z;
+  flt_t r23 = r32;
+  flt_t dw23, w23 = Sp<flt_t>(r23,rcmin[itype][jtype],rcmax[itype][jtype],
+    &dw23);
+
+  assert(itype == 0);
+  assert(jtype == 0);
+
+  for (int kk = 0; kk < jnum; kk++) {
+    int k = REBO_neighs_i[kk];
+    int ktype = map[x[k].w];
+    if (k == j) continue;
+    flt_t del21x = x[i].x-x[k].x;
+    flt_t del21y = x[i].y-x[k].y;
+    flt_t del21z = x[i].z-x[k].z;
+    flt_t rsq = del21x*del21x + del21y*del21y + del21z*del21z;
+    flt_t r21 = overloaded::sqrt(rsq);
+    flt_t cos321 = - ((del21x*del32x) + (del21y*del32y) +
+                (del21z*del32z)) / (r21*r32);
+    cos321 = fmin(cos321,1);
+    cos321 = fmax(cos321,-1);
+    flt_t sin321 = overloaded::sqrt(1 - cos321*cos321);
+    if (sin321 < TOL) continue;
+
+    flt_t deljkx = del21x-del23x;
+    flt_t deljky = del21y-del23y;
+    flt_t deljkz = del21z-del23z;
+    flt_t rjk2 = deljkx*deljkx + deljky*deljky + deljkz*deljkz;
+    flt_t rjk = overloaded::sqrt(rjk2);
+    flt_t rik2 = r21*r21;
+    flt_t dw21, w21 = Sp<flt_t>(r21,rcmin[itype][ktype],rcmax[itype][ktype],
+      &dw21);
+
+    flt_t rij = r32;
+    flt_t rik = r21;
+    flt_t rij2 = r32*r32;
+    flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rik2-rjk2)/rij/rik;
+    flt_t dtsjik, tspjik = Sp2<flt_t>(costmp,thmin,thmax,&dtsjik);
+    dtsjik = -dtsjik;
+
+    int * REBO_neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
+    int lnum = ka->neigh_rebo.num[j];
+    for (int ll = 0; ll < lnum; ll++) {
+      int l = REBO_neighs_j[ll];
+      int ltype = map[x[l].w];
+      if (l == i || l == k) continue;
+      flt_t del34x = x[j].x-x[l].x;
+      flt_t del34y = x[j].y-x[l].y;
+      flt_t del34z = x[j].z-x[l].z;
+      flt_t rsq = del34x*del34x + del34y*del34y + del34z*del34z;
+      flt_t r34 = overloaded::sqrt(rsq);
+      flt_t cos234 = (del32x*del34x + del32y*del34y +
+                del32z*del34z) / (r32*r34);
+      cos234 = fmin(cos234,1);
+      cos234 = fmax(cos234,-1);
+      flt_t sin234 = overloaded::sqrt(1 - cos234*cos234);
+      if (sin234 < TOL) continue;
+      flt_t dw34, w34 = Sp<flt_t>(r34,rcmin[jtype][ltype],rcmax[jtype][ltype],
+        &dw34);
+      flt_t delilx = del23x + del34x;
+      flt_t delily = del23y + del34y;
+      flt_t delilz = del23z + del34z;
+      flt_t ril2 = delilx*delilx + delily*delily + delilz*delilz;
+      flt_t ril = overloaded::sqrt(ril2);
+      flt_t rjl2 = r34*r34;
+
+      flt_t rjl = r34;
+      flt_t costmp = static_cast<flt_t>(0.5)*(rij2+rjl2-ril2)/rij/rjl;
+      flt_t dtsijl, tspijl = Sp2<flt_t>(costmp,thmin,thmax,&dtsijl);
+      dtsijl = -dtsijl; //need minus sign
+      flt_t cross321x = (del32y*del21z)-(del32z*del21y);
+      flt_t cross321y = (del32z*del21x)-(del32x*del21z);
+      flt_t cross321z = (del32x*del21y)-(del32y*del21x);
+      flt_t cross321mag = overloaded::sqrt(cross321x*cross321x+
+                         cross321y*cross321y + cross321z*cross321z);
+      flt_t cross234x = (del23y*del34z)-(del23z*del34y);
+      flt_t cross234y = (del23z*del34x)-(del23x*del34z);
+      flt_t cross234z = (del23x*del34y)-(del23y*del34x);
+      flt_t cross234mag = overloaded::sqrt(cross234x*cross234x+
+                         cross234y*cross234y + cross234z*cross234z);
+      flt_t cwnum = (cross321x*cross234x) +
+        (cross321y*cross234y)+(cross321z*cross234z);
+      flt_t cwnom = r21*r34*r32*r32*sin321*sin234;
+      flt_t cw = cwnum/cwnom;
+
+      flt_t cw2 = (static_cast<flt_t>(.5)*(1-cw));
+      flt_t ekijl = epsilonT[ktype][ltype];
+      flt_t Ec = 256*ekijl/405;
+      flt_t Vtors = (Ec*(overloaded::pow(cw2,5)))-(ekijl/10);
+
+      ka->result_eng += Vtors*w21*w23*w34*(1-tspjik)*(1-tspijl);
+
+      flt_t dndijx = (cross234y*del21z)-(cross234z*del21y);
+      flt_t dndijy = (cross234z*del21x)-(cross234x*del21z);
+      flt_t dndijz = (cross234x*del21y)-(cross234y*del21x);
+
+      flt_t tmpvecx = (del34y*cross321z)-(del34z*cross321y);
+      flt_t tmpvecy = (del34z*cross321x)-(del34x*cross321z);
+      flt_t tmpvecz = (del34x*cross321y)-(del34y*cross321x);
+
+      dndijx = dndijx+tmpvecx;
+      dndijy = dndijy+tmpvecy;
+      dndijz = dndijz+tmpvecz;
+
+      flt_t dndikx = (del23y*cross234z)-(del23z*cross234y);
+      flt_t dndiky = (del23z*cross234x)-(del23x*cross234z);
+      flt_t dndikz = (del23x*cross234y)-(del23y*cross234x);
+
+      flt_t dndjlx = (cross321y*del23z)-(cross321z*del23y);
+      flt_t dndjly = (cross321z*del23x)-(cross321x*del23z);
+      flt_t dndjlz = (cross321x*del23y)-(cross321y*del23x);
+
+      flt_t dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2*r23*r23*r21);
+      flt_t dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2*r23*r21*r21);
+      flt_t dcidjk = (-rjk)/(r23*r21);
+      flt_t dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2*r23*r23*r34);
+      flt_t dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2*r23*r34*r34);
+      flt_t dcjdil = (-ril)/(r23*r34);
+
+      flt_t dsidij = (-cos321/sin321)*dcidij;
+      flt_t dsidik = (-cos321/sin321)*dcidik;
+      flt_t dsidjk = (-cos321/sin321)*dcidjk;
+
+      flt_t dsjdji = (-cos234/sin234)*dcjdji;
+      flt_t dsjdjl = (-cos234/sin234)*dcjdjl;
+      flt_t dsjdil = (-cos234/sin234)*dcjdil;
+
+      flt_t dxidij = (r21*sin321)+(r23*r21*dsidij);
+      flt_t dxidik = (r23*sin321)+(r23*r21*dsidik);
+      flt_t dxidjk = (r23*r21*dsidjk);
+
+      flt_t dxjdji = (r34*sin234)+(r23*r34*dsjdji);
+      flt_t dxjdjl = (r23*sin234)+(r23*r34*dsjdjl);
+      flt_t dxjdil = (r23*r34*dsjdil);
+
+      flt_t ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji);
+      flt_t ddndik = dxidik*cross234mag;
+      flt_t ddndjk = dxidjk*cross234mag;
+      flt_t ddndjl = cross321mag*dxjdjl;
+      flt_t ddndil = cross321mag*dxjdil;
+      flt_t dcwddn = -cwnum/(cwnom*cwnom);
+      flt_t dcwdn = 1/cwnom;
+      flt_t dvpdcw = (-1)*Ec*static_cast<flt_t>(-0.5)*5*overloaded::pow(cw2,4)*
+                      w23*w21*w34*(1-tspjik)*(1-tspijl);
+
+      flt_t Ftmpx = dvpdcw*((dcwdn*dndijx)+(dcwddn*ddndij*del23x/r23));
+      flt_t Ftmpy = dvpdcw*((dcwdn*dndijy)+(dcwddn*ddndij*del23y/r23));
+      flt_t Ftmpz = dvpdcw*((dcwdn*dndijz)+(dcwddn*ddndij*del23z/r23));
+      flt_t fix = Ftmpx;
+      flt_t fiy = Ftmpy;
+      flt_t fiz = Ftmpz;
+      flt_t fjx = -Ftmpx;
+      flt_t fjy = -Ftmpy;
+      flt_t fjz = -Ftmpz;
+
+      Ftmpx = dvpdcw*((dcwdn*dndikx)+(dcwddn*ddndik*del21x/r21));
+      Ftmpy = dvpdcw*((dcwdn*dndiky)+(dcwddn*ddndik*del21y/r21));
+      Ftmpz = dvpdcw*((dcwdn*dndikz)+(dcwddn*ddndik*del21z/r21));
+      fix += Ftmpx;
+      fiy += Ftmpy;
+      fiz += Ftmpz;
+      flt_t fkx = -Ftmpx;
+      flt_t fky = -Ftmpy;
+      flt_t fkz = -Ftmpz;
+
+      Ftmpx = (dvpdcw*dcwddn*ddndjk*deljkx)/rjk;
+      Ftmpy = (dvpdcw*dcwddn*ddndjk*deljky)/rjk;
+      Ftmpz = (dvpdcw*dcwddn*ddndjk*deljkz)/rjk;
+      fjx += Ftmpx;
+      fjy += Ftmpy;
+      fjz += Ftmpz;
+      fkx -= Ftmpx;
+      fky -= Ftmpy;
+      fkz -= Ftmpz;
+
+      Ftmpx = dvpdcw*((dcwdn*dndjlx)+(dcwddn*ddndjl*del34x/r34));
+      Ftmpy = dvpdcw*((dcwdn*dndjly)+(dcwddn*ddndjl*del34y/r34));
+      Ftmpz = dvpdcw*((dcwdn*dndjlz)+(dcwddn*ddndjl*del34z/r34));
+      fjx += Ftmpx;
+      fjy += Ftmpy;
+      fjz += Ftmpz;
+      flt_t flx = -Ftmpx;
+      flt_t fly = -Ftmpy;
+      flt_t flz = -Ftmpz;
+
+      Ftmpx = (dvpdcw*dcwddn*ddndil*delilx)/ril;
+      Ftmpy = (dvpdcw*dcwddn*ddndil*delily)/ril;
+      Ftmpz = (dvpdcw*dcwddn*ddndil*delilz)/ril;
+      fix += Ftmpx;
+      fiy += Ftmpy;
+      fiz += Ftmpz;
+      flx -= Ftmpx;
+      fly -= Ftmpy;
+      flz -= Ftmpz;
+
+      // coordination forces
+
+      flt_t fpair = Vtors*dw21*w23*w34*(1-tspjik)*(1-tspijl) / r21;
+      fix -= del21x*fpair;
+      fiy -= del21y*fpair;
+      fiz -= del21z*fpair;
+      fkx += del21x*fpair;
+      fky += del21y*fpair;
+      fkz += del21z*fpair;
+
+      fpair = Vtors*w21*dw23*w34*(1-tspjik)*(1-tspijl) / r23;
+      fix -= del23x*fpair;
+      fiy -= del23y*fpair;
+      fiz -= del23z*fpair;
+      fjx += del23x*fpair;
+      fjy += del23y*fpair;
+      fjz += del23z*fpair;
+
+      fpair = Vtors*w21*w23*dw34*(1-tspjik)*(1-tspijl) / r34;
+      fjx -= del34x*fpair;
+      fjy -= del34y*fpair;
+      fjz -= del34z*fpair;
+      flx += del34x*fpair;
+      fly += del34y*fpair;
+      flz += del34z*fpair;
+
+      // additional cut off function forces
+
+      flt_t fcpc = -Vtors*w21*w23*w34*dtsjik*(1-tspijl);
+      fpair = fcpc*dcidij/rij;
+      fix += fpair*del23x;
+      fiy += fpair*del23y;
+      fiz += fpair*del23z;
+      fjx -= fpair*del23x;
+      fjy -= fpair*del23y;
+      fjz -= fpair*del23z;
+
+      fpair = fcpc*dcidik/rik;
+      fix += fpair*del21x;
+      fiy += fpair*del21y;
+      fiz += fpair*del21z;
+      fkx -= fpair*del21x;
+      fky -= fpair*del21y;
+      fkz -= fpair*del21z;
+
+      fpair = fcpc*dcidjk/rjk;
+      fjx += fpair*deljkx;
+      fjy += fpair*deljky;
+      fjz += fpair*deljkz;
+      fkx -= fpair*deljkx;
+      fky -= fpair*deljky;
+      fkz -= fpair*deljkz;
+
+      fcpc = -Vtors*w21*w23*w34*(1-tspjik)*dtsijl;
+      fpair = fcpc*dcjdji/rij;
+      fix += fpair*del23x;
+      fiy += fpair*del23y;
+      fiz += fpair*del23z;
+      fjx -= fpair*del23x;
+      fjy -= fpair*del23y;
+      fjz -= fpair*del23z;
+
+      fpair = fcpc*dcjdjl/rjl;
+      fjx += fpair*del34x;
+      fjy += fpair*del34y;
+      fjz += fpair*del34z;
+      flx -= fpair*del34x;
+      fly -= fpair*del34y;
+      flz -= fpair*del34z;
+
+      fpair = fcpc*dcjdil/ril;
+      fix += fpair*delilx;
+      fiy += fpair*delily;
+      fiz += fpair*delilz;
+      flx -= fpair*delilx;
+      fly -= fpair*delily;
+      flz -= fpair*delilz;
+
+      // sum per-atom forces into atom force array
+
+      f[i].x += fix; f[i].y += fiy; f[i].z += fiz;
+      f[j].x += fjx; f[j].y += fjy; f[j].z += fjz;
+      f[k].x += fkx; f[k].y += fky; f[k].z += fkz;
+      f[l].x += flx; f[l].y += fly; f[l].z += flz;
+    }
+  }
+}
+
+template<typename flt_t, typename acc_t>
+void ref_torsion(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  // Scalar torsion driver: every REBO-neighbor pair (i,j) that passes the
+  // skip rule below and has both atoms mapped to type 0 is evaluated.
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  int * type_map = ka->map;
+  int * tags = ka->tag;
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
+    if (type_map[atoms[i].w] != 0) continue;
+    const int itag = tags[i];
+    const flt_t xi = atoms[i].x, yi = atoms[i].y, zi = atoms[i].z;
+    int * nbrs = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+    const int nbr_count = ka->neigh_rebo.num[i];
+    for (int n = 0; n < nbr_count; n++) {
+      const int j = nbrs[n];
+      const int jtag = tags[j];
+      // Skip rule: for different tags use the parity of the tag sum; for
+      // equal tags compare coordinates in z, then y, then x order.
+      bool skip;
+      if (itag != jtag) {
+        const bool odd_sum = ((itag+jtag) & 1) == 1;
+        skip = (itag > jtag) ? !odd_sum : odd_sum;
+      } else if (atoms[j].z != zi) {
+        skip = atoms[j].z < zi;
+      } else if (atoms[j].y != yi) {
+        skip = atoms[j].y < yi;
+      } else {
+        skip = atoms[j].x < xi;
+      }
+      if (skip) continue;
+      if (type_map[atoms[j].w] != 0) continue;
+      ref_torsion_single_interaction(ka, i, j);
+    }
+  }
+}
+
+/*
+ * Scalar reference for a single REBO pair (i,j); corresponds to the FREBO
+ * method, with the bondorder() function inlined. Updates result_f[i] and
+ * result_f[j] (forces, plus half the pair energy in w) and ka->result_eng.
+ */
+template<typename flt_t, typename acc_t>
+void ref_frebo_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+    int j) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+  flt_t rij = overloaded::sqrt(rsq);
+  flt_t rcminij = ka->params.rcmin[itype][jtype];
+  flt_t rcmaxij = ka->params.rcmax[itype][jtype];
+  // Pair switching weight; pairs whose weight is at most TOL are skipped.
+  flt_t dwij;
+  flt_t wij = Sp(rij, rcminij, rcmaxij, &dwij);
+  if (wij <= TOL) return;
+
+  flt_t Qij = ka->params.Q[itype][jtype];
+  flt_t Aij = ka->params.A[itype][jtype];
+  flt_t alphaij = ka->params.alpha[itype][jtype];
+
+  // Repulsive part: VR = wij * (1 + Q/r) * A * exp(-alpha * r).
+  flt_t exp_alphar = exp(-alphaij * rij);
+  flt_t VR_by_wij = (1.0 + (Qij / rij)) * Aij * exp_alphar;
+  flt_t VR = wij * VR_by_wij;
+  flt_t pre = wij * Aij * exp_alphar;
+  flt_t dVRdi = pre * ((-alphaij) - (Qij / rsq) - (Qij * alphaij / rij));
+  dVRdi += VR_by_wij * dwij;
+
+  // Attractive part: VA = wij * sum_k (-BIJc[k] * exp(-Beta[k] * r)).
+  flt_t VA_by_wij = 0, dVA = 0;
+  for (int k = 0; k < 3; k++) {
+    flt_t BIJc = ka->params.BIJc[itype][jtype][k];
+    flt_t Betaij = ka->params.Beta[itype][jtype][k];
+    flt_t term = -BIJc * overloaded::exp(-Betaij * rij);
+    VA_by_wij += term;
+    dVA += -Betaij * wij * term;
+  }
+  dVA += VA_by_wij * dwij;
+  flt_t VA = VA_by_wij * wij;
+
+  // Bond order: bij = (pij + pji) / 2 + pi_rc + Tij * sum_omega.
+  acc_t fij[3] = {0};
+  flt_t Nij = ka->nH[i] + ka->nC[i] - wij;
+  flt_t Nji = ka->nH[j] + ka->nC[j] - wij;
+  flt_t NconjtmpI;
+  flt_t pij = frebo_pij(ka, i, j, delx, dely, delz, rij, wij, VA, &NconjtmpI,
+    fij);
+  flt_t NconjtmpJ;
+  acc_t fji[3] = {0};
+  flt_t pji = frebo_pij(ka, j, i, -delx, -dely, -delz, rij, wij, VA,
+    &NconjtmpJ, fji);
+  fij[0] -= fji[0]; fij[1] -= fji[1]; fij[2] -= fji[2];
+  flt_t Nijconj = 1.0 + (NconjtmpI * NconjtmpI) + (NconjtmpJ * NconjtmpJ);
+  flt_t dN3[3];
+  flt_t pi_rc = frebo_pi_rc(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  frebo_N_spline_force(ka, i, j, VA, dN3[0], dN3[2], NconjtmpI);
+  frebo_N_spline_force(ka, j, i, VA, dN3[1], dN3[2], NconjtmpJ);
+  flt_t Tij = frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  flt_t sum_omega = 0.0;
+  if (fabs(Tij) > TOL) {
+    sum_omega = frebo_sum_omega(ka, i, j, delx, dely, delz, rij, VA * Tij, fij);
+    frebo_N_spline_force(ka, i, j, VA * sum_omega, dN3[0], dN3[2], NconjtmpI);
+    frebo_N_spline_force(ka, j, i, VA * sum_omega, dN3[1], dN3[2], NconjtmpJ);
+  }
+  flt_t pi_dh = Tij * sum_omega;
+  flt_t bij = static_cast<flt_t>(0.5) * (pij + pji) + pi_rc + pi_dh;
+  flt_t dVAdi = bij * dVA;
+  flt_t fpair = -(dVRdi + dVAdi) / rij;
+
+  result_f[i].x += fpair * delx + fij[0];
+  result_f[i].y += fpair * dely + fij[1];
+  result_f[i].z += fpair * delz + fij[2];
+  result_f[j].x -= fpair * delx + fij[0];
+  result_f[j].y -= fpair * dely + fij[1];
+  result_f[j].z -= fpair * delz + fij[2];
+
+  flt_t evdwl = VR + bij * VA;
+  ka->result_eng += evdwl;
+  result_f[i].w += 0.5 * evdwl;
+  result_f[j].w += 0.5 * evdwl;
+}
+
+
+template<typename flt_t, typename acc_t>
+inline void ref_frebo_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i) {
+  // Evaluate the REBO pair term for atom i against each of its REBO
+  // neighbors j that survives the tag/coordinate filter below; accepted
+  // pairs are forwarded to ref_frebo_single_interaction().
+  AtomAIREBOT<flt_t> * atoms = ka->x;
+  int * tags = ka->tag;
+  const int itag = tags[i];
+  const flt_t xi = atoms[i].x;
+  const flt_t yi = atoms[i].y;
+  const flt_t zi = atoms[i].z;
+  int * nbrs = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+  const int nbr_count = ka->neigh_rebo.num[i];
+  for (int n = 0; n < nbr_count; n++) {
+    const int j = nbrs[n];
+    const int jtag = tags[j];
+    if (itag != jtag) {
+      // Different tags: skip even sums when itag > jtag, odd sums otherwise.
+      const int parity = (itag + jtag) & 1;
+      if (parity == (itag > jtag ? 0 : 1)) continue;
+    } else {
+      // Equal tags: compare coordinates in z, y, x priority order and skip
+      // j when it sorts strictly below i.
+      const AtomAIREBOT<flt_t> & aj = atoms[j];
+      if (aj.z < zi) continue;
+      if (aj.z == zi && aj.y < yi) continue;
+      if (aj.z == zi && aj.y == yi && aj.x < xi) continue;
+    }
+    ref_frebo_single_interaction(ka, i, j);
+  }
+}
+
+
+template<typename flt_t, typename acc_t>
+void ref_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
+  // REBO pass over atoms [frebo_from_atom, frebo_to_atom); torsion is opt-in.
+  int atom = ka->frebo_from_atom;
+  while (atom < ka->frebo_to_atom) ref_frebo_single_atom(ka, atom++);
+  if (torflag) ref_torsion(ka);
+}
+
+template<typename flt_t, typename acc_t>
+void ref_lennard_jones_single_interaction(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, int j, int morseflag) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  // Scalar reference for one LJ (morseflag == 0) or Morse (!= 0) pair (i,j).
+  int itype = map[x[i].w];
+  int jtype = map[x[j].w];
+
+  flt_t delx = x[i].x - x[j].x;
+  flt_t dely = x[i].y - x[j].y;
+  flt_t delz = x[i].z - x[j].z;
+  flt_t rsq = delx * delx + dely * dely + delz * delz;
+  // Pairs at or beyond the squared cutoff for this type pair add nothing.
+  if (rsq >= ka->params.cutljsq[itype][jtype]) { return; }
+  flt_t rij = overloaded::sqrt(rsq);
+  // cij stays 1 unless rij < cut3rebo, where the test-path routine sets it.
+  LennardJonesPathAIREBOT<flt_t> testpath;
+  flt_t cij = 1.0;
+  if (rij < ka->params.cut3rebo) {
+    #pragma noinline
+    cij = ref_lennard_jones_test_path<flt_t,acc_t>(ka, i, j, rij, 
+      ka->params.rcmax[itype][jtype], &testpath);
+  }
+  if (cij == 0) {  // early exit when cij is exactly zero
+    return;
+  }
+  // The switching range [rljmin, rljmax] scales with the pair's sigma.
+  flt_t sigcut = ka->params.sigcut;
+  flt_t sigmin = ka->params.sigmin;
+  flt_t sigma = ka->params.sigma[itype][jtype];
+  flt_t rljmax = sigcut * sigma;
+  flt_t rljmin = sigmin * sigma;
+  // slw is the Sp2 switch over that range; dslw is its derivative output.
+  flt_t dslw, slw = Sp2(rij, rljmin, rljmax, &dslw);
+
+  flt_t vdw, dvdw;
+  if (morseflag) {  // Morse form, built from exr = exp(-rij * lj4)
+    const flt_t exr = exp(-rij * ka->params.lj4[itype][jtype]);
+    vdw = ka->params.lj1[itype][jtype] * exr * 
+      (ka->params.lj2[itype][jtype]*exr - 2);
+    dvdw = ka->params.lj3[itype][jtype] * exr * 
+      (1 - ka->params.lj2[itype][jtype]*exr);
+  } else {  // 12-6 form: vdw = r^-6 * (lj3 * r^-6 - lj4)
+    flt_t r2inv = 1 / rsq;
+    flt_t r6inv = r2inv * r2inv * r2inv;
+
+    vdw = r6inv * (ka->params.lj3[itype][jtype]*r6inv - 
+		   ka->params.lj4[itype][jtype]);
+    dvdw = -r6inv * (ka->params.lj1[itype][jtype]*r6inv - 
+		     ka->params.lj2[itype][jtype]) / rij;
+  }
+  // Switched potential and its product-rule derivative.
+  flt_t VLJ = vdw * slw;
+  flt_t dVLJ = dvdw * slw + vdw * dslw;
+  // Str is the Sp2 switch over [rcLJmin, rcLJmax] for this type pair.
+  flt_t dStr, Str = Sp2<flt_t>(rij, ka->params.rcLJmin[itype][jtype], 
+    ka->params.rcLJmax[itype][jtype], &dStr);
+  flt_t VA = Str * cij * VLJ;
+  flt_t Stb = 0;
+  acc_t fij[3] = {0};
+  if (Str > 0) {  // the bond-order factor Stb is only evaluated where Str > 0
+    #pragma noinline
+    Stb = ref_lennard_jones_bondorder(ka, i, j, VA, fij);
+  }
+  flt_t fpair = -(dStr * (Stb * cij * VLJ - cij * VLJ) +
+                   dVLJ * (Str * Stb * cij + cij - Str * cij)) / rij;
+  flt_t evdwl = VA * Stb + (1 - Str) * cij * VLJ;  // cij*VLJ*(Str*Stb + 1 - Str)
+  result_f[i].x += fpair * delx + fij[0];  // i gains what j loses below
+  result_f[i].y += fpair * dely + fij[1];
+  result_f[i].z += fpair * delz + fij[2];
+  result_f[j].x -= fpair * delx + fij[0];
+  result_f[j].y -= fpair * dely + fij[1];
+  result_f[j].z -= fpair * delz + fij[2];
+  ka->result_eng += evdwl;
+  // cij < 1: hand the recorded path to ref_lennard_jones_force_path().
+  if (cij < 1) {
+    #pragma noinline
+    ref_lennard_jones_force_path(ka, Str * Stb * VLJ + (1 - Str) * VLJ, 
+      &testpath);
+  }
+}
+
+template<typename flt_t, typename acc_t>
+void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
+                                   int morseflag) {
+  // Scalar LJ/Morse pass for atom i: each of the first num_half[i] entries
+  // of its neigh_lmp list is handed, unfiltered, to
+  // ref_lennard_jones_single_interaction(). Only the neighbor list is read
+  // here, so no position or tag lookups are needed.
+  const int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+  const int jnum = ka->neigh_lmp.num_half[i];
+  for (int jj = 0; jj < jnum; jj++) {
+    const int j = neighs[jj];
+    ref_lennard_jones_single_interaction(ka, i, j, morseflag);
+  }
+}
+// Scalar LJ/Morse driver over atoms [frebo_from_atom, frebo_to_atom).
+template<typename flt_t, typename acc_t>
+void ref_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) {
+  for (int atom = ka->frebo_from_atom; atom < ka->frebo_to_atom; ++atom) {
+    #pragma noinline
+    ref_lennard_jones_single_atom(ka, atom, morseflag);
+  }
+}
+
+/* ----------------------------------------------------------------------
+    Vectorized AIREBO implementation, standalone, using caching to reduce
+    memory access.
+   ---------------------------------------------------------------------- */
+
+template<typename flt_t, typename acc_t>
+struct aut_wrap {
+
+typedef typename intr_types<flt_t, acc_t>::fvec fvec;
+typedef typename intr_types<flt_t, acc_t>::avec avec;
+typedef typename intr_types<flt_t, acc_t>::ivec ivec;
+typedef typename intr_types<flt_t, acc_t>::bvec bvec;
+
+VEC_INLINE inline
+static void aut_loadatoms_vec(
+    AtomAIREBOT<flt_t> * atoms, ivec j_vec,
+    fvec *x, fvec * y, fvec * z, bvec * type_mask, int * map, ivec map_i, 
+    ivec c_1
+) {
+  // Lane j reads four adjacent values (x, y, z, w) at element offset 4*j.
+  ivec idx4 = ivec::mullo(ivec::set1(4), j_vec);
+  fvec wraw;
+  fvec::gather_4_adjacent(idx4, &atoms[0].x, sizeof(flt_t), x, y, z, &wraw);
+  // Per lane: bit = (map_i >> type) & c_1; the mask marks lanes with bit != 0.
+  ivec type_bit = fvec::unpackloepi32(wraw);
+  type_bit = ivec::srlv(map_i, type_bit);
+  type_bit = ivec::the_and(c_1, type_bit);
+  *type_mask = ivec::cmpneq(type_bit, ivec::setzero());
+}
+
+VEC_INLINE inline
+static void aut_loadatoms_vec_notype(
+    AtomAIREBOT<flt_t> * atoms, ivec j_vec,
+    fvec *x, fvec * y, fvec * z
+) {
+  // Positions only: element offset 4*j per lane, three adjacent values read.
+  ivec idx4 = ivec::mullo(ivec::set1(4), j_vec);
+  fvec::gather_3_adjacent(idx4, &atoms[0].x, sizeof(flt_t), x, y, z);
+}
+
+static fvec aut_Sp2_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
+  // Cubic switch: 1 for r <= lo, 0 for r >= hi, and in between
+  // 1 - t^2 (3 - 2t) with t = (r - lo) * recip(hi - lo). *d gets d/dr.
+  fvec one = fvec::set1(1);
+  bvec below = fvec::cmple(r, lo);
+  bvec above = fvec::cmpnlt(r, hi); // not-less-than, i.e. r >= hi
+  bvec between = bvec::kandn(below, ~ above);
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  fvec slope = fvec::setzero();
+  if (bvec::test_any_set(between)) {
+    fvec width = hi -  lo;
+    fvec inv_width = fvec::recip(width);
+    fvec t = (r -  lo) *  inv_width;
+    fvec t_sq = t *  t;
+    fvec poly = one -  t_sq * ( fvec::set1(3) -  fvec::set1(2) *  t);
+    fvec dpoly = fvec::set1(6) *  inv_width * ( t_sq -  t);
+    // Only the transition lanes take the polynomial and its slope.
+    value = fvec::mask_blend(between, value, poly);
+    slope = fvec::mask_blend(between, slope, dpoly);
+  }
+  *d = slope;
+  return value;
+}
+
+static fvec aut_Sp_deriv(fvec r, fvec lo, fvec hi, fvec * d) {
+  // Cosine switch: 1 for r <= lo, 0 for r >= hi, 0.5 * (1 + cos(pi * t)) in
+  // between, where t = (r - lo) / (hi - lo). *d receives the slope in r.
+  fvec one = fvec::set1(1);
+  fvec pi = fvec::set1(M_PI);
+  bvec below = fvec::cmple(r, lo);
+  bvec above = fvec::cmpnlt(r, hi); // not-less-than, i.e. r >= hi
+  bvec between = bvec::kandn(below, ~ above);
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  fvec slope = fvec::setzero();
+  if (bvec::test_any_set(between)) {
+    fvec width = hi -  lo;
+    fvec inv_width = fvec::mask_recip(one, between, width);
+    fvec t = (r -  lo) /  width;
+    fvec cos_pt;
+    fvec sin_pt = fvec::mask_sincos(&cos_pt, fvec::setzero(), one, between,
+                                    pi *  t);
+    fvec half_cos = fvec::set1(0.5) * ( one +  cos_pt);
+    fvec dhalf_cos = pi *  fvec::set1(-0.5) *  inv_width *  sin_pt;
+    value = fvec::mask_blend(between, value, half_cos);
+    slope = fvec::mask_blend(between, slope, dhalf_cos);
+  }
+  *d = slope;
+  return value;
+}
+
+static fvec aut_mask_Sp(bvec mask, fvec r, fvec lo, fvec hi) {
+  // Value-only cosine switch in which every comparison is restricted to the
+  // lanes selected by `mask`. The result starts at 1 and is replaced by 0
+  // where r >= hi and by the cosine ramp in the transition lanes.
+  fvec one = fvec::set1(1);
+  bvec below = fvec::mask_cmple(mask, r, lo);
+  bvec above = fvec::mask_cmpnlt(mask, r, hi); // not-less-than, i.e. r >= hi
+  bvec between = bvec::kandn(below, bvec::kandn(above, mask));
+  fvec value = fvec::mask_blend(above, one, fvec::setzero());
+  if (!bvec::test_any_set(between)) return value;
+  // Transition lanes: 0.5 * (1 + cos(pi * t)), t = (r - lo) * recip(hi - lo).
+  fvec inv_width = fvec::mask_recip(one, between, hi -  lo);
+  fvec t = (r -  lo) *  inv_width;
+  fvec cos_pt = fvec::mask_cos(one, between, fvec::set1(M_PI) *  t);
+  fvec half_cos = fvec::set1(0.5) * ( one +  cos_pt);
+  return fvec::mask_blend(between, value, half_cos);
+}
+
+static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
+  // Fills neigh_rebo (offset, num, num_half, entries) and the nC/nH sums for
+  // atoms [neigh_from_atom, neigh_to_atom), fvec::VL candidates at a time.
+  int offset = ka->neigh_from_atom * ka->num_neighs_per_atom;
+  int map_i = 0;  // bit t is set when ka->map[t] is nonzero
+  int i;
+  for (i = 1; i < ka->num_types; i++) {
+    if (ka->map[i])
+      map_i |= (1 << i);
+  }
+  ivec c_i1 = ivec::set1(1);
+  ivec c_im = ivec::set1(map_i);
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+
+  for (i = ka->neigh_from_atom; i < ka->neigh_to_atom; i++) {
+
+    fvec x_i = fvec::set1(x[i].x);
+    fvec y_i = fvec::set1(x[i].y);
+    fvec z_i = fvec::set1(x[i].z);
+    int itype = ka->map[ka->x[i].w];
+
+    fvec rcmaxsq0 = fvec::set1(ka->params.rcmaxsq[itype][0]);
+    fvec rcmaxsq1 = fvec::set1(ka->params.rcmaxsq[itype][1]);
+    fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+    fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+    fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+    fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+    fvec rcmaxskinsq0 = fvec::set1(
+        (ka->params.rcmax[itype][0] + ka->skin) * (ka->params.rcmax[itype][0] +
+                                                   ka->skin));
+    fvec rcmaxskinsq1 = fvec::set1(
+        (ka->params.rcmax[itype][1] + ka->skin) * (ka->params.rcmax[itype][1] +
+                                                   ka->skin));
+    fvec nC = fvec::setzero();
+    fvec nH = fvec::setzero();
+
+    ka->neigh_rebo.offset[i] = offset;
+    // Candidates: the neigh_lmp list on a rebuild, else the stored skin list.
+    int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] :
+      ka->neigh_rebo.num_half[i];
+    int * neighs = ka->rebuild_flag ?
+      &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] :
+      &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
+    int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
+    int n = 0;       // neighbors inside rcmax
+    int n_skin = 0;  // neighbors inside rcmax + skin (rebuild only)
+
+    int lowest_idx;
+    #pragma unroll(4)
+    for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) {
+      bvec j_mask = bvec::full();
+      if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx);
+
+      int * _noalias neighs_l = neighs + lowest_idx;
+      fvec x_j, y_j, z_j;
+      bvec jtype_mask;
+      ivec ji = ivec::maskz_loadu(j_mask, neighs_l);
+      aut_loadatoms_vec(x, ji,
+          &x_j, &y_j, &z_j, &jtype_mask, ka->map, c_im, c_i1);
+      fvec delx = x_i -  x_j;
+      fvec dely = y_i -  y_j;
+      fvec delz = z_i -  z_j;
+      fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+      if (ka->rebuild_flag) {
+        fvec rcmaxskinsq = fvec::mask_blend(jtype_mask, rcmaxskinsq0,
+                                            rcmaxskinsq1);
+        bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxskinsq);
+        ivec::mask_compressstore(c_mask, &skin_target[n_skin], ji);
+        n_skin += bvec::popcnt(c_mask);
+      }
+      fvec rcmaxsq = fvec::mask_blend(jtype_mask, rcmaxsq0, rcmaxsq1);
+      bvec c_mask = fvec::mask_cmplt(j_mask, rsq, rcmaxsq);
+      if (bvec::test_all_unset(c_mask)) continue;
+      ivec::mask_compressstore(c_mask, &ka->neigh_rebo.entries[offset + n], ji);
+      n += bvec::popcnt(c_mask);
+      fvec rcmax = fvec::mask_blend(jtype_mask, rcmax0, rcmax1);
+      fvec rcmin = fvec::mask_blend(jtype_mask, rcmin0, rcmin1);
+      fvec sp = aut_mask_Sp(c_mask, fvec::sqrt(rsq), rcmin, rcmax);
+      nC = fvec::mask_add(nC, bvec::kandn(jtype_mask, c_mask), nC, sp);
+      nH = fvec::mask_add(nH, bvec::kand (jtype_mask, c_mask), nH, sp);
+    }
+    ka->neigh_rebo.num[i] = n;
+    if (ka->rebuild_flag) {
+      // Layout per atom: REBO entries at offset, skin list at offset + n_skin.
+      for (int s = 0; s < n_skin; s++) {
+        ka->neigh_rebo.entries[offset+n_skin+s] = skin_target[s];
+      }
+      assert(n <= n_skin);
+      offset += 2 * n_skin;
+      ka->neigh_rebo.num_half[i] = n_skin;
+    } else {
+      assert(n <= jnum);
+      offset += 2 * jnum;
+    }
+    ka->nC[i] = fvec::reduce_add(nC);
+    ka->nH[i] = fvec::reduce_add(nH);
+  }
+}
+
+
+static fvec aut_eval_poly_lin_pd_2(int n, flt_t * vals, ivec idx, fvec x,
+                                   fvec * deriv) {
+  // Per-lane polynomial sum_k c_k x^k (k < n) plus its x-derivative in *deriv.
+  fvec one = fvec::set1(1);
+  fvec value = fvec::setzero();
+  *deriv = fvec::setzero();
+  fvec pow_k = one;                  // x^k
+  fvec pow_km1 = fvec::setzero();    // x^(k-1); zero while k == 0
+  fvec k_as_f = fvec::setzero();     // k held as a floating-point vector
+  for (int k = 0; k < n; ++k) {
+    fvec coeff = fvec::gather(idx, vals + k, sizeof(flt_t));  // c_k per lane
+    value = value +  coeff *  pow_k;
+    *deriv = *deriv +  coeff *  pow_km1 *  k_as_f;
+    pow_km1 = pow_k;
+    pow_k = pow_k *  x;
+    k_as_f = k_as_f +  one;
+  }
+  return value;
+}
+// g spline of cosjik and Nij; also returns its derivatives via *dgdc, *dgdN.
+static fvec aut_mask_gSpline_pd_2(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+				  bvec active_mask, int itype, fvec cosjik, 
+				  fvec Nij, fvec *dgdc, fvec *dgdN) {
+  int i;  // NOTE(review): active_mask is not referenced in this body
+  flt_t * gDom = NULL;  // cosjik domain boundaries; nDom + 1 values are read
+  int nDom = 0;
+  ivec offs = ivec::setzero();  // per-lane start of the coefficient set in gVal
+  fvec NCmin = fvec::set1(ka->params.NCmin);
+  bvec Ngt = fvec::cmpnle(Nij, NCmin); //gt
+  if (itype == 0) {  // type 0: 4 domains; lanes with Nij > NCmin start at 4*6
+    nDom = 4;
+    gDom = &ka->params.gCdom[0];
+    offs = ivec::mask_blend(Ngt, offs, ivec::set1(4*6));
+  } else {  // other types: 3 domains, fixed start at 8*6
+    nDom = 3;
+    gDom = &ka->params.gHdom[0];
+    offs = ivec::set1(8 * 6);
+  }
+  cosjik = fvec::max(fvec::set1(gDom[0]), fvec::min(fvec::set1(gDom[nDom]), 
+						    cosjik));  // clamp to the domain
+  ivec index6 = ivec::setzero();  // 6 * (index of the domain holding cosjik)
+  for (i = 0; i < nDom; i++) {  // a later domain wins on a shared boundary
+    bvec cosge = fvec::cmpnlt(cosjik, fvec::set1(gDom[i])); //ge
+    bvec cosle = fvec::cmple(cosjik, fvec::set1(gDom[i+1]));
+    index6 = ivec::mask_blend(cosge & cosle, index6, ivec::set1(6*i));
+  }
+  fvec g = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], offs +  index6, 
+				  cosjik, dgdc);  // six coefficients per domain
+  *dgdN = fvec::setzero();
+  if (itype == 0) {  // type 0 only: blend two coefficient sets over Nij
+    fvec NCmax = fvec::set1(ka->params.NCmax);
+    bvec Nlt = fvec::cmplt(Nij, NCmax); //lt
+    bvec Nmask = Ngt & Nlt;  // lanes with NCmin < Nij < NCmax
+    if (bvec::test_any_set(Nmask)) {
+      fvec dg1;
+      fvec g1 = aut_eval_poly_lin_pd_2(6, &ka->params.gVal[0], index6, cosjik, 
+				       &dg1);  // same domain, set at offset 0
+      fvec dS;
+      fvec cut = aut_Sp_deriv(Nij, NCmin, NCmax, &dS);
+      *dgdN = fvec::mask_mul(*dgdN, Nmask, dS, g1 -  g);  // uses the unblended g
+      g = fvec::mask_add(g, Nmask, g, cut * ( g1 -  g));  // g + cut * (g1 - g)
+      *dgdc = fvec::mask_add(*dgdc, Nmask, *dgdc, cut * ( dg1 -  *dgdc));
+    }
+  }
+  return g;
+}
+
+static fvec aut_PijSpline(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype,
+                          int jtype, fvec NijC, fvec NijH, fvec *dN2) {
+  // Lane-by-lane fallback: spill inputs, call scalar PijSpline(), reload.
+  flt_t nc_s[fvec::VL] __attribute__((aligned(64)));
+  flt_t nh_s[fvec::VL] __attribute__((aligned(64)));
+  flt_t p_s[fvec::VL] __attribute__((aligned(64)));
+  flt_t d0_s[fvec::VL] __attribute__((aligned(64)));
+  flt_t d1_s[fvec::VL] __attribute__((aligned(64)));
+  flt_t dN2_s[2];  // scalar derivative outputs, reused by every lane
+  fvec::store(nc_s, NijC);
+  fvec::store(nh_s, NijH);
+  for (int l = 0; l < fvec::VL; ++l) {
+    p_s[l] = PijSpline(ka, itype, jtype, nc_s[l], nh_s[l], dN2_s);
+    d0_s[l] = dN2_s[0];
+    d1_s[l] = dN2_s[1];
+  }
+  dN2[0] = fvec::load(d0_s);
+  dN2[1] = fvec::load(d1_s);
+  return fvec::load(p_s);
+}
+
+/*
+ * aut_frebo_data stores all the short-ranged coordinations
+ * and intermediate values that get reused frequently during
+ * bondorder calculations.
+ * The number of buffered entries (buf_len) should rarely exceed 4,
+ * so a capacity (BUF_CAP) of 8 is a very conservative value.
+ */
+static const int BUF_CAP = 8;
+struct aut_frebo_data {
+  fvec rikx_buf[BUF_CAP];  // x_i - x_k for each buffered neighbor k
+  fvec riky_buf[BUF_CAP];  // y_i - y_k
+  fvec rikz_buf[BUF_CAP];  // z_i - z_k
+  fvec rikmag_buf[BUF_CAP];  // length of the (rikx, riky, rikz) vector
+  fvec cosjik_buf[BUF_CAP];  // cosjik, clamped to [-1, 1]
+  ivec k_buf[BUF_CAP];  // neighbor indices k
+  fvec g_buf[BUF_CAP];  // g spline value
+  fvec dgdc_buf[BUF_CAP];  // dgdc output of the g spline
+  fvec ex_lam_buf[BUF_CAP];  // exp(lamdajik)
+  fvec wik_buf[BUF_CAP];  // switching weight for rikmag
+  fvec dwik_buf[BUF_CAP];  // derivative output that accompanies wik
+  fvec cutN_buf[BUF_CAP];  // switch evaluated on Nki
+  fvec dcutN_buf[BUF_CAP];  // derivative output that accompanies cutN
+  bvec ktype_buf[BUF_CAP];  // per-lane type mask of k
+  bvec mask_buf[BUF_CAP];  // lanes that were active when the entry was stored
+  fvec force_k_x_buf[BUF_CAP];  // presumably per-k force accumulators - confirm
+  fvec force_k_y_buf[BUF_CAP];
+  fvec force_k_z_buf[BUF_CAP];
+  int buf_len;  // number of valid entries in the *_buf arrays
+  fvec x_i;  // position of atom i, one value per lane
+  fvec y_i;
+  fvec z_i;
+  fvec x_j;  // position of atom j, one value per lane
+  fvec y_j;
+  fvec z_j;
+  fvec nCi;  // nC and nH values used for atom i
+  fvec nHi;
+  fvec force_i_x;  // presumably force accumulators for i and j - confirm
+  fvec force_i_y;
+  fvec force_i_z;
+  fvec force_j_x;
+  fvec force_j_y;
+  fvec force_j_z;
+};
+
+/*
+ * Initialize values in aut_frebo_data and perform the calculations
+ * for p_ij.
+ */
+static fvec aut_frebo_pij_pd_2(
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
+    struct aut_frebo_data * _noalias data,
+    int itype, int jtype,
+    ivec vi, ivec vj,
+    fvec rijx, fvec rijy, fvec rijz, fvec rijmag,
+    fvec wij, fvec VA, fvec * sum_N, fvec fij[3]
+) {
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  int * _noalias map = ka->map;
+  flt_t * _noalias nC = ka->nC;
+  flt_t * _noalias nH = ka->nH;
+  fvec x_i, y_i, z_i;
+  fvec x_j, y_j, z_j;
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  x_j = data->x_j;
+  y_j = data->y_j;
+  z_j = data->z_j;
+  fvec invrijm = fvec::recip(rijmag);
+  fvec invrijm2 = invrijm *  invrijm;
+  fvec rcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec rcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
+  fvec Nmin = fvec::set1(ka->params.Nmin);
+  fvec Nmax = fvec::set1(ka->params.Nmax);
+  int map_i_scalar = 0;
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec nCi = data->nCi;
+  fvec nHi = data->nHi;
+  fvec Nij = nHi +  nCi -  wij;
+  fvec factor_jtype, factor_not_jtype;
+  if (jtype) {
+    factor_jtype = fvec::set1(1);
+    factor_not_jtype = fvec::set1(0);
+  } else {
+    factor_jtype = fvec::set1(0);
+    factor_not_jtype = fvec::set1(1);
+  }
+  fvec NijC = nCi -  wij *  factor_not_jtype;
+  fvec NijH = nHi -  wij *  factor_jtype;
+  fvec sum_pij = fvec::setzero();
+  fvec sum_dpij_dN = fvec::setzero();
+  fvec dN2[2];
+  ivec offseti = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, 
+				   ka->neigh_rebo.offset, sizeof(int));
+  int buf_len = 0;
+  ivec knum = ivec::mask_gather(ivec::setzero(), bvec::full(), vi, 
+				ka->neigh_rebo.num, sizeof(int));
+  ivec kk = ivec::setzero();
+  bvec active_mask = ivec::cmplt(kk, knum);
+  ivec c_i1 = ivec::set1(1);
+  fvec rho_j = fvec::set1(ka->params.rho[jtype][1]);
+  fvec rho_k0 = fvec::set1(ka->params.rho[0][1]);
+  fvec rho_k1 = fvec::set1(ka->params.rho[1][1]);
+  fvec c_4 = fvec::set1(4);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_m2_0 = fvec::set1(-2.0);
+  fvec c_4_0 = fvec::set1(4.0);
+  fvec c_0_5 = fvec::set1(0.5);
+  fvec c_m0_5 = fvec::set1(-0.5);
+  fvec c_1 = fvec::set1(1);
+  fvec c_m1 = fvec::set1(-1);
+  fvec factor_itype = itype ? c_1 : fvec::setzero();
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero();
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  fvec result_f_j_x = fvec::setzero();
+  fvec result_f_j_y = fvec::setzero();
+  fvec result_f_j_z = fvec::setzero();
+  *sum_N = fvec::setzero();
+  {
+    while (bvec::test_any_set(active_mask)) {
+      ivec k = ivec::mask_gather(ivec::setzero(), active_mask, kk +  offseti, 
+				 ka->neigh_rebo.entries, sizeof(int));
+      bvec excluded_mask = ivec::cmpeq(k, vj) & active_mask;
+      if (bvec::test_any_set(excluded_mask)) {
+        kk = ivec::mask_add(kk, excluded_mask, kk, c_i1);
+        active_mask = ivec::cmplt(kk, knum);
+        continue;
+      }
+      fvec x_k, y_k, z_k;
+      bvec ktype_mask;
+      aut_loadatoms_vec(x, k, &x_k, &y_k, &z_k, &ktype_mask, ka->map, map_i, 
+			c_i1);
+      fvec rikx = x_i -  x_k;
+      fvec riky = y_i -  y_k;
+      fvec rikz = z_i -  z_k;
+      fvec rikmag = fvec::sqrt(rikx *  rikx +  riky *  riky +  rikz *  rikz);
+      fvec rho_k = fvec::mask_blend(ktype_mask, rho_k0, rho_k1);
+      fvec lamdajik = c_4 *  factor_itype * ( rho_k -  rikmag - ( rho_j -  
+								  rijmag));
+      fvec ex_lam = fvec::exp(lamdajik);
+      fvec rcmax = fvec::mask_blend(ktype_mask, rcmax0, rcmax1);
+      fvec rcmin = fvec::mask_blend(ktype_mask, rcmin0, rcmin1);
+      fvec dwik;
+      fvec wik = aut_Sp_deriv(rikmag, rcmin, rcmax, &dwik);
+      fvec Nki = fvec::gather(k, nC, sizeof(flt_t)) +  
+	fvec::gather(k, nH, sizeof(flt_t)) -  wik;
+      fvec cosjik = (rijx *  rikx +  rijy *  riky +  rijz *  rikz) / 
+	( rijmag *  rikmag);
+      cosjik = fvec::min(c_1, fvec::max(c_m1, cosjik));
+      fvec dgdc, dgdN;
+      fvec g = aut_mask_gSpline_pd_2(ka, active_mask, itype, cosjik, Nij, 
+				     &dgdc, &dgdN);
+      sum_pij = fvec::mask_add(sum_pij, active_mask, sum_pij, wik * g * ex_lam);
+      sum_dpij_dN = fvec::mask_add(sum_dpij_dN, active_mask, sum_dpij_dN, 
+				   wik * ex_lam * dgdN);
+      fvec dcutN;
+      fvec cutN = aut_Sp_deriv(Nki, Nmin, Nmax, &dcutN);
+      *sum_N = fvec::mask_add(*sum_N, active_mask, *sum_N, 
+			      fvec::mask_blend(ktype_mask, c_1, 
+					       fvec::setzero()) * wik * cutN);
+      if (buf_len == BUF_CAP) goto exceed_buffer;
+      data->rikx_buf[buf_len] = rikx;
+      data->riky_buf[buf_len] = riky;
+      data->rikz_buf[buf_len] = rikz;
+      data->rikmag_buf[buf_len] = rikmag;
+      data->cosjik_buf[buf_len] = cosjik;
+      data->ktype_buf[buf_len] = ktype_mask;
+      data->k_buf[buf_len] = k;
+      data->g_buf[buf_len] = g;
+      data->dgdc_buf[buf_len] = dgdc;
+      data->ex_lam_buf[buf_len] = ex_lam;
+      data->wik_buf[buf_len] = wik;
+      data->dwik_buf[buf_len] = dwik;
+      data->mask_buf[buf_len] = active_mask;
+      data->cutN_buf[buf_len] = cutN;
+      data->dcutN_buf[buf_len] = dcutN;
+      buf_len += 1;
+      kk = ivec::mask_add(kk, active_mask, kk, c_i1);
+      active_mask = ivec::cmplt(kk, knum);
+    }
+    data->buf_len = buf_len;
+    fvec PijS = aut_PijSpline(ka, itype, jtype, NijC, NijH, &dN2[0]);
+    fvec pij = fvec::invsqrt(c_1 + sum_pij + PijS);
+    fvec tmp = c_m0_5 * pij * pij * pij;
+    int buf_idx;
+    for (buf_idx = 0; buf_idx < buf_len; buf_idx++) {
+      fvec rikx = data->rikx_buf[buf_idx];
+      fvec riky = data->riky_buf[buf_idx];
+      fvec rikz = data->rikz_buf[buf_idx];
+      fvec rikmag = data->rikmag_buf[buf_idx];
+      fvec cosjik = data->cosjik_buf[buf_idx];
+      bvec ktype_mask = data->ktype_buf[buf_idx];
+      ivec k = data->k_buf[buf_idx];
+      fvec g = data->g_buf[buf_idx];
+      fvec dgdc = data->dgdc_buf[buf_idx];
+      fvec ex_lam = data->ex_lam_buf[buf_idx];
+      fvec wik = data->wik_buf[buf_idx];
+      fvec dwik = data->dwik_buf[buf_idx];
+      bvec mask = data->mask_buf[buf_idx];
+      fvec invrikm = fvec::recip(rikmag);
+      fvec rjkx = rikx -  rijx;
+      fvec rjky = riky -  rijy;
+      fvec rjkz = rikz -  rijz;
+      fvec rjkmag = fvec::sqrt(
+           rjkx *  rjkx +  rjky *  rjky +  rjkz *  rjkz);
+      fvec rijrik = c_2_0 *  rijmag *  rikmag;
+      fvec rr = rijmag *  rijmag -  rikmag *  rikmag;
+      fvec dctdjk = c_m2_0 /  rijrik;
+      fvec dctdik = (rjkmag *  rjkmag -  rr) / ( rijrik *  rikmag *  rikmag);
+      fvec dctdij = (rjkmag *  rjkmag +  rr) / ( rijrik *  rijmag *  rijmag);
+      fvec fi[3], fj[3], fk[3];
+      fvec pref = c_0_5 *  VA *  tmp;
+      fvec tmp20 = pref *  wik *  dgdc *  ex_lam;
+      fj[0] = fj[1] = fj[2] = fvec::setzero();
+      fvec tmpdik = tmp20 *  dctdik;
+      fi[0] = fvec::setzero() -  tmpdik *  rikx;
+      fi[1] = fvec::setzero() -  tmpdik *  riky;
+      fi[2] = fvec::setzero() -  tmpdik *  rikz;
+      fk[0] = tmpdik *  rikx;
+      fk[1] = tmpdik *  riky;
+      fk[2] = tmpdik *  rikz;
+
+      fvec tmpdij = tmp20 *  dctdij;
+      fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmpdij *  rijx);
+      fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmpdij *  rijy);
+      fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmpdij *  rijz);
+
+      fvec tmpdjk = tmp20 *  dctdjk;
+      fi[0] = fi[0] -  tmpdjk *  rjkx;
+      fi[1] = fi[1] -  tmpdjk *  rjky;
+      fi[2] = fi[2] -  tmpdjk *  rjkz;
+      fk[0] = fk[0] +  tmpdjk *  rjkx;
+      fk[1] = fk[1] +  tmpdjk *  rjky;
+      fk[2] = fk[2] +  tmpdjk *  rjkz;
+      fij[0] = fvec::mask_add(fij[0], mask, fij[0], tmpdjk *  rjkx);
+      fij[1] = fvec::mask_add(fij[1], mask, fij[1], tmpdjk *  rjky);
+      fij[2] = fvec::mask_add(fij[2], mask, fij[2], tmpdjk *  rjkz);
+
+      if (itype) {
+        fvec tmp21 = pref *  wik *  g *  ex_lam *  c_4_0;
+        fvec tmp21ij = tmp21 *  invrijm;
+        fij[0] = fvec::mask_sub(fij[0], mask, fij[0], tmp21ij * rijx);
+        fij[1] = fvec::mask_sub(fij[1], mask, fij[1], tmp21ij * rijy);
+        fij[2] = fvec::mask_sub(fij[2], mask, fij[2], tmp21ij * rijz);
+        fvec tmp21ik = tmp21 * invrikm;
+        fi[0] = fi[0] +  tmp21ik *  rikx;
+        fi[1] = fi[1] +  tmp21ik *  riky;
+        fi[2] = fi[2] +  tmp21ik *  rikz;
+        fk[0] = fk[0] -  tmp21ik *  rikx;
+        fk[1] = fk[1] -  tmp21ik *  riky;
+        fk[2] = fk[2] -  tmp21ik *  rikz;
+      }
+
+      // coordination forces
+
+      // dwik forces
+      fvec tmp22 = pref *  dwik *  g *  ex_lam *  invrikm;
+      fi[0] = fi[0] -  tmp22 *  rikx;
+      fi[1] = fi[1] -  tmp22 *  riky;
+      fi[2] = fi[2] -  tmp22 *  rikz;
+      fk[0] = fk[0] +  tmp22 *  rikx;
+      fk[1] = fk[1] +  tmp22 *  riky;
+      fk[2] = fk[2] +  tmp22 *  rikz;
+
+      // PIJ forces
+      fvec dN2ktype = fvec::mask_blend(ktype_mask, dN2[0], dN2[1]);
+      fvec tmp23 = pref *  dN2ktype *  dwik *  invrikm;
+      fi[0] = fi[0] -  tmp23 *  rikx;
+      fi[1] = fi[1] -  tmp23 *  riky;
+      fi[2] = fi[2] -  tmp23 *  rikz;
+      fk[0] = fk[0] +  tmp23 *  rikx;
+      fk[1] = fk[1] +  tmp23 *  riky;
+      fk[2] = fk[2] +  tmp23 *  rikz;
+
+      // dgdN forces
+      fvec tmp24 = pref *  sum_dpij_dN *  dwik *  invrikm;
+      fi[0] = fi[0] -  tmp24 *  rikx;
+      fi[1] = fi[1] -  tmp24 *  riky;
+      fi[2] = fi[2] -  tmp24 *  rikz;
+      fk[0] = fk[0] +  tmp24 *  rikx;
+      fk[1] = fk[1] +  tmp24 *  riky;
+      fk[2] = fk[2] +  tmp24 *  rikz;
+
+      result_f_i_x = fvec::mask_add(result_f_i_x, mask, result_f_i_x, fi[0]);
+      result_f_i_y = fvec::mask_add(result_f_i_y, mask, result_f_i_y, fi[1]);
+      result_f_i_z = fvec::mask_add(result_f_i_z, mask, result_f_i_z, fi[2]);
+      result_f_j_x = fvec::mask_add(result_f_j_x, mask, result_f_j_x, fj[0]);
+      result_f_j_y = fvec::mask_add(result_f_j_y, mask, result_f_j_y, fj[1]);
+      result_f_j_z = fvec::mask_add(result_f_j_z, mask, result_f_j_z, fj[2]);
+
+      data->force_k_x_buf[buf_idx] = fk[0];
+      data->force_k_y_buf[buf_idx] = fk[1];
+      data->force_k_z_buf[buf_idx] = fk[2];
+    }
+    data->force_i_x = result_f_i_x;
+    data->force_i_y = result_f_i_y;
+    data->force_i_z = result_f_i_z;
+    data->force_j_x = result_f_j_x;
+    data->force_j_y = result_f_j_y;
+    data->force_j_z = result_f_j_z;
+    return pij;
+  }
+  exceed_buffer:
+  data->buf_len = -1;
+  return fvec::setzero();
+}
+
+/*
+ * Apply the force values stored in aut_frebo_data (force_k_*_buf) to
+ * the respective neighbors in ka->result_f, one active lane at a time.
+ */
+static void aut_frebo_data_writeback(
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka, 
+    struct aut_frebo_data * _noalias data) {
+  ResultForceT<acc_t> * _noalias result_f = ka->result_f;
+  flt_t fk_x_buf[fvec::VL] __attribute__((aligned(64))); // scalar staging for one vector of x forces
+  flt_t fk_y_buf[fvec::VL] __attribute__((aligned(64))); // ... y forces
+  flt_t fk_z_buf[fvec::VL] __attribute__((aligned(64))); // ... z forces
+  int fk_k_buf[ivec::VL] __attribute__((aligned(64))); // NOTE(review): sized by ivec::VL but indexed up to fvec::VL below - confirm ivec::VL >= fvec::VL
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) { // runs zero times when buf_len <= 0
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+
+    fvec::store(fk_x_buf, data->force_k_x_buf[buf_idx]); // spill the vectors so lanes can be read as scalars
+    fvec::store(fk_y_buf, data->force_k_y_buf[buf_idx]);
+    fvec::store(fk_z_buf, data->force_k_z_buf[buf_idx]);
+    ivec::store(fk_k_buf, k);
+
+    int lane;
+    for (lane = 0; lane < fvec::VL; lane++) {
+      if (bvec::test_at(active_mask, lane)) {} else continue; // skip lanes whose mask test fails
+      int kk = fk_k_buf[lane]; // index into result_f for this lane
+      result_f[kk].x += fk_x_buf[lane];
+      result_f[kk].y += fk_y_buf[lane];
+      result_f[kk].z += fk_z_buf[lane];
+    }
+  }
+}
+
+static void aut_frebo_N_spline_force( // adds dN/dNconj-weighted force terms to data's force buffers and to ka->result_f
+     KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka, 
+     struct aut_frebo_data * _noalias data, int itype, int jtype, ivec vi, 
+     ivec vj, fvec VA, fvec dN, fvec dNconj, fvec Nconj) { // NOTE(review): jtype and vj are never referenced in the body
+  ivec c_i1 = ivec::set1(1); // NOTE(review): not referenced again in this function
+  fvec c_2 = fvec::set1(2);
+  fvec c_TOL = fvec::set1(TOL);
+  ResultForceT<acc_t> * _noalias result_f = ka->result_f;
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  int * _noalias map = ka->map;
+  flt_t * _noalias nC = ka->nC; // NOTE(review): not referenced again in this function
+  flt_t * _noalias nH = ka->nH; // NOTE(review): not referenced again in this function
+  fvec x_i, y_i, z_i; // NOTE(review): assigned below but never read
+  x_i = data->x_i;
+  y_i = data->y_i;
+  z_i = data->z_i;
+  fvec Nmin = fvec::set1(ka->params.Nmin); // NOTE(review): not referenced again in this function
+  fvec Nmax = fvec::set1(ka->params.Nmax); // NOTE(review): not referenced again in this function
+  int map_i_scalar = 0; // bitmask: bit i is set when ka->map[i] is non-zero (i starts at 1)
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar); // NOTE(review): not referenced again in this function
+  fvec dN2[2]; // NOTE(review): not referenced again in this function
+  ivec kk = ivec::setzero(); // NOTE(review): never read; the lane loop declares its own scalar kk
+  fvec rcmax0 = fvec::set1(ka->params.rcmax[itype][0]); // NOTE(review): the four rcmax/rcmin vectors are not referenced again
+  fvec rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+  fvec rcmin0 = fvec::set1(ka->params.rcmin[itype][0]);
+  fvec rcmin1 = fvec::set1(ka->params.rcmin[itype][1]);
+  fvec result_f_i_x = fvec::setzero(); // local accumulators, folded into data->force_i_* after the loop
+  fvec result_f_i_y = fvec::setzero();
+  fvec result_f_i_z = fvec::setzero();
+  int buf_idx;
+  for (buf_idx = 0; buf_idx < data->buf_len; buf_idx++) { // one iteration per buffered vector of k neighbors
+    ivec k = data->k_buf[buf_idx];
+    bvec active_mask = data->mask_buf[buf_idx];
+    fvec rikx = data->rikx_buf[buf_idx];
+    fvec riky = data->riky_buf[buf_idx];
+    fvec rikz = data->rikz_buf[buf_idx];
+    fvec rikmag = data->rikmag_buf[buf_idx];
+    bvec ktype_mask = data->ktype_buf[buf_idx];
+
+    fvec dwik = data->dwik_buf[buf_idx];
+    fvec wik = data->wik_buf[buf_idx];
+
+    fvec dNki = data->dcutN_buf[buf_idx];
+    fvec SpN = data->cutN_buf[buf_idx];
+
+    fvec invrikmag = fvec::recip(rikmag);
+    fvec pref = VA *  dwik *  invrikmag;
+    fvec fdN = dN *  pref;
+    fvec fdNconj = pref *  SpN *  c_2 *  dNconj *  Nconj;
+    fvec ffactor = fdN;
+    bvec ktype_is_C = ~ ktype_mask; // complement of ktype_mask; the name suggests "k is carbon" - TODO confirm
+    ffactor = fvec::mask_add(ffactor, ktype_is_C, ffactor,  fdNconj); // fdNconj term is applied under the ktype_is_C mask only
+
+    fvec fkx = ffactor *  rikx;
+    fvec fky = ffactor *  riky;
+    fvec fkz = ffactor *  rikz;
+
+    data->force_k_x_buf[buf_idx] = data->force_k_x_buf[buf_idx] +  fkx; // plain add: active_mask is not applied to the k buffers here
+    data->force_k_y_buf[buf_idx] = data->force_k_y_buf[buf_idx] +  fky;
+    data->force_k_z_buf[buf_idx] = data->force_k_z_buf[buf_idx] +  fkz;
+
+    result_f_i_x = fvec::mask_sub(result_f_i_x, active_mask, result_f_i_x, fkx); // opposite-sign contribution for i, under active_mask
+    result_f_i_y = fvec::mask_sub(result_f_i_y, active_mask, result_f_i_y, fky);
+    result_f_i_z = fvec::mask_sub(result_f_i_z, active_mask, result_f_i_z, fkz);
+
+    bvec need_k_neighs = fvec::mask_cmpnle(active_mask, fvec::abs(dNki), c_TOL) // under active_mask: not(|dNki| <= TOL)
+      & ktype_is_C;
+    if (bvec::test_any_set(need_k_neighs)) { // scalar fallback, entered only if at least one lane qualifies
+      int lane;
+      for (lane = 0; lane < fvec::VL; lane++) {
+        if (! bvec::test_at(need_k_neighs, lane)) continue;
+        int kk = ivec::at(k, lane); // reads the vector k; the scalar k on the next line is not yet in scope
+        int k = kk; // from here on, k is this lane's scalar index and shadows the vector k
+        int ktype = map[x[k].w];
+        int i = ivec::at(vi, lane);
+        fvec oldVA = VA; // copy the vector first: the scalar declared next shadows the name
+        double VA = fvec::at(oldVA, lane);
+        fvec oldwik = wik; // same copy-then-shadow pattern for wik, dNconj, Nconj and dNki
+        double wik = fvec::at(oldwik, lane);
+        fvec olddNconj = dNconj;
+        double dNconj = fvec::at(olddNconj, lane);
+        fvec oldNconj = Nconj;
+        double Nconj = fvec::at(oldNconj, lane);
+        fvec olddNki = dNki;
+        double dNki = fvec::at(olddNki, lane);
+        int * neighs_k = ka->neigh_rebo.entries + ka->neigh_rebo.offset[k]; // start of k's entries in the neigh_rebo list
+        int nnum = ka->neigh_rebo.num[k];
+        int nn;
+        for (nn = 0; nn < nnum; nn++) {
+          int n = neighs_k[nn];
+          if (n == i) continue; // skip atom i itself
+          double rknx = x[k].x - x[n].x;
+          double rkny = x[k].y - x[n].y;
+          double rknz = x[k].z - x[n].z;
+          double rknmag = sqrt(rknx * rknx + rkny * rkny + rknz * rknz);
+          int ntype = map[x[n].w];
+          double rcminkn = ka->params.rcmin[ktype][ntype];
+          double rcmaxkn = ka->params.rcmax[ktype][ntype];
+          double dwkn;
+          Sp(rknmag, rcminkn, rcmaxkn, &dwkn); // return value discarded; only the dwkn out-parameter is used below
+          double ffactor = VA * dNconj * 2 * Nconj * wik * dNki * dwkn / rknmag; // scalar ffactor shadows the vector ffactor above
+          result_f[k].x -= ffactor * rknx; // equal and opposite updates on k and n, written straight to result_f
+          result_f[k].y -= ffactor * rkny;
+          result_f[k].z -= ffactor * rknz;
+          result_f[n].x += ffactor * rknx;
+          result_f[n].y += ffactor * rkny;
+          result_f[n].z += ffactor * rknz;
+        }
+      }
+    }
+  }
+  data->force_i_x = data->force_i_x +  result_f_i_x; // fold the local accumulators into the buffered force on i
+  data->force_i_y = data->force_i_y +  result_f_i_y;
+  data->force_i_z = data->force_i_z +  result_f_i_z;
+}
+
+static fvec aut_frebo_pi_rc_pd(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, // vector wrapper: calls scalar frebo_pi_rc once per lane
+			       int jtype, fvec Nij, fvec Nji, fvec Nijconj, 
+			       fvec * dN3) { // dN3: output, three vectors filled from frebo_pi_rc's 3-element out-array
+  flt_t ret[fvec::VL] __attribute__((aligned(64))); // per-lane return values
+  flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); // per-lane out-array values, stored component-major
+  int i;
+  for (i = 0; i < fvec::VL; i++) { // every lane is evaluated; no mask is applied
+    flt_t dN3tmp[3];
+    ret[i] = frebo_pi_rc(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), 
+			 fvec::at(Nijconj, i), &dN3tmp[0]);
+    dN3ret[0][i] = dN3tmp[0];
+    dN3ret[1][i] = dN3tmp[1];
+    dN3ret[2][i] = dN3tmp[2];
+  }
+  dN3[0] = fvec::load(&dN3ret[0][0]); // repack the scalar results into vectors
+  dN3[1] = fvec::load(&dN3ret[1][0]);
+  dN3[2] = fvec::load(&dN3ret[2][0]);
+  return fvec::load(&ret[0]);
+}
+
+static fvec aut_frebo_Tij(KernelArgsAIREBOT<flt_t,acc_t> * ka, int itype, // vector wrapper: calls scalar frebo_Tij once per lane
+			  int jtype, fvec Nij, fvec Nji, fvec Nijconj, 
+			  fvec * dN3) { // dN3: output, three vectors filled from frebo_Tij's 3-element out-array
+  flt_t ret[fvec::VL] __attribute__((aligned(64))); // per-lane return values
+  flt_t dN3ret[3][fvec::VL] __attribute__((aligned(64))); // per-lane out-array values, stored component-major
+  int i;
+  for (i = 0; i < fvec::VL; i++) { // every lane is evaluated; no mask is applied
+    flt_t dN3tmp[3];
+    ret[i] = frebo_Tij(ka, itype, jtype, fvec::at(Nij, i), fvec::at(Nji, i), 
+		       fvec::at(Nijconj, i), &dN3tmp[0]);
+    dN3ret[0][i] = dN3tmp[0];
+    dN3ret[1][i] = dN3tmp[1];
+    dN3ret[2][i] = dN3tmp[2];
+  }
+  dN3[0] = fvec::load(&dN3ret[0][0]); // repack the scalar results into vectors
+  dN3[1] = fvec::load(&dN3ret[1][0]);
+  dN3[2] = fvec::load(&dN3ret[2][0]);
+  return fvec::load(&ret[0]);
+}
+
+static fvec aut_frebo_sum_omega( // returns the masked sum over buffered (k, l) pairs; also accumulates forces into i_data, j_data and fij
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
+    struct aut_frebo_data * _noalias i_data,
+    struct aut_frebo_data * _noalias j_data,
+    int itype, int jtype, // NOTE(review): not referenced in the body
+    ivec vi, ivec vj, // NOTE(review): not referenced in the body
+    fvec r23x, fvec r23y, fvec r23z, fvec r23mag,
+    fvec VA, fvec fij[3]
+) {
+  fvec c_1 = fvec::set1(1);
+  fvec c_m1 = fvec::set1(-1);
+  fvec c_2 = fvec::set1(2);
+  fvec c_m2 = fvec::set1(-2);
+  fvec sum_omega = fvec::setzero();
+  fvec thmin = fvec::set1(ka->params.thmin);
+  fvec thmax = fvec::set1(ka->params.thmax);
+  // 2 == i, 3 == j
+  fvec r32x = fvec::setzero() -  r23x; // r32 = -r23
+  fvec r32y = fvec::setzero() -  r23y;
+  fvec r32z = fvec::setzero() -  r23z;
+  int buf_idx_i, buf_idx_j;
+  for (buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) {
+    // a1 == k == buf_idx_i
+    bvec mask_start = i_data->mask_buf[buf_idx_i];
+    fvec r21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21mag = i_data->rikmag_buf[buf_idx_i];
+    // TODO use buffered cosjik
+    fvec cos321 = (
+        r23x *  r21x +  r23y *  r21y +  r23z *  r21z) / ( r23mag *  r21mag);
+    cos321 = fvec::min(c_1, fvec::max(c_m1, cos321)); // clamp to [-1, 1] so the sqrt argument below is not negative
+    fvec sin321 = fvec::sqrt(c_1 -  cos321 *  cos321);
+    bvec mask_outer = fvec::cmpneq(fvec::setzero(), sin321) & mask_start; // active lanes with sin321 != 0
+    // add "continue"
+    fvec sink2i = fvec::mask_recip(fvec::undefined(), mask_outer, // presumably lanes outside the mask take the fvec::undefined() source - TODO confirm
+				   sin321 * sin321);
+    fvec rik2i = fvec::mask_recip(fvec::undefined(), mask_outer, 
+				  r21mag * r21mag);
+    fvec rr = r23mag *  r23mag -  r21mag *  r21mag; // outer rr; the inner loop declares its own rr that shadows this one
+    fvec r31x = r21x -  r23x;
+    fvec r31y = r21y -  r23y;
+    fvec r31z = r21z -  r23z;
+    fvec r31mag2 = r31x *  r31x +  r31y *  r31y +  r31z *  r31z;
+    fvec rijrik = c_2 *  r23mag *  r21mag;
+    fvec r21mag2 = r21mag *  r21mag;
+    fvec dctik = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 -  rr, 
+				rijrik *  r21mag2);
+    fvec dctij = fvec::mask_div(fvec::undefined(), mask_outer, r31mag2 +  rr, 
+				rijrik *  r23mag *  r23mag);
+    fvec dctjk = fvec::mask_div(fvec::undefined(), mask_outer, c_m2, rijrik);
+    fvec dw21 = i_data->dwik_buf[buf_idx_i];
+    fvec w21 = i_data->wik_buf[buf_idx_i];
+    fvec dtsjik;
+    fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
+    dtsjik = fvec::setzero() -  dtsjik; // todo replace by appropriate xor.
+    ivec k = i_data->k_buf[buf_idx_i];
+    for (buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) {
+      // check l == k in second loop.
+      // l == a4 == buf_idx_j
+      ivec l = j_data->k_buf[buf_idx_j];
+      bvec mask_inner_0 = ivec::mask_cmpneq(mask_outer, k, l) & // lanes in mask_outer with k != l that are also active in j_data
+	j_data->mask_buf[buf_idx_j];
+      // add "continue"
+      fvec r34x = j_data->rikx_buf[buf_idx_j];
+      fvec r34y = j_data->riky_buf[buf_idx_j];
+      fvec r34z = j_data->rikz_buf[buf_idx_j];
+      fvec r34mag = j_data->rikmag_buf[buf_idx_j];
+      fvec cos234 = fvec::mask_div(fvec::undefined(), mask_inner_0, 
+				   r32x * r34x + r32y * r34y + r32z * r34z, 
+				   r23mag * r34mag);
+      cos234 = fvec::min(c_1, fvec::max(c_m1, cos234)); // clamp to [-1, 1], as for cos321
+      fvec sin234 = fvec::mask_sqrt(fvec::undefined(), mask_inner_0, 
+				    c_1 - cos234 * cos234);
+      bvec mask_inner_1 = fvec::mask_cmpneq(mask_inner_0, sin234, // final mask: additionally requires sin234 != 0
+					    fvec::setzero());
+      // add "continue"
+      fvec sinl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, 
+				     sin234 * sin234);
+      fvec rjl2i = fvec::mask_recip(fvec::undefined(), mask_inner_1, 
+				    r34mag * r34mag);
+      fvec dw34 = j_data->dwik_buf[buf_idx_j];
+      fvec w34 = j_data->wik_buf[buf_idx_j];
+      fvec rr = r23mag *  r23mag - r34mag * r34mag; // shadows the outer rr for the rest of this iteration
+      fvec r24x = r23x +  r34x;
+      fvec r24y = r23y +  r34y;
+      fvec r24z = r23z +  r34z;
+      fvec r242 = r24x *  r24x +  r24y *  r24y +  r24z *  r24z;
+      fvec rijrjl = c_2 *  r23mag *  r34mag;
+      fvec rjl2 = r34mag *  r34mag;
+      fvec dctjl = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 -  rr, 
+				  rijrjl *  rjl2);
+      fvec dctji = fvec::mask_div(fvec::undefined(), mask_inner_1, r242 +  rr, 
+				  rijrjl *  r23mag *  r23mag);
+      fvec dctil = fvec::mask_div(fvec::undefined(), mask_inner_1, c_m2, 
+				  rijrjl);
+      fvec dtsijl;
+      fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
+      dtsijl = fvec::setzero() -  dtsijl; // negated, same as dtsjik above
+      fvec prefactor = VA;
+
+      fvec cross321x = r32y *  r21z -  r32z *  r21y; // cross321 = r32 x r21
+      fvec cross321y = r32z *  r21x -  r32x *  r21z;
+      fvec cross321z = r32x *  r21y -  r32y *  r21x;
+      fvec cross234x = r23y *  r34z -  r23z *  r34y; // cross234 = r23 x r34
+      fvec cross234y = r23z *  r34x -  r23x *  r34z;
+      fvec cross234z = r23x *  r34y -  r23y *  r34x;
+
+      fvec cwnum = cross321x * cross234x + cross321y * cross234y + cross321z *
+	cross234z;
+      fvec cwnom = r21mag * r34mag * r23mag * r23mag * sin321 * sin234;
+      fvec om1234 = fvec::mask_div(fvec::undefined(), mask_inner_1, cwnum, 
+				   cwnom);
+      fvec cw = om1234; // alias of om1234, used in aa below
+      fvec sum_omega_contrib = (c_1 -  om1234 *  om1234) *  w21 *  w34 *
+	(c_1 -  tspjik) * ( c_1 -  tspijl);
+      sum_omega = fvec::mask_add(sum_omega, mask_inner_1, sum_omega, // accumulated under mask_inner_1 only
+				 sum_omega_contrib);
+      fvec dt1dik = rik2i -  dctik *  sink2i *  cos321;
+      fvec dt1djk = fvec::setzero() -  dctjk *  sink2i *  cos321;
+      fvec dt1djl = rjl2i -  dctjl *  sinl2i *  cos234;
+      fvec dt1dil = fvec::setzero() -  dctil *  sinl2i *  cos234;
+      fvec dt1dij =   fvec::mask_div(fvec::undefined(), mask_inner_1, c_2, 
+				     r23mag * r23mag) - 
+	dctij * sink2i * cos321 -  dctji *  sinl2i *  cos234;
+
+      fvec dt2dikx = r23y *  cross234z -  r23z *  cross234y; // dt2dik = r23 x cross234
+      fvec dt2diky = r23z *  cross234x -  r23x *  cross234z;
+      fvec dt2dikz = r23x *  cross234y -  r23y *  cross234x;
+
+      fvec dt2djlx = r23z *  cross321y -  r23y *  cross321z; // dt2djl = cross321 x r23
+      fvec dt2djly = r23x *  cross321z -  r23z *  cross321x;
+      fvec dt2djlz = r23y *  cross321x -  r23x *  cross321y;
+
+      fvec dt2dijx = r21z *  cross234y +  r34y *  cross321z -
+	( r34z *  cross321y +  r21y *  cross234z);
+      fvec dt2dijy = r21x *  cross234z +  r34z *  cross321x -
+	( r34x *  cross321z +  r21z *  cross234x);
+      fvec dt2dijz = r21y *  cross234x +  r34x *  cross321y -
+	( r34y *  cross321x +  r21x *  cross234y);
+
+      fvec aa = prefactor *  c_2 *  fvec::mask_div(fvec::undefined(), 
+						   mask_inner_1, cw, cwnom) *
+	w21 *  w34 *  (c_1 -  tspjik) * ( c_1 -  tspijl);
+      fvec aaa1 = (fvec::setzero() - prefactor) * (c_1 - om1234 * om1234) * // NOTE(review): aaa1 is not referenced after this statement
+	(c_1 - tspjik) * (c_1 - tspijl);
+      fvec aaa2 = (fvec::setzero() -  prefactor) * (c_1 -  om1234 *  om1234) *
+	w21 * w34;
+      fvec at2 = aa * cwnum;
+
+      fvec fcijpc = aaa2 * dtsjik * dctij * (c_1 - tspijl) +  aaa2 * dtsijl * 
+	dctji * (c_1 - tspjik) - dt1dij * at2;
+      fvec fcikpc =  aaa2 * dtsjik * dctik * (c_1 - tspijl) - dt1dik * at2;
+      fvec fcjlpc =  aaa2 * dtsijl * dctjl * (c_1 - tspjik) - dt1djl * at2;
+      fvec fcjkpc =  aaa2 * dtsjik * dctjk * (c_1 - tspijl) - dt1djk * at2;
+      fvec fcilpc =  aaa2 * dtsijl * dctil * (c_1 - tspjik) - dt1dil * at2;
+
+      fvec F23x = fcijpc *  r23x +  aa *  dt2dijx;
+      fvec F23y = fcijpc *  r23y +  aa *  dt2dijy;
+      fvec F23z = fcijpc *  r23z +  aa *  dt2dijz;
+
+      fvec F12x = fcikpc *  r21x +  aa *  dt2dikx;
+      fvec F12y = fcikpc *  r21y +  aa *  dt2diky;
+      fvec F12z = fcikpc *  r21z +  aa *  dt2dikz;
+
+      fvec F34x = fcjlpc *  r34x +  aa *  dt2djlx;
+      fvec F34y = fcjlpc *  r34y +  aa *  dt2djly;
+      fvec F34z = fcjlpc *  r34z +  aa *  dt2djlz;
+
+      fvec F31x = fcjkpc *  r31x;
+      fvec F31y = fcjkpc *  r31y;
+      fvec F31z = fcjkpc *  r31z;
+
+      fvec F24x = fcilpc *  r24x;
+      fvec F24y = fcilpc *  r24y;
+      fvec F24z = fcilpc *  r24z;
+
+      fvec f1x = fvec::setzero() - ( F12x +  F31x); // f1 starts as the exact negative of f2
+      fvec f1y = fvec::setzero() - ( F12y +  F31y);
+      fvec f1z = fvec::setzero() - ( F12z +  F31z);
+      fvec f2x = F12x +  F31x;
+      fvec f2y = F12y +  F31y;
+      fvec f2z = F12z +  F31z;
+      fvec f3x = F34x +  F24x;
+      fvec f3y = F34y +  F24y;
+      fvec f3z = F34z +  F24z;
+      fvec f4x = fvec::setzero() - ( F34x +  F24x); // f4 starts as the exact negative of f3
+      fvec f4y = fvec::setzero() - ( F34y +  F24y);
+      fvec f4z = fvec::setzero() - ( F34z +  F24z);
+
+      fij[0] = fvec::mask_add(fij[0], mask_inner_1, fij[0], // caller-owned fij is updated in place, under mask_inner_1
+          F23x +  F24x -  F31x);
+      fij[1] = fvec::mask_add(fij[1], mask_inner_1, fij[1],
+          F23y +  F24y -  F31y);
+      fij[2] = fvec::mask_add(fij[2], mask_inner_1, fij[2],
+          F23z +  F24z -  F31z);
+
+      fvec tmp20 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * // dw21 term, directed along r21
+	(c_1 - tspijl) * dw21 * w34 * fvec::mask_recip(fvec::undefined(), 
+						       mask_inner_1, r21mag);
+      f2x = f2x -  tmp20 *  r21x;
+      f2y = f2y -  tmp20 *  r21y;
+      f2z = f2z -  tmp20 *  r21z;
+      f1x = f1x +  tmp20 *  r21x;
+      f1y = f1y +  tmp20 *  r21y;
+      f1z = f1z +  tmp20 *  r21z;
+
+      fvec tmp21 = VA * (c_1 - om1234 * om1234) * (c_1 - tspjik) * // dw34 term, directed along r34
+	(c_1 - tspijl) * w21 * dw34 * fvec::mask_recip(fvec::undefined(), 
+						       mask_inner_1, r34mag);
+      f3x = f3x -  tmp21 *  r34x;
+      f3y = f3y -  tmp21 *  r34y;
+      f3z = f3z -  tmp21 *  r34z;
+      f4x = f4x +  tmp21 *  r34x;
+      f4y = f4y +  tmp21 *  r34y;
+      f4z = f4z +  tmp21 *  r34z;
+
+      // 1 == buf_idx_i, 2 == i, 3 == j, 4 == buf_idx_j
+      i_data->force_k_x_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], 
+		       mask_inner_1, i_data->force_k_x_buf[buf_idx_i], f1x);
+      i_data->force_k_y_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_1, 
+		       i_data->force_k_y_buf[buf_idx_i], f1y);
+      i_data->force_k_z_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_1, 
+		       i_data->force_k_z_buf[buf_idx_i], f1z);
+      i_data->force_i_x = 
+	fvec::mask_add(i_data->force_i_x, mask_inner_1, i_data->force_i_x, f2x);
+      i_data->force_i_y = 
+	fvec::mask_add(i_data->force_i_y, mask_inner_1, i_data->force_i_y, f2y);
+      i_data->force_i_z = 
+	fvec::mask_add(i_data->force_i_z, mask_inner_1, i_data->force_i_z, f2z);
+      j_data->force_i_x = 
+	fvec::mask_add(j_data->force_i_x, mask_inner_1, j_data->force_i_x, f3x);
+      j_data->force_i_y = 
+	fvec::mask_add(j_data->force_i_y, mask_inner_1, j_data->force_i_y, f3y);
+      j_data->force_i_z = 
+	fvec::mask_add(j_data->force_i_z, mask_inner_1, j_data->force_i_z, f3z);
+      j_data->force_k_x_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_x_buf[buf_idx_j], f4x);
+      j_data->force_k_y_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_y_buf[buf_idx_j], f4y);
+      j_data->force_k_z_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_1, 
+		       j_data->force_k_z_buf[buf_idx_j], f4z);
+    }
+  }
+  return sum_omega;
+}
+
+static fvec aut_frebo_pi_dh( // returns Tij * sum_omega; sum_omega stays zero when no lane passes the TOL test below
+    KernelArgsAIREBOT<flt_t,acc_t> * _noalias ka,
+    struct aut_frebo_data * _noalias i_data,
+    struct aut_frebo_data * _noalias j_data,
+    int itype, int jtype, ivec vi, ivec vj,
+    fvec r23x, fvec r23y, fvec r23z, fvec r23mag,
+    fvec VA,
+    fvec Nij, fvec Nji, fvec Nijconj, fvec NconjtmpI, fvec NconjtmpJ,
+    fvec fij[3] // passed through to aut_frebo_sum_omega, which updates it in place
+) {
+  fvec c_TOL = fvec::set1(TOL);
+  fvec dN3[3]; // filled by aut_frebo_Tij
+  fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3[0]);
+  bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL); // lanes where not(|Tij| <= TOL)
+  fvec sum_omega = fvec::setzero();
+  if (bvec::test_any_set(TijgtTOLmask)) { // skip all of the work below if no lane qualifies
+    sum_omega = aut_frebo_sum_omega(
+        ka, i_data, j_data, itype, jtype, vi, vj,
+        r23x, r23y, r23z, r23mag, VA *  Tij, fij); // note: VA is pre-multiplied by Tij for this call
+    sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega); // presumably keeps sum_omega where the mask is set and zero elsewhere - TODO confirm blend operand order
+    aut_frebo_N_spline_force(ka, i_data, itype, jtype, vi, vj, VA * sum_omega, // i side: uses dN3[0] and NconjtmpI
+			     dN3[0], dN3[2], NconjtmpI);
+    aut_frebo_N_spline_force(ka, j_data, jtype, itype, vj, vi, VA * sum_omega, // j side: i/j arguments swapped, uses dN3[1] and NconjtmpJ
+			     dN3[1], dN3[2], NconjtmpJ);
+  }
+  return Tij *  sum_omega;
+}
+
+/*
+ We can reuse the aut_frebo_data buffers here to do this calculation very 
+ cheaply.
+*/
+static void aut_torsion_vec(
+    KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    struct aut_frebo_data * i_data,
+    struct aut_frebo_data * j_data,
+    ivec i, ivec j, fvec wij, fvec dwij
+) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  flt_t (*epsilonT)[2] = ka->params.epsilonT;
+  fvec epsilonT00 = fvec::set1(epsilonT[0][0]);
+  fvec epsilonT01 = fvec::set1(epsilonT[0][1]);
+  fvec epsilonT10 = fvec::set1(epsilonT[1][0]);
+  fvec epsilonT11 = fvec::set1(epsilonT[1][1]);
+  fvec thmin = fvec::set1(ka->params.thmin);
+  fvec thmax = fvec::set1(ka->params.thmax);
+
+  const fvec c_1_0 = fvec::set1(1.0);
+  const fvec c_0_5 = fvec::set1(0.5);
+  const fvec c_0_1 = fvec::set1(0.1);
+  const fvec c_2_0 = fvec::set1(2.0);
+  const fvec c_2_5 = fvec::set1(2.5);
+  const fvec c_256_405 = fvec::set1(256.0/405.0);
+
+  fvec del32x = j_data->x_i -  i_data->x_i;
+  fvec del32y = j_data->y_i -  i_data->y_i;
+  fvec del32z = j_data->z_i -  i_data->z_i;
+  fvec rsq = del32x * del32x +  del32y * del32y +  del32z * del32z;
+  fvec r32 = fvec::sqrt(rsq);
+  fvec del23x = fvec::setzero() -  del32x;
+  fvec del23y = fvec::setzero() -  del32y;
+  fvec del23z = fvec::setzero() -  del32z;
+  fvec r23 = r32;
+  fvec w23 = wij;
+  fvec dw23 = dwij;
+
+  for (int buf_idx_i = 0; buf_idx_i < i_data->buf_len; buf_idx_i++) {
+    bvec mask_start = i_data->mask_buf[buf_idx_i];
+    fvec del21x = i_data->rikx_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec del21y = i_data->riky_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec del21z = i_data->rikz_buf[buf_idx_i]; // a2 - a1 -> i - k
+    fvec r21 = i_data->rikmag_buf[buf_idx_i];
+    fvec cos321 = i_data->cosjik_buf[buf_idx_i];
+    fvec sin321 = fvec::sqrt(c_1_0 -  cos321 *  cos321);
+    // strictly equivalent to sin321 < TOL
+    mask_start = fvec::mask_cmpneq(mask_start, fvec::setzero(), sin321);
+    if (! bvec::test_any_set(mask_start)) continue;
+
+    fvec deljkx = del21x -  del23x;
+    fvec deljky = del21y -  del23y;
+    fvec deljkz = del21z -  del23z;
+    fvec rjk2 = deljkx * deljkx +  deljky * deljky + deljkz * deljkz;
+    fvec rjk = fvec::sqrt(rjk2);
+    fvec rik2 = r21 *  r21;
+    fvec w21 = i_data->wik_buf[buf_idx_i];
+    fvec dw21 = i_data->dwik_buf[buf_idx_i];
+
+    fvec rij = r32;
+    fvec rik = r21;
+    fvec rij2 = r32 *  r32;
+    fvec dtsjik;
+    fvec tspjik = aut_Sp2_deriv(cos321, thmin, thmax, &dtsjik);
+    dtsjik = fvec::setzero() -  dtsjik;
+
+    bvec ktype_mask = i_data->ktype_buf[buf_idx_i];
+    fvec epsilonT0 = fvec::mask_blend(ktype_mask, epsilonT00, epsilonT10);
+    fvec epsilonT1 = fvec::mask_blend(ktype_mask, epsilonT01, epsilonT11);
+
+    ivec k = i_data->k_buf[buf_idx_i];
+    for (int buf_idx_j = 0; buf_idx_j < j_data->buf_len; buf_idx_j++) {
+      ivec l = j_data->k_buf[buf_idx_j];
+      bvec mask_inner_0 = ivec::mask_cmpneq(mask_start, k, l) & 
+	j_data->mask_buf[buf_idx_j];
+      if (! bvec::test_any_set(mask_inner_0)) continue;
+      fvec del34x = j_data->rikx_buf[buf_idx_j];
+      fvec del34y = j_data->riky_buf[buf_idx_j];
+      fvec del34z = j_data->rikz_buf[buf_idx_j];
+      fvec r34 = j_data->rikmag_buf[buf_idx_j];
+      bvec ltype_mask = j_data->ktype_buf[buf_idx_j];
+      fvec cos234 = j_data->cosjik_buf[buf_idx_j];
+      fvec sin234 = fvec::sqrt(c_1_0 -  cos234 *  cos234);
+      // strictly equivalent to sin234 < TOL
+      mask_inner_0 = fvec::mask_cmpneq(mask_inner_0, sin234, fvec::setzero());
+      if (! bvec::test_any_set(mask_inner_0)) continue;
+      fvec dw34 = j_data->dwik_buf[buf_idx_j];
+      fvec w34 = j_data->wik_buf[buf_idx_j];
+      fvec delilx = del23x +  del34x;
+      fvec delily = del23y +  del34y;
+      fvec delilz = del23z +  del34z;
+      fvec ril2 = delilx * delilx +  delily * delily + delilz * delilz;
+      fvec ril = fvec::sqrt(ril2);
+      fvec rjl2 = r34 *  r34;
+
+      fvec rjl = r34;
+      fvec dtsijl;
+      fvec tspijl = aut_Sp2_deriv(cos234, thmin, thmax, &dtsijl);
+      dtsijl = fvec::setzero() -  dtsijl;
+      fvec cross321x = del32y * del21z - del32z * del21y;
+      fvec cross321y = del32z * del21x - del32x * del21z;
+      fvec cross321z = del32x * del21y - del32y * del21x;
+      fvec cross321mag = fvec::sqrt(cross321x * cross321x + 
+				    cross321y * cross321y + 
+				    cross321z * cross321z);
+      fvec cross234x = del23y * del34z - del23z * del34y;
+      fvec cross234y = del23z * del34x - del23x * del34z;
+      fvec cross234z = del23x * del34y - del23y * del34x;
+      fvec cross234mag = fvec::sqrt(cross234x * cross234x + 
+				    cross234y * cross234y + 
+				    cross234z * cross234z);
+      fvec cwnum = cross321x * cross234x + cross321y * cross234y + 
+	cross321z * cross234z;
+      fvec cwnom = r21 * r34 * r32 * r32 * sin321 * sin234;
+      fvec cw = cwnum /  cwnom;
+
+      fvec cw2 = c_0_5 * ( c_1_0 - cw);
+      fvec ekijl = fvec::mask_blend(ltype_mask, epsilonT0, epsilonT1);
+      fvec Ec = c_256_405 * ekijl;
+      fvec cw2_5 = cw2 *  cw2 *  cw2 *  cw2 *  cw2;
+      fvec Vtors = Ec *  cw2_5 -  ekijl *  c_0_1;
+
+      fvec evdwl = Vtors * w21 * w23 * w34 * (c_1_0-tspjik) * (c_1_0-tspijl);
+      ka->result_eng += fvec::mask_reduce_add(mask_inner_0, evdwl);
+
+      fvec dndijx  = cross234y * del21z - cross234z * del21y;
+      fvec dndijy  = cross234z * del21x - cross234x * del21z;
+      fvec dndijz  = cross234x * del21y - cross234y * del21x;
+
+      fvec tmpvecx = del34y * cross321z - del34z * cross321y;
+      fvec tmpvecy = del34z * cross321x - del34x * cross321z;
+      fvec tmpvecz = del34x * cross321y - del34y * cross321x;
+
+      dndijx = dndijx + tmpvecx;
+      dndijy = dndijy + tmpvecy;
+      dndijz = dndijz + tmpvecz;
+
+      fvec dndikx = del23y * cross234z - del23z * cross234y;
+      fvec dndiky = del23z * cross234x - del23x * cross234z;
+      fvec dndikz = del23x * cross234y - del23y * cross234x;
+
+      fvec dndjlx = cross321y * del23z - cross321z * del23y;
+      fvec dndjly = cross321z * del23x - cross321x * del23z;
+      fvec dndjlz = cross321x * del23y - cross321y * del23x;
+
+      fvec r23sq = r23 *  r23;
+      fvec r21sq = r21 *  r21;
+      fvec r34sq = r34 *  r34;
+      fvec rjksq = rjk *  rjk;
+      fvec rilsq = ril *  ril;
+      fvec dcidij = (r23sq -  r21sq +  rjksq) / ( c_2_0 *  r23sq *  r21);
+      fvec dcidik = (r21sq -  r23sq +  rjksq) / ( c_2_0 *  r21sq *  r23);
+      fvec dcidjk = fvec::setzero() -  rjk / ( r23 *  r21);
+      fvec dcjdji = (r23sq -  r34sq +  rilsq) / ( c_2_0 *  r23sq *  r34);
+      fvec dcjdjl = (r34sq -  r23sq +  rilsq) / ( c_2_0 *  r34sq *  r23);
+      fvec dcjdil = fvec::setzero() -  ril / ( r23 *  r34);
+
+      fvec dsidij = fvec::setzero() -  cos321 / sin321 * dcidij;
+      fvec dsidik = fvec::setzero() -  cos321 / sin321 * dcidik;
+      fvec dsidjk = fvec::setzero() -  cos321 / sin321 * dcidjk;
+
+      fvec dsjdji = fvec::setzero() -  cos234 / sin234 * dcjdji;
+      fvec dsjdjl = fvec::setzero() -  cos234 / sin234 * dcjdjl;
+      fvec dsjdil = fvec::setzero() -  cos234 / sin234 * dcjdil;
+
+      fvec dxidij = r21 * sin321 + r23 * r21 * dsidij;
+      fvec dxidik = r23 * sin321 + r23 * r21 * dsidik;
+      fvec dxidjk = r23 * r21 * dsidjk;
+
+      fvec dxjdji = r34 * sin234 + r23 * r34 * dsjdji;
+      fvec dxjdjl = r23 * sin234 + r23 * r34 * dsjdjl;
+      fvec dxjdil = r23 * r34 * dsjdil;
+
+      fvec ddndij = dxidij * cross234mag + cross321mag * dxjdji;
+      fvec ddndik = dxidik * cross234mag;
+      fvec ddndjk = dxidjk * cross234mag;
+      fvec ddndjl = cross321mag * dxjdjl;
+      fvec ddndil = cross321mag * dxjdil;
+      fvec dcwddn = fvec::setzero() -  cwnum / ( cwnom * cwnom);
+      fvec dcwdn = fvec::recip(cwnom);
+      fvec cw2_4 = cw2 *  cw2 *  cw2 *  cw2;
+      fvec dvpdcw = c_2_5 * Ec * cw2_4 * w23 * w21 * w34 * (c_1_0 - tspjik) *
+	(c_1_0 - tspijl);
+
+      fvec Ftmpx = dvpdcw * (dcwdn * dndijx + dcwddn * ddndij * del23x / r23);
+      fvec Ftmpy = dvpdcw * (dcwdn * dndijy + dcwddn * ddndij * del23y / r23);
+      fvec Ftmpz = dvpdcw * (dcwdn * dndijz + dcwddn * ddndij * del23z / r23);
+      fvec fix = Ftmpx;
+      fvec fiy = Ftmpy;
+      fvec fiz = Ftmpz;
+      fvec fjx = fvec::setzero() - Ftmpx;
+      fvec fjy = fvec::setzero() - Ftmpy;
+      fvec fjz = fvec::setzero() - Ftmpz;
+
+      Ftmpx = dvpdcw * (dcwdn * dndikx + dcwddn * ddndik * del21x / r21);
+      Ftmpy = dvpdcw * (dcwdn * dndiky + dcwddn * ddndik * del21y / r21);
+      Ftmpz = dvpdcw * (dcwdn * dndikz + dcwddn * ddndik * del21z / r21);
+      fix = fix +  Ftmpx;
+      fiy = fiy +  Ftmpy;
+      fiz = fiz +  Ftmpz;
+      fvec fkx = fvec::setzero() -  Ftmpx;
+      fvec fky = fvec::setzero() -  Ftmpy;
+      fvec fkz = fvec::setzero() -  Ftmpz;
+
+      Ftmpx = dvpdcw * dcwddn * ddndjk * deljkx / rjk;
+      Ftmpy = dvpdcw * dcwddn * ddndjk * deljky / rjk;
+      Ftmpz = dvpdcw * dcwddn * ddndjk * deljkz / rjk;
+      fjx = fjx +  Ftmpx;
+      fjy = fjy +  Ftmpy;
+      fjz = fjz +  Ftmpz;
+      fkx = fkx -  Ftmpx;
+      fky = fky -  Ftmpy;
+      fkz = fkz -  Ftmpz;
+
+      Ftmpx = dvpdcw * (dcwdn * dndjlx + dcwddn * ddndjl * del34x / r34);
+      Ftmpy = dvpdcw * (dcwdn * dndjly + dcwddn * ddndjl * del34y / r34);
+      Ftmpz = dvpdcw * (dcwdn * dndjlz + dcwddn * ddndjl * del34z / r34);
+      fjx = fjx +  Ftmpx;
+      fjy = fjy +  Ftmpy;
+      fjz = fjz +  Ftmpz;
+      fvec flx = fvec::setzero() -  Ftmpx;
+      fvec fly = fvec::setzero() -  Ftmpy;
+      fvec flz = fvec::setzero() -  Ftmpz;
+
+      Ftmpx = dvpdcw * dcwddn * ddndil * delilx / ril;
+      Ftmpy = dvpdcw * dcwddn * ddndil * delily / ril;
+      Ftmpz = dvpdcw * dcwddn * ddndil * delilz / ril;
+      fix = fix +  Ftmpx;
+      fiy = fiy +  Ftmpy;
+      fiz = fiz +  Ftmpz;
+      flx = flx -  Ftmpx;
+      fly = fly -  Ftmpy;
+      flz = flz -  Ftmpz;
+
+      // coordination forces
+
+      fvec fpair = Vtors * dw21 * w23 * w34 * (c_1_0 - tspjik) * 
+	(c_1_0 - tspijl) /  r21;
+      fix = fix -  del21x * fpair;
+      fiy = fiy -  del21y * fpair;
+      fiz = fiz -  del21z * fpair;
+      fkx = fkx +  del21x * fpair;
+      fky = fky +  del21y * fpair;
+      fkz = fkz +  del21z * fpair;
+
+      fpair = Vtors * w21 * dw23 * w34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
+	r23;
+      fix = fix -  del23x * fpair;
+      fiy = fiy -  del23y * fpair;
+      fiz = fiz -  del23z * fpair;
+      fjx = fjx +  del23x * fpair;
+      fjy = fjy +  del23y * fpair;
+      fjz = fjz +  del23z * fpair;
+
+      fpair = Vtors * w21 * w23 * dw34 * (c_1_0 - tspjik) * (c_1_0 - tspijl) /
+	r34;
+      fjx = fjx -  del34x * fpair;
+      fjy = fjy -  del34y * fpair;
+      fjz = fjz -  del34z * fpair;
+      flx = flx +  del34x * fpair;
+      fly = fly +  del34y * fpair;
+      flz = flz +  del34z * fpair;
+
+      // additional cut off function forces
+
+      fvec fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * dtsjik * (c_1_0 -
+									tspijl);
+      fpair = fcpc * dcidij / rij;
+      fix = fix +  fpair * del23x;
+      fiy = fiy +  fpair * del23y;
+      fiz = fiz +  fpair * del23z;
+      fjx = fjx -  fpair * del23x;
+      fjy = fjy -  fpair * del23y;
+      fjz = fjz -  fpair * del23z;
+
+      fpair = fcpc * dcidik / rik;
+      fix = fix +  fpair * del21x;
+      fiy = fiy +  fpair * del21y;
+      fiz = fiz +  fpair * del21z;
+      fkx = fkx -  fpair * del21x;
+      fky = fky -  fpair * del21y;
+      fkz = fkz -  fpair * del21z;
+
+      fpair = fcpc * dcidjk / rjk;
+      fjx = fjx +  fpair * deljkx;
+      fjy = fjy +  fpair * deljky;
+      fjz = fjz +  fpair * deljkz;
+      fkx = fkx -  fpair * deljkx;
+      fky = fky -  fpair * deljky;
+      fkz = fkz -  fpair * deljkz;
+
+      fcpc = fvec::setzero() - Vtors * w21 * w23 * w34 * (c_1_0 - tspjik) * 
+	dtsijl;
+      fpair = fcpc * dcjdji / rij;
+      fix = fix +  fpair * del23x;
+      fiy = fiy +  fpair * del23y;
+      fiz = fiz +  fpair * del23z;
+      fjx = fjx -  fpair * del23x;
+      fjy = fjy -  fpair * del23y;
+      fjz = fjz -  fpair * del23z;
+
+      fpair = fcpc * dcjdjl / rjl;
+      fjx = fjx +  fpair * del34x;
+      fjy = fjy +  fpair * del34y;
+      fjz = fjz +  fpair * del34z;
+      flx = flx -  fpair * del34x;
+      fly = fly -  fpair * del34y;
+      flz = flz -  fpair * del34z;
+
+      fpair = fcpc * dcjdil / ril;
+      fix = fix +  fpair * delilx;
+      fiy = fiy +  fpair * delily;
+      fiz = fiz +  fpair * delilz;
+      flx = flx -  fpair * delilx;
+      fly = fly -  fpair * delily;
+      flz = flz -  fpair * delilz;
+
+      // sum per-atom forces into atom force array
+
+      i_data->force_i_x = fvec::mask_add(i_data->force_i_x, mask_inner_0, 
+					 i_data->force_i_x, fix);
+      i_data->force_i_y = fvec::mask_add(i_data->force_i_y, mask_inner_0, 
+					 i_data->force_i_y, fiy);
+      i_data->force_i_z = fvec::mask_add(i_data->force_i_z, mask_inner_0, 
+					 i_data->force_i_z, fiz);
+      i_data->force_j_x = fvec::mask_add(i_data->force_j_x, mask_inner_0, 
+					 i_data->force_j_x, fjx);
+      i_data->force_j_y = fvec::mask_add(i_data->force_j_y, mask_inner_0, 
+					 i_data->force_j_y, fjy);
+      i_data->force_j_z = fvec::mask_add(i_data->force_j_z, mask_inner_0, 
+					 i_data->force_j_z, fjz);
+      i_data->force_k_x_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_x_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_x_buf[buf_idx_i], fkx);
+      i_data->force_k_y_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_y_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_y_buf[buf_idx_i], fky);
+      i_data->force_k_z_buf[buf_idx_i] = 
+	fvec::mask_add(i_data->force_k_z_buf[buf_idx_i], mask_inner_0, 
+		       i_data->force_k_z_buf[buf_idx_i], fkz);
+      j_data->force_k_x_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_x_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_x_buf[buf_idx_j], flx);
+      j_data->force_k_y_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_y_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_y_buf[buf_idx_j], fly);
+      j_data->force_k_z_buf[buf_idx_j] = 
+	fvec::mask_add(j_data->force_k_z_buf[buf_idx_j], mask_inner_0, 
+		       j_data->force_k_z_buf[buf_idx_j], flz);
+    }
+  }
+}
+
+/*
+ * Processes VL elements of the same type itype/jtype for REBO and TORSION
+ * interactions. This allows us to reuse the aut_frebo_data buffers in the
+ * torsion calculation.
+ *
+ * ka            kernel arguments; positions, parameters and the nH/nC
+ *               arrays are read from it, forces go to ka->result_f and the
+ *               energy to ka->result_eng
+ * torflag       non-zero to evaluate torsion as well (only done when both
+ *               itype and jtype are 0)
+ * itype, jtype  mapped types shared by all i atoms / all j atoms
+ * i_buf, j_buf  atom indices of the pairs, one pair per vector lane
+ *
+ * If one of the aut_frebo_pij_pd_2 calls leaves a negative buf_len behind,
+ * this function has not yet added anything to result_f or result_eng, and
+ * every pair of the batch is handed to the scalar reference routines.
+ */
+static void aut_frebo_batch_of_kind(KernelArgsAIREBOT<flt_t,acc_t> * ka,
+                                    int torflag, int itype, int jtype,
+                                    int * i_buf, int * j_buf) {
+ { // jump-scope for exceed_limits
+  AtomAIREBOT<flt_t> * x = ka->x;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+  // Pair parameters broadcast to all lanes; alpha is stored negated.
+  fvec vrcminij = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec vrcmaxij = fvec::set1(ka->params.rcmax[itype][jtype]);
+  fvec vQij = fvec::set1(ka->params.Q[itype][jtype]);
+  fvec vAij = fvec::set1(ka->params.A[itype][jtype]);
+  fvec malphaij = fvec::set1(-ka->params.alpha[itype][jtype]);
+  fvec c_1_0 = fvec::set1(1);
+  fvec c_0_5 = fvec::set1(0.5);
+  fvec c_TOL = fvec::set1(1e-9);
+  struct aut_frebo_data i_data, j_data;
+
+  fvec evdwl_vacc = fvec::setzero();
+  ivec vi = ivec::maskz_loadu(bvec::full(), i_buf);
+  ivec vj = ivec::maskz_loadu(bvec::full(), j_buf);
+  fvec x_i, y_i, z_i;
+  fvec x_j, y_j, z_j;
+  aut_loadatoms_vec_notype(x, vi, &x_i, &y_i, &z_i);
+  aut_loadatoms_vec_notype(x, vj, &x_j, &y_j, &z_j);
+  // j_data mirrors i_data with the roles of i and j exchanged.
+  i_data.x_i = x_i;
+  i_data.y_i = y_i;
+  i_data.z_i = z_i;
+  i_data.x_j = x_j;
+  i_data.y_j = y_j;
+  i_data.z_j = z_j;
+  j_data.x_i = x_j;
+  j_data.y_i = y_j;
+  j_data.z_i = z_j;
+  j_data.x_j = x_i;
+  j_data.y_j = y_i;
+  j_data.z_j = z_i;
+  fvec delx = x_i -  x_j;
+  fvec dely = y_i -  y_j;
+  fvec delz = z_i -  z_j;
+  fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+  fvec rij = fvec::sqrt(rsq);
+  fvec dwij;
+  fvec wij = aut_Sp_deriv(rij, vrcminij, vrcmaxij, &dwij);
+
+  // Repulsive term VR = wij * (1 + Q/r) * A * exp(-alpha r) and its
+  // derivative with respect to r.
+  fvec exp_alphar = fvec::exp(malphaij *  rij);
+  fvec Qij_over_rij = vQij /  rij;
+  fvec Qij_over_rsq = vQij /  rsq;
+  fvec VR_by_wij = ( c_1_0 +  Qij_over_rij) *  vAij *  exp_alphar;
+  fvec VR = wij * VR_by_wij;
+  fvec pre = wij *  vAij *  exp_alphar;
+  fvec dVRdi = pre * ( malphaij +  malphaij *  Qij_over_rij -  Qij_over_rsq);
+  dVRdi = dVRdi + VR_by_wij *  dwij;
+
+  // Attractive term VA = wij * sum_k -BIJc[k] * exp(-Beta[k] r) and its
+  // derivative with respect to r.
+  fvec VA_by_wij = fvec::setzero();
+  fvec dVA = fvec::setzero();
+
+  int k;
+  for (k = 0; k < 3; k++) {
+    fvec mBIJc = fvec::set1(-ka->params.BIJc[itype][jtype][k]);
+    fvec mBetaij = fvec::set1(-ka->params.Beta[itype][jtype][k]);
+    fvec term = mBIJc *  fvec::exp(mBetaij *  rij);
+    VA_by_wij = VA_by_wij +  term;
+    dVA = dVA +  mBetaij * wij * term;
+  }
+
+  dVA = dVA +  dwij *  VA_by_wij;
+  fvec VA = wij * VA_by_wij;
+
+  // Lanes with wij below the tolerance are blended against zero
+  // (presumably zero is selected where tol_check is set -- confirm against
+  // the mask_blend definition).
+  bvec tol_check = fvec::cmplt(wij, c_TOL);
+  VA = fvec::mask_blend(tol_check, VA, fvec::setzero());
+  dVA = fvec::mask_blend(tol_check, dVA, fvec::setzero());
+  VR = fvec::mask_blend(tol_check, VR, fvec::setzero());
+  dVRdi = fvec::mask_blend(tol_check, dVRdi, fvec::setzero());
+
+  // Nij / Nji: nH + nC of the atom minus the weight of the i-j pair itself.
+  fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+  fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+  fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+  fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+  fvec Nij = (nHi +  nCi) -  wij;
+  fvec Nji = (nHj +  nCj) -  wij;
+  i_data.nHi = nHi;
+  i_data.nCi = nCi;
+  j_data.nHi = nHj;
+  j_data.nCi = nCj;
+  fvec fij[3], fji[3];
+  fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
+  fij[2] = fvec::setzero();
+  fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
+  fji[2] = fvec::setzero();
+
+  fvec NconjtmpI;
+  fvec pij = aut_frebo_pij_pd_2(
+      ka, &i_data, itype, jtype, vi, vj,
+      delx, dely, delz, rij, wij, VA, &NconjtmpI, fij);
+
+  // A negative buf_len after the call sends the whole batch to the scalar
+  // fallback below.
+  if (i_data.buf_len < 0) goto exceed_limits;
+
+  fvec NconjtmpJ;
+  fvec rjix = fvec::setzero() -  delx;
+  fvec rjiy = fvec::setzero() -  dely;
+  fvec rjiz = fvec::setzero() -  delz;
+  fvec pji = aut_frebo_pij_pd_2(
+      ka, &j_data, jtype, itype, vj, vi,
+      rjix, rjiy, rjiz, rij, wij, VA, &NconjtmpJ, fji);
+  fij[0] = fij[0] -  fji[0];
+  fij[1] = fij[1] -  fji[1];
+  fij[2] = fij[2] -  fji[2];
+
+  if (j_data.buf_len < 0) goto exceed_limits;
+
+  if (torflag && itype == 0 && jtype == 0)
+    aut_torsion_vec(ka, &i_data, &j_data, vi, vj, wij, dwij);
+
+  fvec Nijconj = c_1_0 +  NconjtmpI *  NconjtmpI +  NconjtmpJ *  NconjtmpJ;
+  fvec dN3[3];
+  fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+  aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, VA, dN3[0],
+                           dN3[2], NconjtmpI);
+  aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, VA, dN3[1],
+                           dN3[2], NconjtmpJ);
+  fvec pi_dh = aut_frebo_pi_dh(ka, &i_data, &j_data, itype, jtype, vi, vj,
+                               delx, dely, delz, rij, VA, Nij, Nji, Nijconj,
+                               NconjtmpI, NconjtmpJ, fij);
+
+  // Bond order and the resulting pair force / energy.
+  fvec bij = c_0_5 * ( pij +  pji) +  pi_rc +  pi_dh;
+  fvec dVAdi = bij *  dVA;
+  fvec fpair = (dVAdi +  dVRdi) *  fvec::recip(rij);
+  fvec result_f_j_x = fpair *  delx -  fij[0];
+  fvec result_f_j_y = fpair *  dely -  fij[1];
+  fvec result_f_j_z = fpair *  delz -  fij[2];
+  fvec result_f_i_x = fvec::setzero() -  result_f_j_x;
+  fvec result_f_i_y = fvec::setzero() -  result_f_j_y;
+  fvec result_f_i_z = fvec::setzero() -  result_f_j_z;
+  fvec evdwl = VR +  bij *  VA;
+  evdwl_vacc = evdwl_vacc +  evdwl;
+
+  aut_frebo_data_writeback(ka, &i_data);
+  aut_frebo_data_writeback(ka, &j_data);
+
+  flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+  // Fold in the i/j force accumulators kept in i_data ...
+  result_f_i_x = i_data.force_i_x +  result_f_i_x;
+  result_f_i_y = i_data.force_i_y +  result_f_i_y;
+  result_f_i_z = i_data.force_i_z +  result_f_i_z;
+  result_f_j_x = i_data.force_j_x +  result_f_j_x;
+  result_f_j_y = i_data.force_j_y +  result_f_j_y;
+  result_f_j_z = i_data.force_j_z +  result_f_j_z;
+
+  // ... and those of j_data, where the i and j roles are exchanged.
+  result_f_i_x = j_data.force_j_x +  result_f_i_x;
+  result_f_i_y = j_data.force_j_y +  result_f_i_y;
+  result_f_i_z = j_data.force_j_z +  result_f_i_z;
+  result_f_j_x = j_data.force_i_x +  result_f_j_x;
+  result_f_j_y = j_data.force_i_y +  result_f_j_y;
+  result_f_j_z = j_data.force_i_z +  result_f_j_z;
+
+  fvec::store(fi_x_buf, result_f_i_x);
+  fvec::store(fi_y_buf, result_f_i_y);
+  fvec::store(fi_z_buf, result_f_i_z);
+  ivec::store(fi_i_buf, vi);
+  fvec::store(fj_x_buf, result_f_j_x);
+  fvec::store(fj_y_buf, result_f_j_y);
+  fvec::store(fj_z_buf, result_f_j_z);
+  ivec::store(fj_j_buf, vj);
+  fvec::store(evdwl_buf, evdwl);
+
+  // Scatter lane by lane: the same atom may occur in several lanes, so the
+  // updates are applied sequentially. Each atom of a pair receives half of
+  // the pair energy in the w component.
+  int lane;
+  for (lane = 0; lane < fvec::VL; lane++) {
+    int ii = fi_i_buf[lane];
+    result_f[ii].x += fi_x_buf[lane];
+    result_f[ii].y += fi_y_buf[lane];
+    result_f[ii].z += fi_z_buf[lane];
+    result_f[ii].w += 0.5 * evdwl_buf[lane];
+    int jj = fj_j_buf[lane];
+    result_f[jj].x += fj_x_buf[lane];
+    result_f[jj].y += fj_y_buf[lane];
+    result_f[jj].z += fj_z_buf[lane];
+    result_f[jj].w += 0.5 * evdwl_buf[lane];
+  }
+  ka->result_eng += fvec::reduce_add(evdwl_vacc);
+  return;
+ }
+exceed_limits:
+  // Scalar fallback for the complete batch.
+  for (int l = 0; l < fvec::VL; l++) {
+    int i = i_buf[l];
+    int j = j_buf[l];
+    ref_frebo_single_interaction(ka, i, j);
+    if (torflag && itype == 0 && jtype == 0)
+      ref_torsion_single_interaction(ka, i, j);
+  }
+}
+
+/*
+ * Walks the REBO neighbor lists of the atoms in
+ * [frebo_from_atom, frebo_to_atom), bins the accepted (i, j) pairs by
+ * (itype, jtype) and passes every bin to aut_frebo_batch_of_kind as soon as
+ * it holds fvec::VL pairs. Pairs left in partially filled bins at the end
+ * are evaluated one by one with the scalar reference routines.
+ */
+static void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torflag) {
+  AtomAIREBOT<flt_t> * _noalias atoms = ka->x;
+  int * _noalias tags = ka->tag;
+  int * _noalias type_map = ka->map;
+  int pending_i[2][2][fvec::VL];
+  int pending_j[2][2][fvec::VL];
+  int pending_n[2][2] = {0};
+
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
+    const int itag = tags[i];
+    const int itype = type_map[atoms[i].w];
+    const flt_t xi = atoms[i].x;
+    const flt_t yi = atoms[i].y;
+    const flt_t zi = atoms[i].z;
+    int * jlist = ka->neigh_rebo.entries + ka->neigh_rebo.offset[i];
+    const int jnum = ka->neigh_rebo.num[i];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      const int j = jlist[jj];
+      const int jtag = tags[j];
+
+      // Pair filter: tag parity decides for differing tags, positions break
+      // the tie for equal tags (presumably so that each pair is evaluated
+      // from only one of its two atoms).
+      bool skip;
+      if (itag != jtag) {
+        const bool odd_sum = ((itag + jtag) & 1) != 0;
+        skip = (itag > jtag) ? !odd_sum : odd_sum;
+      } else {
+        const flt_t xj = atoms[j].x;
+        const flt_t yj = atoms[j].y;
+        const flt_t zj = atoms[j].z;
+        skip = zj < zi ||
+               (zj == zi && (yj < yi || (yj == yi && xj < xi)));
+      }
+      if (skip) continue;
+
+      const int jtype = type_map[atoms[j].w];
+      int & fill = pending_n[itype][jtype];
+      pending_i[itype][jtype][fill] = i;
+      pending_j[itype][jtype][fill] = j;
+      if (++fill == fvec::VL) {
+        // Bin is full: evaluate it vectorized and start over.
+        aut_frebo_batch_of_kind(ka, torflag, itype, jtype,
+                                pending_i[itype][jtype],
+                                pending_j[itype][jtype]);
+        fill = 0;
+      }
+    }
+  }
+
+  // Drain what is left in the bins with the scalar code.
+  for (int itype = 0; itype < 2; itype++) {
+    for (int jtype = 0; jtype < 2; jtype++) {
+      const bool with_torsion = torflag && itype == 0 && jtype == 0;
+      const int leftover = pending_n[itype][jtype];
+      for (int l = 0; l < leftover; l++) {
+        const int i = pending_i[itype][jtype][l];
+        const int j = pending_j[itype][jtype][l];
+        ref_frebo_single_interaction(ka, i, j);
+        if (with_torsion) {
+          ref_torsion_single_interaction(ka, i, j);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Applies the path forces lane by lane with the scalar reference routine:
+ * every lane whose bit is set in mask passes its dC value and its path to
+ * ref_lennard_jones_force_path. Not crucial for performance.
+ */
+static void aut_airebo_lj_force_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+   bvec mask, fvec dC, LennardJonesPathAIREBOT<flt_t> path[fvec::VL]) {
+  for (int lane = 0; lane < fvec::VL; ++lane) {
+    if (!bvec::test_at(mask, lane)) continue;
+    ref_lennard_jones_force_path(ka, fvec::at(dC, lane), path + lane);
+  }
+}
+
+/*
+ * Hash-Map for efficient calculation of C_ij.
+ * The table has OPT_TEST_PATH_SIZE (1024) slots; at most
+ * OPT_TEST_PATH_ITEMS (128) paths can be stored alongside the entries.
+ * Open addressing, invalidation by using a different i.
+ * Only needs to be reset once per timestep.
+ */
+// Number of hash slots. The vectorized hash function reduces the hash with
+// a bitwise AND against OPT_TEST_PATH_SIZE - 1, the scalar one with a
+// modulo by OPT_TEST_PATH_SIZE.
+static const int OPT_TEST_PATH_SIZE = 1024;
+// Capacity of the path storage shared by all slots.
+static const int OPT_TEST_PATH_ITEMS = 128;
+struct aut_airebo_lj_test_path_result_data {
+  // Path storage; a slot refers to its path through testpath_idx.
+  LennardJonesPathAIREBOT<flt_t> testpath[OPT_TEST_PATH_ITEMS];
+  // Atom i the slot was last written for; a slot holding any other value
+  // is treated as free when working on i.
+  int i[OPT_TEST_PATH_SIZE];
+  // Key of the slot: the atom reached from i.
+  int j[OPT_TEST_PATH_SIZE];
+  // Stored value: 1 minus the product of the weights along the path.
+  flt_t cij[OPT_TEST_PATH_SIZE];
+  // Index into testpath; only written when the weight product is not 1.
+  int testpath_idx[OPT_TEST_PATH_SIZE];
+};
+// Multiplier applied to the key by the scalar and the vector hash function.
+static const unsigned int OPT_TEST_PATH_HASH = 2654435761;
+
+// Scalar hash: slot index for key j on probe number attempt, computed in
+// 32-bit unsigned arithmetic as (j * OPT_TEST_PATH_HASH + attempt) modulo
+// OPT_TEST_PATH_SIZE.
+static int aut_lj_tap_hash_fn(int j, int attempt) {
+  const uint32_t scrambled =
+    static_cast<uint32_t>(j) * static_cast<uint32_t>(OPT_TEST_PATH_HASH);
+  const uint32_t probed = scrambled + static_cast<uint32_t>(attempt);
+  return probed % static_cast<uint32_t>(OPT_TEST_PATH_SIZE);
+}
+
+// Vector hash: per lane, val * OPT_TEST_PATH_HASH + attempt, reduced to a
+// slot index by a bitwise AND with OPT_TEST_PATH_SIZE - 1.
+static ivec aut_airebo_lj_tap_hash_fn_vec(ivec val, ivec attempt) {
+  const ivec multiplier = ivec::set1(OPT_TEST_PATH_HASH);
+  const ivec wrap = ivec::set1(OPT_TEST_PATH_SIZE - 1);
+  ivec slot = ivec::mullo(multiplier, val);
+  slot = slot +  attempt;
+  return ivec::the_and(slot, wrap);
+}
+
+/*
+ * Enter all those (potential) neighbors of i (including 2nd and 3rd degree) 
+ * into the hash-map. There is no good way to vectorize this, and it does not 
+ * seem time-critical.
+ *
+ * For every atom reached over a chain i-j, i-j-k or i-j-k-l of REBO
+ * neighbors, the slot keyed by that atom stores 1 minus the product of the
+ * weights along the chain; an existing entry for the same i is only
+ * overwritten by a smaller value. A path is stored alongside whenever the
+ * weight product differs from 1.
+ *
+ * Returns true on success and false if probing ran through
+ * OPT_TEST_PATH_SIZE attempts or more than OPT_TEST_PATH_ITEMS paths would
+ * be needed.
+ */
+static bool aut_airebo_lj_test_all_paths(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+    int i, struct aut_airebo_lj_test_path_result_data * result) {
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * map = ka->map;
+  flt_t (*rcmin)[2] = &ka->params.rcmin[0];
+  flt_t (*rcmax)[2] = &ka->params.rcmax[0];
+  // Squared rcmin, so the distance check below needs no square root.
+  flt_t rcminsq[2][2];
+  rcminsq[0][0] = rcmin[0][0] * rcmin[0][0];
+  rcminsq[0][1] = rcmin[0][1] * rcmin[0][1];
+  rcminsq[1][0] = rcmin[1][0] * rcmin[1][0];
+  rcminsq[1][1] = rcmin[1][1] * rcmin[1][1];
+  int * neighs_i = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]];
+  int itype = map[x[i].w];
+  // Next free element of result->testpath.
+  int path_insert_pos = 0;
+  // First degree: neighbors j of i.
+  for (int jj = 0; jj < ka->neigh_rebo.num[i]; jj++) {
+    int j = neighs_i[jj];
+    int jtype = map[x[j].w];
+    flt_t dijx = x[j].x - x[i].x;
+    flt_t dijy = x[j].y - x[i].y;
+    flt_t dijz = x[j].z - x[i].z;
+    flt_t rijsq = dijx * dijx + dijy * dijy + dijz * dijz;
+    // Weight stays 1 (derivative 0) and rij stays 0 below rcmin; Sp is only
+    // evaluated at or beyond it.
+    flt_t wj = 1, dwj = 0;
+    flt_t rij = 0;
+    if (rijsq >= rcminsq[itype][jtype]) {
+      rij = overloaded::sqrt(rijsq);
+      wj = Sp(rij, rcmin[itype][jtype], rcmax[itype][jtype], &dwj);
+    }
+    // Probe while the slot belongs to this i but holds a different key.
+    int attempt = 0;
+    int start_hash_slot = aut_lj_tap_hash_fn(j, attempt);
+    int hash_slot = start_hash_slot;
+    while (result->i[hash_slot] == i && result->j[hash_slot] != j && 
+	   attempt < OPT_TEST_PATH_SIZE) {
+      hash_slot = aut_lj_tap_hash_fn(j, ++attempt);
+    }
+    if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
+    // A slot written for another i counts as empty.
+    bool init_slot = result->i[hash_slot] != i;
+    if (init_slot || (1 - wj < result->cij[hash_slot])) {
+      result->i[hash_slot] = i;
+      result->j[hash_slot] = j;
+      result->cij[hash_slot] = 1 - wj;
+      if (wj != 1.0) {
+        if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+        result->testpath_idx[hash_slot] = path_insert_pos;
+        LennardJonesPathAIREBOT<flt_t> *path = 
+	  &result->testpath[path_insert_pos++];
+        path->num = 2;
+        path->del[0].x = dijx;
+        path->del[0].y = dijy;
+        path->del[0].z = dijz;
+        // rij is still 0 if the distance check above was not taken.
+        if (rij == 0) rij = sqrt(rijsq);
+        path->r[0] = rij;
+        path->w[0] = wj;
+        path->dw[0] = dwj;
+        path->idx[0] = i;
+        path->idx[1] = j;
+      }
+    }
+    // Second degree: neighbors k of j, excluding i.
+    int * neighs_j = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[j]];
+    for (int kk = 0; kk < ka->neigh_rebo.num[j]; kk++) {
+      int k = neighs_j[kk];
+      if (k == i) continue;
+      int ktype = map[x[k].w];
+      flt_t djkx = x[k].x - x[j].x;
+      flt_t djky = x[k].y - x[j].y;
+      flt_t djkz = x[k].z - x[j].z;
+      flt_t rjksq = djkx * djkx + djky * djky + djkz * djkz;
+      flt_t wk = 1, dwk = 0;
+      flt_t rjk = 0;
+      if (rjksq >= rcminsq[jtype][ktype]) {
+        rjk = overloaded::sqrt(rjksq);
+        wk = Sp(rjk, rcmin[jtype][ktype], rcmax[jtype][ktype], &dwk);
+      }
+      int attempt = 0;
+      int start_hash_slot = aut_lj_tap_hash_fn(k, attempt);
+      int hash_slot = start_hash_slot;
+      while (result->i[hash_slot] == i && result->j[hash_slot] != k && 
+	     attempt < OPT_TEST_PATH_SIZE) {
+        hash_slot = aut_lj_tap_hash_fn(k, ++attempt);
+      }
+      if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
+      bool init_slot = result->i[hash_slot] != i;
+      if (init_slot || (1 - wj * wk < result->cij[hash_slot])) {
+        result->i[hash_slot] = i;
+        result->j[hash_slot] = k;
+        result->cij[hash_slot] = 1 - wj * wk;
+        if (wj * wk != 1.0) {
+          if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+          result->testpath_idx[hash_slot] = path_insert_pos;
+          LennardJonesPathAIREBOT<flt_t> *path = 
+	    &result->testpath[path_insert_pos++];
+          path->num = 3;
+          path->del[0].x = dijx;
+          path->del[0].y = dijy;
+          path->del[0].z = dijz;
+          if (rij == 0) rij = sqrt(rijsq);
+          path->r[0] = rij;
+          path->del[1].x = djkx;
+          path->del[1].y = djky;
+          path->del[1].z = djkz;
+          if (rjk == 0) rjk = sqrt(rjksq);
+          path->r[1] = rjk;
+          path->w[0] = wj;
+          path->dw[0] = dwj;
+          path->w[1] = wk;
+          path->dw[1] = dwk;
+          path->idx[0] = i;
+          path->idx[1] = j;
+          path->idx[2] = k;
+        }
+      }
+      // Third degree: neighbors l of k, excluding i and j.
+      int * neighs_k = &ka->neigh_rebo.entries[ka->neigh_rebo.offset[k]];
+      for (int ll = 0; ll < ka->neigh_rebo.num[k]; ll++) {
+        int l = neighs_k[ll];
+        if ((l == i) || (l == j)) continue;
+        int ltype = map[x[l].w];
+        flt_t dklx = x[l].x - x[k].x;
+        flt_t dkly = x[l].y - x[k].y;
+        flt_t dklz = x[l].z - x[k].z;
+        flt_t rklsq = dklx * dklx + dkly * dkly + dklz * dklz;
+        flt_t wl = 1, dwl = 0;
+        flt_t rkl = 0;
+        if (rklsq >= rcminsq[ktype][ltype]) {
+          rkl = overloaded::sqrt(rklsq);
+          wl = Sp(rkl, rcmin[ktype][ltype], rcmax[ktype][ltype], &dwl);
+        }
+        int attempt = 0;
+        int start_hash_slot = aut_lj_tap_hash_fn(l, attempt);
+        int hash_slot = start_hash_slot;
+        while (result->i[hash_slot] == i && result->j[hash_slot] != l && 
+	       attempt < OPT_TEST_PATH_SIZE) {
+          hash_slot = aut_lj_tap_hash_fn(l, ++attempt);
+        }
+        if (attempt >= OPT_TEST_PATH_SIZE) goto exceed_limits;
+        bool init_slot = result->i[hash_slot] != i;
+        if (init_slot || (1 - wj * wk * wl < result->cij[hash_slot])) {
+          result->i[hash_slot] = i;
+          result->j[hash_slot] = l;
+          result->cij[hash_slot] = 1 - wj * wk * wl;
+          if (wj * wk * wl != 1.0) {
+            if (path_insert_pos >= OPT_TEST_PATH_ITEMS) goto exceed_limits;
+            result->testpath_idx[hash_slot] = path_insert_pos;
+            LennardJonesPathAIREBOT<flt_t> *path = 
+	      &result->testpath[path_insert_pos++];
+            path->num = 4;
+            path->del[0].x = dijx;
+            path->del[0].y = dijy;
+            path->del[0].z = dijz;
+            if (rij == 0) rij = sqrt(rijsq);
+            path->r[0] = rij;
+            path->del[1].x = djkx;
+            path->del[1].y = djky;
+            path->del[1].z = djkz;
+            if (rjk == 0) rjk = sqrt(rjksq);
+            path->r[1] = rjk;
+            path->del[2].x = dklx;
+            path->del[2].y = dkly;
+            path->del[2].z = dklz;
+            if (rkl == 0) rkl = sqrt(rklsq);
+            path->r[2] = rkl;
+            path->w[0] = wj;
+            path->dw[0] = dwj;
+            path->w[1] = wk;
+            path->dw[1] = dwk;
+            path->w[2] = wl;
+            path->dw[2] = dwl;
+            path->idx[0] = i;
+            path->idx[1] = j;
+            path->idx[2] = k;
+            path->idx[3] = l;
+          }
+        }
+      }
+    }
+  }
+  return true;
+exceed_limits:
+  // Table or path storage exhausted; the caller has to handle this i
+  // differently.
+  return false;
+}
+
+/*
+ * Vectorized lookup in the hash-map.
+ * For every lane selected by need_search, the slot sequence of key j is
+ * probed for as long as the visited slot belongs to i_bc but stores a
+ * different key. Lanes that find their key return the stored cij and, if
+ * that value is greater than zero, get the associated path copied into
+ * path[lane]. All other lanes return 1.0.
+ */
+static fvec aut_airebo_lj_tap_test_path(KernelArgsAIREBOT<flt_t,acc_t> * ka, 
+  struct aut_airebo_lj_test_path_result_data * test_path_result,
+  bvec need_search, ivec i_bc, ivec j, 
+  LennardJonesPathAIREBOT<flt_t> path[fvec::VL]
+) {
+  const ivec one = ivec::set1(1);
+  fvec cij = fvec::set1(1.0);
+
+  // Round zero: look at the home slot of every searched lane.
+  ivec tries = ivec::setzero();
+  ivec slot = aut_airebo_lj_tap_hash_fn_vec(j, tries);
+  ivec seen_i = ivec::mask_gather(ivec::undefined(), need_search, slot,
+                                  &test_path_result->i[0], sizeof(int));
+  bvec owned = ivec::mask_cmpeq(need_search, seen_i, i_bc);
+  ivec seen_j = ivec::mask_gather(ivec::undefined(), owned, slot,
+                                  &test_path_result->j[0], sizeof(int));
+  bvec hit = ivec::mask_cmpeq(owned, seen_j, j);
+  // Slot belongs to i but holds another key: that lane has to move on.
+  bvec retry = owned & ~ hit;
+
+  while (bvec::test_any_set(retry)) {
+    tries = ivec::mask_add(tries, retry, tries, one);
+    slot = aut_airebo_lj_tap_hash_fn_vec(j, tries);
+    seen_i = ivec::mask_gather(seen_i, retry, slot,
+                               &test_path_result->i[0], sizeof(int));
+    owned = ivec::mask_cmpeq(need_search, seen_i, i_bc);
+    seen_j = ivec::mask_gather(seen_j, retry, slot,
+                               &test_path_result->j[0], sizeof(int));
+    hit = ivec::mask_cmpeq(owned, seen_j, j);
+    retry = owned & ~ hit;
+  }
+
+  // Lanes without a hit keep the initial 1.0.
+  cij = fvec::mask_gather(cij, hit, slot,
+                          &test_path_result->cij[0], sizeof(flt_t));
+
+  // Only hits with 0 < cij have a path to hand back.
+  bvec need_path = fvec::mask_cmplt(hit, fvec::setzero(), cij);
+  if (!bvec::test_any_set(need_path)) return cij;
+
+  for (int lane = 0; lane < fvec::VL; lane++) {
+    if (!bvec::test_at(need_path, lane)) continue;
+    const int stored =
+      test_path_result->testpath_idx[ivec::at(slot, lane)];
+    path[lane] = test_path_result->testpath[stored];
+  }
+  return cij;
+}
+
+/*
+ * This function calculates the Lennard-Jones interaction for those
+ * elements that require a bond-order calculation.
+ * It is structured similarly to the aut_frebo_batch_of_kind function.
+ * The forces due to bond orders are calculated speculatively and later
+ * updated with the correct outer derivative.
+ */
+template<int MORSEFLAG>
+static void aut_lj_with_bo(
+    KernelArgsAIREBOT<flt_t,acc_t> * ka,
+    int itype, int jtype,
+    ivec i, ivec j,
+    fvec cij, LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL]
+) {
+ { // jump-scope for exceed_limits
+  AtomAIREBOT<flt_t> * _noalias x = ka->x;
+  ResultForceT<acc_t> * result_f = ka->result_f;
+
+  ivec c_i4 = ivec::set1(4);
+  fvec c_1_0 = fvec::set1(1.0);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_0_5 = fvec::set1(0.5);
+
+  fvec x_i, y_i, z_i;
+  aut_loadatoms_vec_notype(x, i, &x_i, &y_i, &z_i);
+  fvec x_j, y_j, z_j;
+  aut_loadatoms_vec_notype(x, j, &x_j, &y_j, &z_j);
+  fvec delx = x_i -  x_j;
+  fvec dely = y_i -  y_j;
+  fvec delz = z_i -  z_j;
+  fvec rsq = delx *  delx +  dely *  dely +  delz *  delz;
+
+  fvec rij = fvec::sqrt(rsq);
+  bvec need_path_force = fvec::cmplt(cij, c_1_0);
+  flt_t sigcut = ka->params.sigcut;
+  flt_t sigmin = ka->params.sigmin;
+  flt_t sigma = ka->params.sigma[itype][jtype];
+  flt_t rljmax = sigcut * sigma;
+  flt_t rljmin = sigmin * sigma;
+  fvec p_rljmin = fvec::set1(rljmin);
+  fvec p_rljmax = fvec::set1(rljmax);
+
+  fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
+
+  fvec p_lj1 = fvec::set1(ka->params.lj1[itype][jtype]);
+  fvec p_lj2 = fvec::set1(ka->params.lj2[itype][jtype]);
+  fvec p_lj3 = fvec::set1(ka->params.lj3[itype][jtype]);
+  fvec p_lj4 = fvec::set1(ka->params.lj4[itype][jtype]);
+
+  fvec r2inv = fvec::recip(rsq);
+
+  fvec vdw, dvdw;
+  if (MORSEFLAG) {
+    fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
+    vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
+    dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
+  } else {
+    fvec r6inv = r2inv *  r2inv *  r2inv;
+
+    vdw = r6inv * ( p_lj3 *  r6inv -  p_lj4);
+    fvec r7inv = r6inv *  rij *  r2inv;
+    dvdw = r7inv * ( p_lj2 -  p_lj1 *  r6inv);
+  }
+
+  fvec VLJ = vdw *  slw;
+  fvec dVLJ = dvdw *  slw +  vdw *  dslw;
+
+  fvec p_rcLJmin = fvec::set1(ka->params.rcLJmin[itype][jtype]);
+  fvec p_rcLJmax = fvec::set1(ka->params.rcLJmax[itype][jtype]);
+  fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
+  fvec VA = cij *  VLJ *  Str;
+
+  fvec fij[3], fji[3];
+  fij[0] = fvec::setzero(); fij[1] = fvec::setzero();
+  fij[2] = fvec::setzero();
+  fji[0] = fvec::setzero(); fji[1] = fvec::setzero();
+  fji[2] = fvec::setzero();
+
+  ivec vi = i;
+  ivec vj = j;
+
+  struct aut_frebo_data i_data, j_data;
+  i_data.x_i = x_i;
+  i_data.y_i = y_i;
+  i_data.z_i = z_i;
+  i_data.x_j = x_j;
+  i_data.y_j = y_j;
+  i_data.z_j = z_j;
+  j_data.x_i = x_j;
+  j_data.y_i = y_j;
+  j_data.z_i = z_j;
+  j_data.x_j = x_i;
+  j_data.y_j = y_i;
+  j_data.z_j = z_i;
+
+  fvec p_rcmin = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec p_rcmax = fvec::set1(ka->params.rcmax[itype][jtype]);
+  fvec dwij;
+  fvec wij = aut_Sp_deriv(rij, p_rcmin, p_rcmax, &dwij);
+
+  fvec nHi = fvec::gather(vi, ka->nH, sizeof(flt_t));
+  fvec nCi = fvec::gather(vi, ka->nC, sizeof(flt_t));
+  fvec nHj = fvec::gather(vj, ka->nH, sizeof(flt_t));
+  fvec nCj = fvec::gather(vj, ka->nC, sizeof(flt_t));
+  fvec Nij = nHi +  nCi -  wij;
+  fvec Nji = nHj +  nCj -  wij;
+  i_data.nHi = nHi;
+  i_data.nCi = nCi;
+  j_data.nHi = nHj;
+  j_data.nCi = nCj;
+
+  fvec the_r = fvec::set1(ka->params.rcmin[itype][jtype]);
+  fvec scale = the_r / rij;
+
+  fvec NconjtmpI;
+  fvec pij = aut_frebo_pij_pd_2(ka, &i_data, itype, jtype, vi, vj, 
+				delx * scale, dely * scale, delz * scale, 
+				the_r, wij, VA, &NconjtmpI, fij);
+
+  if (i_data.buf_len < 0) goto exceed_limits;
+
+  fvec NconjtmpJ;
+  fvec rjix = fvec::setzero() -  delx;
+  fvec rjiy = fvec::setzero() -  dely;
+  fvec rjiz = fvec::setzero() -  delz;
+  fvec pji = aut_frebo_pij_pd_2(ka, &j_data, jtype, itype, vj, vi, 
+				rjix * scale, rjiy * scale, rjiz * scale, 
+				the_r, wij, VA, &NconjtmpJ, fji);
+  fij[0] = fij[0] -  fji[0];
+  fij[1] = fij[1] -  fji[1];
+  fij[2] = fij[2] -  fji[2];
+
+  if (j_data.buf_len < 0) goto exceed_limits;
+
+  fvec Nijconj = c_1_0 +  NconjtmpI *  NconjtmpI +  NconjtmpJ *  NconjtmpJ;
+  fvec dN3[3];
+  fvec pi_rc = aut_frebo_pi_rc_pd(ka, itype, jtype, Nij, Nji, Nijconj, dN3);
+
+  fvec c_TOL = fvec::set1(TOL);
+  fvec dN3_dh[3];
+  fvec Tij = aut_frebo_Tij(ka, itype, jtype, Nij, Nji, Nijconj, &dN3_dh[0]);
+  bvec TijgtTOLmask = fvec::cmpnle(fvec::abs(Tij), c_TOL);
+  fvec sum_omega = fvec::setzero();
+  if (bvec::test_any_set(TijgtTOLmask)) {
+    sum_omega = aut_frebo_sum_omega(
+        ka, &i_data, &j_data, itype, jtype, vi, vj,
+        delx * scale, dely * scale, delz * scale, the_r, VA *  Tij, fij);
+    sum_omega = fvec::mask_blend(TijgtTOLmask, fvec::setzero(), sum_omega);
+  }
+  fvec pi_dh = Tij *  sum_omega;
+
+  fvec bij = c_0_5 * ( pij +  pji) + pi_rc +  pi_dh;
+
+  fvec p_bLJmin = fvec::set1(ka->params.bLJmin[itype][jtype]);
+  fvec p_bLJmax = fvec::set1(ka->params.bLJmax[itype][jtype]);
+  fvec dStb, Stb = aut_Sp2_deriv(bij, p_bLJmin, p_bLJmax, &dStb);
+
+  bvec need_bo_deriv = fvec::cmpneq(dStb, fvec::setzero());
+  // fix up j_data, i_data, fij:
+  // multiply each by dStb
+  if (bvec::test_any_set(need_bo_deriv)) {
+    i_data.force_i_x = dStb * i_data.force_i_x;
+    i_data.force_i_y = dStb * i_data.force_i_y;
+    i_data.force_i_z = dStb * i_data.force_i_z;
+    i_data.force_j_x = dStb * i_data.force_j_x;
+    i_data.force_j_y = dStb * i_data.force_j_y;
+    i_data.force_j_z = dStb * i_data.force_j_z;
+    j_data.force_i_x = dStb * j_data.force_i_x;
+    j_data.force_i_y = dStb * j_data.force_i_y;
+    j_data.force_i_z = dStb * j_data.force_i_z;
+    j_data.force_j_x = dStb * j_data.force_j_x;
+    j_data.force_j_y = dStb * j_data.force_j_y;
+    j_data.force_j_z = dStb * j_data.force_j_z;
+    for (int k = 0; k < i_data.buf_len; k++) {
+      i_data.force_k_x_buf[k] = dStb * i_data.force_k_x_buf[k];
+      i_data.force_k_y_buf[k] = dStb * i_data.force_k_y_buf[k];
+      i_data.force_k_z_buf[k] = dStb * i_data.force_k_z_buf[k];
+    }
+    for (int k = 0; k < j_data.buf_len; k++) {
+      j_data.force_k_x_buf[k] = dStb * j_data.force_k_x_buf[k];
+      j_data.force_k_y_buf[k] = dStb * j_data.force_k_y_buf[k];
+      j_data.force_k_z_buf[k] = dStb * j_data.force_k_z_buf[k];
+    }
+    fvec fijc[3];
+    fijc[0] = dStb * fij[0];
+    fijc[1] = dStb * fij[1];
+    fijc[2] = dStb * fij[2];
+    fij[0] = scale * (fijc[0] - (delx * delx * fijc[0] + dely * delx * 
+				 fijc[1] + delz * delx * fijc[2]) / rsq);
+    fij[1] = scale * (fijc[1] - (delx * dely * fijc[0] + dely * dely * 
+				 fijc[1] + delz * dely * fijc[2]) / rsq);
+    fij[2] = scale * (fijc[2] - (delx * delz * fijc[0] + dely * delz * 
+				 fijc[1] + delz * delz * fijc[2]) / rsq);
+
+    aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, dStb * VA, 
+			     dN3[0], dN3[2], NconjtmpI);
+    aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, dStb * VA, 
+			     dN3[1], dN3[2], NconjtmpJ);
+    if (bvec::test_any_set(TijgtTOLmask)) {
+      aut_frebo_N_spline_force(ka, &i_data, itype, jtype, vi, vj, 
+			       dStb * VA * sum_omega, dN3_dh[0], dN3_dh[2], 
+			       NconjtmpI);
+      aut_frebo_N_spline_force(ka, &j_data, jtype, itype, vj, vi, 
+			       dStb * VA * sum_omega, dN3_dh[1], dN3_dh[2], 
+			       NconjtmpJ);
+    }
+
+    aut_frebo_data_writeback(ka, &i_data);
+    aut_frebo_data_writeback(ka, &j_data);
+  } else {
+    fij[0] = fvec::setzero();
+    fij[1] = fvec::setzero();
+    fij[2] = fvec::setzero();
+  }
+
+  fvec fpdVLJ = cij *  dVLJ * ( c_1_0 +  Str * ( Stb -  c_1_0));
+  fvec fpdStr = dStr *  cij * ( Stb *  VLJ -  VLJ);
+  fvec fpair = r2inv *  rij * ( fvec::setzero() - ( fpdVLJ +  fpdStr));
+  fvec evdwl = VA *  Stb +  cij *  VLJ * ( c_1_0 -  Str);
+
+  fvec result_f_i_x = fpair *  delx +  fij[0];
+  fvec result_f_i_y = fpair *  dely +  fij[1];
+  fvec result_f_i_z = fpair *  delz +  fij[2];
+  fvec result_f_j_x = fvec::setzero() -  result_f_i_x;
+  fvec result_f_j_y = fvec::setzero() -  result_f_i_y;
+  fvec result_f_j_z = fvec::setzero() -  result_f_i_z;
+
+  flt_t fi_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fi_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fi_i_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t fj_x_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_y_buf[fvec::VL] __attribute__((aligned(64)));
+  flt_t fj_z_buf[fvec::VL] __attribute__((aligned(64)));
+  int fj_j_buf[ivec::VL] __attribute__((aligned(64)));
+  flt_t evdwl_buf[fvec::VL] __attribute__((aligned(64)));
+
+  if (bvec::test_any_set(need_bo_deriv)) {
+    result_f_i_x = i_data.force_i_x +  result_f_i_x;
+    result_f_i_y = i_data.force_i_y +  result_f_i_y;
+    result_f_i_z = i_data.force_i_z +  result_f_i_z;
+    result_f_j_x = i_data.force_j_x +  result_f_j_x;
+    result_f_j_y = i_data.force_j_y +  result_f_j_y;
+    result_f_j_z = i_data.force_j_z +  result_f_j_z;
+
+    result_f_i_x = j_data.force_j_x +  result_f_i_x;
+    result_f_i_y = j_data.force_j_y +  result_f_i_y;
+    result_f_i_z = j_data.force_j_z +  result_f_i_z;
+    result_f_j_x = j_data.force_i_x +  result_f_j_x;
+    result_f_j_y = j_data.force_i_y +  result_f_j_y;
+    result_f_j_z = j_data.force_i_z +  result_f_j_z;
+  }
+
+  fvec::store(fi_x_buf, result_f_i_x);
+  fvec::store(fi_y_buf, result_f_i_y);
+  fvec::store(fi_z_buf, result_f_i_z);
+  ivec::store(fi_i_buf, vi);
+  fvec::store(fj_x_buf, result_f_j_x);
+  fvec::store(fj_y_buf, result_f_j_y);
+  fvec::store(fj_z_buf, result_f_j_z);
+  ivec::store(fj_j_buf, vj);
+  fvec::store(evdwl_buf, evdwl);
+
+  int lane;
+  for (lane = 0; lane < fvec::VL; lane++) {
+    int ii = fi_i_buf[lane];
+    result_f[ii].x += fi_x_buf[lane];
+    result_f[ii].y += fi_y_buf[lane];
+    result_f[ii].z += fi_z_buf[lane];
+    result_f[ii].w += 0.5 * evdwl_buf[lane];
+    int jj = fj_j_buf[lane];
+    result_f[jj].x += fj_x_buf[lane];
+    result_f[jj].y += fj_y_buf[lane];
+    result_f[jj].z += fj_z_buf[lane];
+    result_f[jj].w += 0.5 * evdwl_buf[lane];
+  }
+  ka->result_eng += fvec::reduce_add(evdwl);
+
+  if (bvec::test_any_set(need_path_force)) {
+    fvec dC = VLJ * ( Str *  Stb +  c_1_0 -  Str);
+    aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+  }
+  return;
+ }
+exceed_limits:
+  for (int l = 0; l < fvec::VL; l++) {
+    ref_lennard_jones_single_interaction(ka, ivec::at(i, l), ivec::at(j, l), 
+					 MORSEFLAG);
+  }
+  return;
+}
+
+/*
+ * Calculate the Lennard-Jones interaction.
+ * Uses the above hash-map, and outlines the calculation if the bond order is
+ *  needed.
+ * Aggressively compresses to get the most values calculated.
+ */
+template<int MORSEFLAG>
+static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) { // handles atoms [frebo_from_atom, frebo_to_atom); MORSEFLAG picks the functional form of the pair term
+  AtomAIREBOT<flt_t> * x = ka->x;
+  int * tag = ka->tag;
+  int * map = ka->map;
+  ResultForceT<acc_t> * result_f = ka->result_f; // NOTE(review): this local is not referenced below; the code uses ka->result_f
+  ivec c_i1 = ivec::set1(1);
+  ivec c_i4 = ivec::set1(4);
+  fvec c_1_0 = fvec::set1(1.0);
+  fvec c_2_0 = fvec::set1(2.0);
+  fvec c_0_0 = fvec::set1(0.0);
+  int map_i_scalar = 0; // bitmask: bit t is set when ka->map[t] is nonzero (t = 1 .. num_types-1)
+  {
+    int i;
+    for (i = 1; i < ka->num_types; i++) {
+      if (ka->map[i])
+        map_i_scalar |= (1 << i);
+    }
+  }
+  ivec map_i = ivec::set1(map_i_scalar);
+  fvec result_eng = fvec::setzero(); // per-lane energy sum; reduced into ka->result_eng at the end
+
+  struct aut_airebo_lj_test_path_result_data test_path_result;
+  for (int i = 0; i < OPT_TEST_PATH_SIZE; i++) {
+    test_path_result.i[i] = -1; // presumably -1 marks an unused slot -- confirm against aut_airebo_lj_test_all_paths
+  }
+
+  ivec i_bo[2][2]; // staging buffers, one per (itype, jtype), for pairs whose bond-order term is needed
+  ivec j_bo[2][2];
+  fvec cij_bo[2][2];
+  LennardJonesPathAIREBOT<flt_t> testpath_bo[2][2][fvec::VL];
+  int num_bo[2][2] = {0}; // number of pairs currently staged in each buffer
+
+  for (int i = ka->frebo_from_atom; i < ka->frebo_to_atom; i++) {
+    ivec itag_bc = ivec::set1(tag[i]); // NOTE(review): itag_bc is not referenced again in this function
+    int itype = map[x[i].w]; // indexes the [2][2] buffers above, so it is expected to be 0 or 1
+    fvec x_i = fvec::set1(x[i].x);
+    fvec y_i = fvec::set1(x[i].y);
+    fvec z_i = fvec::set1(x[i].z);
+    ivec i_bc = ivec::set1(i);
+
+    fvec cutljsq0 = fvec::set1(ka->params.cutljsq[itype][0]); // from here on: parameters broadcast once per atom, suffix 0/1 = second type index
+    fvec cutljsq1 = fvec::set1(ka->params.cutljsq[itype][1]);
+    fvec p_rcmax0 = fvec::set1(ka->params.rcmax[itype][0]);
+    fvec p_rcmax1 = fvec::set1(ka->params.rcmax[itype][1]);
+    flt_t sigcut = ka->params.sigcut;
+    flt_t sigmin = ka->params.sigmin;
+    flt_t sigma0 = ka->params.sigma[itype][0];
+    flt_t rljmax0 = sigcut * sigma0;
+    flt_t rljmin0 = sigmin * sigma0;
+    flt_t sigma1 = ka->params.sigma[itype][1];
+    flt_t rljmax1 = sigcut * sigma1;
+    flt_t rljmin1 = sigmin * sigma1;
+    fvec p_rljmax0 = fvec::set1(rljmax0);
+    fvec p_rljmax1 = fvec::set1(rljmax1);
+    fvec p_rljmin0 = fvec::set1(rljmin0);
+    fvec p_rljmin1 = fvec::set1(rljmin1);
+    fvec p_rcLJmax0 = fvec::set1(ka->params.rcLJmax[itype][0]);
+    fvec p_rcLJmax1 = fvec::set1(ka->params.rcLJmax[itype][1]);
+    fvec p_rcLJmin0 = fvec::set1(ka->params.rcLJmin[itype][0]);
+    fvec p_rcLJmin1 = fvec::set1(ka->params.rcLJmin[itype][1]);
+    fvec p_lj10 = fvec::set1(ka->params.lj1[itype][0]);
+    fvec p_lj11 = fvec::set1(ka->params.lj1[itype][1]);
+    fvec p_lj20 = fvec::set1(ka->params.lj2[itype][0]);
+    fvec p_lj21 = fvec::set1(ka->params.lj2[itype][1]);
+    fvec p_lj30 = fvec::set1(ka->params.lj3[itype][0]);
+    fvec p_lj31 = fvec::set1(ka->params.lj3[itype][1]);
+    fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
+    fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
+
+    int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
+    int jnum = ka->neigh_lmp.num_half[i];
+
+    bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);
+    if (! tap_success) { // fall back to ref_lennard_jones_single_interaction for every neighbor of i
+      for (int jj = 0; jj < jnum; jj++) {
+        ref_lennard_jones_single_interaction(ka, i, neighs[jj], MORSEFLAG);
+      }
+      continue;
+    }
+
+    ivec j_2; // *_2: carry-over of in-cutoff lanes that have not been processed yet (only written on the fast_compress path)
+    fvec delx_2, dely_2, delz_2, rsq_2;
+    bvec jtype_mask_2;
+    int num_2 = 0; // number of valid lanes in the carry-over
+
+    fvec result_f_i_x = fvec::setzero();
+    fvec result_f_i_y = fvec::setzero();
+    fvec result_f_i_z = fvec::setzero();
+
+    int jj = 0;
+    bool rest_j = jj < jnum; // neighbor chunks are left to load
+    bool rest_2 = fvec::fast_compress(); // the carry-over still has to be flushed once
+    #pragma forceinline recursive
+    while (rest_j || rest_2) {
+      fvec delx, dely, delz, rsq;
+      bvec jtype_mask, within_cutoff;
+      ivec j;
+      if (rest_j) {
+        bvec mask_0 = bvec::full();
+	// tail chunk: keep the low (jnum - jj) lanes, i.e. 0xFF >> (8 - (jnum - jj))
+        if (jj + (fvec::VL - 1) >= jnum) mask_0 = bvec::only(jnum - jj);
+        j = ivec::maskz_loadu(mask_0, &neighs[jj]);
+        fvec x_j, y_j, z_j;
+        aut_loadatoms_vec(x, j, &x_j, &y_j, &z_j, &jtype_mask, map, map_i, 
+			  c_i1);
+        fvec::gather_prefetch0(ivec::mullo(c_i4, 
+	  ivec::maskz_loadu(bvec::full(), &neighs[jj + fvec::VL])), x); // NOTE(review): unmasked load of the next chunk's indices -- confirm neighs is readable past jnum
+        _mm_prefetch((const char*)&neighs[jj + 2 * fvec::VL], _MM_HINT_T0);
+        delx = x_i -  x_j;
+        dely = y_i -  y_j;
+        delz = z_i -  z_j;
+        rsq = delx *  delx +  dely *  dely +  delz *  delz;
+        fvec cutoff_sq = fvec::mask_blend(jtype_mask, cutljsq0, cutljsq1); // per-lane cutoff chosen by jtype_mask
+        within_cutoff = fvec::mask_cmplt(mask_0, rsq, cutoff_sq);
+
+        if (fvec::fast_compress()) { // pack the in-cutoff lanes and append them to the carry-over
+          j = ivec::masku_compress(within_cutoff, j);
+          delx = fvec::masku_compress(within_cutoff, delx);
+          dely = fvec::masku_compress(within_cutoff, dely);
+          delz = fvec::masku_compress(within_cutoff, delz);
+          rsq = fvec::masku_compress(within_cutoff, rsq);
+          jtype_mask = bvec::masku_compress(within_cutoff, jtype_mask);
+          //within_cutoff = 0xFF >> (8 - _cc_popcnt(within_cutoff));
+
+          bvec mask_2 = bvec::after(num_2);//0xFF << num_2;
+          j_2 = ivec::mask_expand(j_2, mask_2, j);
+          delx_2 = fvec::mask_expand(delx_2, mask_2, delx);
+          dely_2 = fvec::mask_expand(dely_2, mask_2, dely);
+          delz_2 = fvec::mask_expand(delz_2, mask_2, delz);
+          rsq_2 = fvec::mask_expand(rsq_2, mask_2, rsq);
+          jtype_mask_2 = bvec::mask_expand(jtype_mask_2, mask_2, jtype_mask);
+          num_2 = num_2 + bvec::popcnt(within_cutoff);
+          if (num_2 < fvec::VL) { // carry-over not full yet: load the next chunk first
+            jj += fvec::VL;
+            rest_j = jj < jnum;
+            continue;
+          }
+
+          num_2 -= fvec::VL; // count of lanes that did not fit into the full vector
+	  //(0xFF >> (8 - num_2)) << (_cc_popcnt(within_cutoff) - num_2);
+          mask_2 = bvec::onlyafter(num_2, bvec::popcnt(within_cutoff) - num_2);
+          { // process the full carry-over now; the overflow lanes become the new carry-over
+            ivec tmp_j = j_2;
+            j_2 = ivec::masku_compress(mask_2, j);
+            j = tmp_j;
+            fvec tmp_delx = delx_2;
+            delx_2 = fvec::masku_compress(mask_2, delx);
+            delx = tmp_delx;
+            fvec tmp_dely = dely_2;
+            dely_2 = fvec::masku_compress(mask_2, dely);
+            dely = tmp_dely;
+            fvec tmp_delz = delz_2;
+            delz_2 = fvec::masku_compress(mask_2, delz);
+            delz = tmp_delz;
+            fvec tmp_rsq = rsq_2;
+            rsq_2 = fvec::masku_compress(mask_2, rsq);
+            rsq = tmp_rsq;
+            bvec tmp_jtype_mask = jtype_mask_2;
+            jtype_mask_2 = bvec::masku_compress(mask_2, jtype_mask);
+            jtype_mask = tmp_jtype_mask;
+            within_cutoff = bvec::full();
+          }
+        }
+      } else if (rest_2) { // neighbor list exhausted: flush the carry-over once
+        rest_2 = false;
+        j = j_2;
+        delx = delx_2;
+        dely = dely_2;
+        delz = delz_2;
+        rsq = rsq_2;
+        jtype_mask = jtype_mask_2;
+        within_cutoff = bvec::only(num_2);
+        num_2 = 0;
+      }
+
+      bvec current_mask = within_cutoff;
+      if (bvec::test_all_unset(current_mask)) { // nothing active in this vector
+        jj += fvec::VL;
+        rest_j = jj < jnum;
+        continue;
+      }
+
+      fvec rij = fvec::sqrt(rsq);
+      LennardJonesPathAIREBOT<flt_t> testpath[fvec::VL];
+      fvec cij = c_1_0;
+      fvec p_cut3rebo = fvec::set1(ka->params.cut3rebo);
+      bvec need_search = fvec::mask_cmplt(current_mask, rij, p_cut3rebo); // active lanes with rij < cut3rebo
+      if (bvec::test_any_set(need_search)) {
+        fvec p_rcmax = fvec::mask_blend(jtype_mask, p_rcmax0, p_rcmax1); // NOTE(review): p_rcmax is not used within its scope
+        #pragma noinline
+        cij = aut_airebo_lj_tap_test_path(ka, &test_path_result, need_search, 
+					  i_bc, j, testpath);
+      }
+      current_mask = fvec::mask_cmplt(current_mask, c_0_0, cij); // keep only lanes with cij > 0
+      if (bvec::test_all_unset(current_mask)) {
+        jj += fvec::VL;
+        rest_j = jj < jnum;
+        continue;
+      }
+      bvec need_path_force = fvec::mask_cmplt(current_mask, cij, c_1_0); // remaining lanes with cij < 1
+
+      fvec p_rljmax = fvec::mask_blend(jtype_mask, p_rljmax0, p_rljmax1);
+      fvec p_rljmin = fvec::mask_blend(jtype_mask, p_rljmin0, p_rljmin1);
+
+      fvec dslw, slw = aut_Sp2_deriv(rij, p_rljmin, p_rljmax, &dslw);
+
+      fvec p_lj1 = fvec::mask_blend(jtype_mask, p_lj10, p_lj11);
+      fvec p_lj2 = fvec::mask_blend(jtype_mask, p_lj20, p_lj21);
+      fvec p_lj3 = fvec::mask_blend(jtype_mask, p_lj30, p_lj31);
+      fvec p_lj4 = fvec::mask_blend(jtype_mask, p_lj40, p_lj41);
+
+      fvec vdw, dvdw;
+
+      fvec r2inv = fvec::recip(rsq);
+
+      if (MORSEFLAG) { // exponential form, built from exp(-rij * lj4)
+        fvec exr = fvec::exp(fvec::setzero() - rij * p_lj4);
+        vdw = p_lj1 * exr * (p_lj2 * exr - c_2_0);
+        dvdw = p_lj3 * exr * (c_1_0 - p_lj2 * exr);
+      } else { // inverse-power form, built from r^-6 and r^-12
+        fvec r6inv = r2inv *  r2inv *  r2inv;
+
+        vdw = r6inv * ( p_lj3 *  r6inv -  p_lj4);
+        fvec r7inv = r6inv *  rij *  r2inv;
+        dvdw = r7inv * ( p_lj2 -  p_lj1 *  r6inv);
+      }
+
+      fvec VLJ = vdw *  slw;
+      fvec dVLJ = dvdw *  slw +  vdw *  dslw; // product-rule combination of the two value/derivative pairs
+
+      fvec p_rcLJmin = fvec::mask_blend(jtype_mask, p_rcLJmin0, p_rcLJmin1);
+      fvec p_rcLJmax = fvec::mask_blend(jtype_mask, p_rcLJmax0, p_rcLJmax1);
+      fvec dStr, Str = aut_Sp2_deriv(rij, p_rcLJmin, p_rcLJmax, &dStr);
+      fvec VA = cij *  VLJ *  Str;
+      bvec need_bondorder = fvec::mask_cmplt(current_mask, c_0_0, Str); // active lanes with Str > 0
+      fvec Stb = fvec::setzero(); // never reassigned here; lanes that need a bond order are staged for aut_lj_with_bo
+      fvec fij[3]; // stays zero in this function
+      fij[0] = fvec::setzero();
+      fij[1] = fvec::setzero();
+      fij[2] = fvec::setzero();
+      if (bvec::test_any_set(need_bondorder)) {
+        for (int jtype = 0; jtype < 2; jtype++) { // stage the lanes of each neighbor type in its own buffer
+          bvec need_bo_with_jtype = need_bondorder;
+          if (jtype) need_bo_with_jtype = need_bo_with_jtype & jtype_mask;
+          else need_bo_with_jtype = need_bo_with_jtype & ~ jtype_mask;
+          ivec jtmp = ivec::masku_compress(need_bo_with_jtype, j);
+          ivec itmp = ivec::masku_compress(need_bo_with_jtype, ivec::set1(i));
+          fvec cijtmp = fvec::masku_compress(need_bo_with_jtype, cij);
+          bvec insert_mask = bvec::after(num_bo[itype][jtype]); // append behind the pairs already staged
+          i_bo[itype][jtype] = ivec::mask_expand(i_bo[itype][jtype], 
+						 insert_mask, itmp);
+          j_bo[itype][jtype] = ivec::mask_expand(j_bo[itype][jtype], 
+						 insert_mask, jtmp);
+          cij_bo[itype][jtype] = fvec::mask_expand(cij_bo[itype][jtype], 
+						   insert_mask, cijtmp);
+          bvec need_path_force_with_jtype = need_bo_with_jtype & 
+	    need_path_force;
+          int testpath_end = fvec::VL; // source lane at which the buffer became full (VL if it never did)
+          if (bvec::test_any_set(need_path_force_with_jtype)) {
+            int pos = num_bo[itype][jtype];
+            for (int l = 0; l < fvec::VL; l++) { // copy testpath entries into the slots their lanes were staged in
+              if (pos >= fvec::VL) {
+                testpath_end = l;
+                break;
+              }
+              if (bvec::test_at(need_path_force_with_jtype, l)) {
+                testpath_bo[itype][jtype][pos] = testpath[l];
+              }
+              if (bvec::test_at(need_bo_with_jtype, l)) {
+                pos += 1;
+              }
+            }
+          }
+          num_bo[itype][jtype] = num_bo[itype][jtype] + 
+	    bvec::popcnt(need_bo_with_jtype);
+          if (num_bo[itype][jtype] >= fvec::VL) { // buffer full: evaluate one whole vector of staged pairs
+            #pragma noinline
+            aut_lj_with_bo<MORSEFLAG>(ka, itype, jtype, i_bo[itype][jtype], 
+				      j_bo[itype][jtype], cij_bo[itype][jtype],
+				      testpath_bo[itype][jtype]);
+            num_bo[itype][jtype] -= fvec::VL; // what is left are the lanes that overflowed
+            insert_mask = bvec::onlyafter(num_bo[itype][jtype], 
+					  bvec::popcnt(need_bo_with_jtype) - 
+					  num_bo[itype][jtype]);
+            i_bo[itype][jtype] = ivec::masku_compress(insert_mask, itmp);
+            j_bo[itype][jtype] = ivec::masku_compress(insert_mask, jtmp);
+            cij_bo[itype][jtype] = fvec::masku_compress(insert_mask, cijtmp);
+            if (bvec::test_any_set(need_path_force_with_jtype)) {
+              int pos = 0;
+              for (int l = testpath_end; l < fvec::VL; l++) { // same copy for the testpath entries of the overflow lanes
+                if (bvec::test_at(need_path_force_with_jtype, l)) {
+                  testpath_bo[itype][jtype][pos] = testpath[l];
+                }
+                if (bvec::test_at(need_bo_with_jtype, l)) {
+                  pos += 1;
+                }
+              }
+            }
+          }
+        }
+        current_mask = current_mask & ~ need_bondorder; // staged lanes take no part in the direct update below
+        need_path_force = need_path_force & ~ need_bondorder;
+      }
+
+      fvec fpdVLJ = cij *  dVLJ * ( c_1_0 +  Str * ( Stb -  c_1_0));
+      fvec fpdStr = dStr *  cij * ( Stb *  VLJ -  VLJ);
+      fvec fpair = r2inv *  rij * ( fvec::setzero() - ( fpdVLJ +  fpdStr));
+      fvec evdwl = VA *  Stb +  cij *  VLJ * ( c_1_0 -  Str);
+
+      fvec fix = fpair *  delx +  fij[0];
+      fvec fiy = fpair *  dely +  fij[1];
+      fvec fiz = fpair *  delz +  fij[2];
+      result_f_i_x = fvec::mask_add(result_f_i_x, current_mask, result_f_i_x, 
+				    fix);
+      result_f_i_y = fvec::mask_add(result_f_i_y, current_mask, result_f_i_y, 
+				    fiy);
+      result_f_i_z = fvec::mask_add(result_f_i_z, current_mask, result_f_i_z, 
+				    fiz);
+      result_eng = fvec::mask_add(result_eng, current_mask, result_eng, evdwl);
+
+      ivec j_dbl_idx = ivec::mullo(j, c_i4); // index scaled by sizeof(acc_t) below: assumes 4 acc_t per result_f entry -- confirm against ResultForceT
+      avec fjx = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].x, sizeof(acc_t));
+      avec fjy = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].y, sizeof(acc_t));
+      avec fjz = avec::mask_gather(avec::undefined(), current_mask, j_dbl_idx, 
+				   &ka->result_f[0].z, sizeof(acc_t));
+
+      fjx = fjx -  fix; // neighbor receives the opposite of what is added to atom i
+      fjy = fjy -  fiy;
+      fjz = fjz -  fiz;
+      avec::mask_i32loscatter(&ka->result_f[0].x, current_mask, j_dbl_idx, fjx, 
+			      sizeof(acc_t));
+      avec::mask_i32loscatter(&ka->result_f[0].y, current_mask, j_dbl_idx, fjy, 
+			      sizeof(acc_t));
+      avec::mask_i32loscatter(&ka->result_f[0].z, current_mask, j_dbl_idx, fjz, 
+			      sizeof(acc_t));
+
+      if (bvec::test_any_set(need_path_force)) {
+        fvec dC = VLJ * ( Str *  Stb +  c_1_0 -  Str);
+        #pragma noinline
+        aut_airebo_lj_force_path(ka, need_path_force, dC, testpath);
+      }
+      jj += fvec::VL;
+      rest_j = jj < jnum;
+    }
+    ka->result_f[i].x += fvec::reduce_add(result_f_i_x); // fold the per-lane sums into atom i
+    ka->result_f[i].y += fvec::reduce_add(result_f_i_y);
+    ka->result_f[i].z += fvec::reduce_add(result_f_i_z);
+  }
+  for (int itype = 0; itype < 2; itype++) { // pairs still staged after the atom loop go through ref_lennard_jones_single_interaction
+    for (int jtype = 0; jtype < 2; jtype++) {
+      for (int l = 0; l < num_bo[itype][jtype]; l++) {
+        ref_lennard_jones_single_interaction(ka,ivec::at(i_bo[itype][jtype],l),
+					     ivec::at(j_bo[itype][jtype], l),
+					     MORSEFLAG);
+      }
+    }
+  }
+  ka->result_eng += fvec::reduce_add(result_eng);
+}
+
+};
+
+template<typename flt_t, typename acc_t>
+void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka, int morseflag) { // entry point: LMP_INTEL_AIREBO_REF selects the ref_ routine, otherwise aut_wrap is used
+#ifdef LMP_INTEL_AIREBO_REF
+  ref_lennard_jones(ka, morseflag);
+#else
+  if (morseflag) { // turn the runtime flag into the MORSEFLAG template argument
+    aut_wrap<flt_t,acc_t>::template aut_lennard_jones<1>(ka);
+  } else {
+    aut_wrap<flt_t,acc_t>::template aut_lennard_jones<0>(ka);
+  }
+#endif
+}
+
+template<typename flt_t, typename acc_t>
+void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) { // entry point: forwards ka unchanged to one of two implementations
+#ifdef LMP_INTEL_AIREBO_REF
+  ref_rebo_neigh(ka); // chosen when LMP_INTEL_AIREBO_REF is defined
+#else
+  aut_wrap<flt_t,acc_t>::aut_rebo_neigh(ka); // default build
+#endif
+}
+
+template<typename flt_t, typename acc_t>
+void aut_frebo(KernelArgsAIREBOT<flt_t,acc_t> * ka, int torsion_flag) { // entry point: forwards ka and torsion_flag unchanged
+#ifdef LMP_INTEL_AIREBO_REF
+  ref_frebo(ka, torsion_flag); // chosen when LMP_INTEL_AIREBO_REF is defined
+#else
+  aut_wrap<flt_t,acc_t>::aut_frebo(ka, torsion_flag); // default build
+#endif
+}
+
+#ifdef __INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+}
+
diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3179c09f1d5db7b8fc292e83a61cbf7dd13f12b
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_intel.h
@@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/intel,PairAIREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_INTEL_H
+#define LMP_PAIR_AIREBO_INTEL_H
+
+#include "pair.h"
+#include "fix_intel.h"
+#include "pair_airebo.h"
+//#include "airebo_common.h"
+
+namespace LAMMPS_NS {
+
+template<class flt_t, class acc_t>
+struct PairAIREBOIntelParam; // forward declaration; in this header it is only named as the return type of get_param()
+
+class PairAIREBOIntel : public PairAIREBO { // registered above through PairStyle as airebo/intel
+ public:
+  PairAIREBOIntel(class LAMMPS *);
+  virtual ~PairAIREBOIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+ protected:
+
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers); // overload templated on flt_t/acc_t, taking the matching buffers
+
+  template <int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+	    IntelBuffers<flt_t,acc_t> * buffers,
+	    const int astart, const int aend); // presumably handles atoms astart..aend -- confirm in the .cpp
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(IntelBuffers<flt_t,acc_t> * buffers);
+
+  template <class flt_t, class acc_t>
+  PairAIREBOIntelParam<flt_t,acc_t> get_param(); // returns the parameter struct by value
+
+  FixIntel * fix; // NOTE(review): ownership and lifetime are not visible in this header
+  int _cop;
+
+  int * REBO_cnumneigh; // raw int buffers; where they are allocated and released is not visible in this header
+  int * REBO_num_skin;
+  int * REBO_list_data;
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair style AIREBO requires atom IDs
+
+This is a requirement to use the AIREBO potential.
+
+E: Pair style AIREBO requires newton pair on
+
+See the newton command.  This is a restriction to use the AIREBO
+potential.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Neighbor list overflow, boost neigh_modify one
+
+There are too many neighbors of a single atom.  Use the neigh_modify
+command to increase the max number of neighbors allowed for one atom.
+You may also want to boost the page size.
+
+E: Cannot open AIREBO potential file %s
+
+The specified AIREBO potential file cannot be opened.  Check that the
+path and name are correct.
+
+*/
diff --git a/src/USER-INTEL/pair_airebo_morse_intel.cpp b/src/USER-INTEL/pair_airebo_morse_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c0f3b8ed0f4089acb2556aac48e0f2ecd473243
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_morse_intel.cpp
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_airebo_morse_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairAIREBOMorseIntel::PairAIREBOMorseIntel(LAMMPS *lmp) 
+  : PairAIREBOIntel(lmp) {} // empty body: construction is delegated entirely to PairAIREBOIntel
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairAIREBOMorseIntel::settings(int narg, char **arg)
+{
+  PairAIREBOIntel::settings(narg,arg); // arguments are passed through unchanged to the base-class implementation
+
+  morseflag = 1; // assigned after the base call; presumably what selects the Morse variant -- confirm where morseflag is read
+}
diff --git a/src/USER-INTEL/pair_airebo_morse_intel.h b/src/USER-INTEL/pair_airebo_morse_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..5210ea80ee782cd1fa07362a57a9c1087694a88e
--- /dev/null
+++ b/src/USER-INTEL/pair_airebo_morse_intel.h
@@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/morse/intel,PairAIREBOMorseIntel)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_MORSE_INTEL_H
+#define LMP_PAIR_AIREBO_MORSE_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairAIREBOMorseIntel : public PairAIREBOIntel { // registered above through PairStyle as airebo/morse/intel
+ public:
+  PairAIREBOMorseIntel(class LAMMPS *);
+  virtual void settings(int, char **); // the only member function declared here besides the constructor
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_eam_alloy_intel.cpp b/src/USER-INTEL/pair_eam_alloy_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4f47c7ee23967fd11c5042316b583b8157b3b19f
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_alloy_intel.cpp
@@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_alloy_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyIntel::PairEAMAlloyIntel(LAMMPS *lmp) : PairEAMIntel(lmp)
+{
+  one_coeff = 1;  // NOTE(review): presumably restricts the style to one "* *" pair_coeff line, as coeff() enforces - confirm
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for all type pairs from a single "* *" command
+   args = * * setfl-file elem_1 ... elem_ntypes (an elem may be NULL)
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // ensure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // free any previously read setfl data, then read EAM setfl file
+
+  if (setfl) {
+    for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // read args that map atom types to elements in potential file
+  // map[i-2] = element index for atom type i-2 (arg[3] is type 1), -1 if NULL
+
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < setfl->nelements; j++)
+      if (strcmp(arg[i],setfl->elements[j]) == 0) break;
+    if (j < setfl->nelements) map[i-2] = j;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j; count = number of pairs that were set
+
+  int count = 0;
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(FLERR,i,setfl->mass[map[i]]);
+        count++;
+      }
+      scale[i][j] = 1.0;  // assigned for every i <= j pair, mapped or not
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file on proc 0, broadcast to all procs
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::read_file(char *filename)
+{
+  Setfl *file = setfl;
+
+  // open potential file (proc 0 only)
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      char str[128];  // snprintf truncates an over-long filename instead of overflowing
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // first 3 lines are overwritten unread; 4th (nelements) line is broadcast
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];  // +1 slot for the terminating NULL token
+  nwords = 0;
+  strtok(line," \t\n\r\f");  // discard the leading element count
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+
+  int i,j,tmp;  // tables are filled from index 1, hence the +1 sizes above
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);  // first field is parsed but unused
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {  // only the lower triangle (j <= i) is read
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential into the frho/rhor/z2r arrays and type maps
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyIntel::file2array()
+{
+  int i,j,m,n;
+  int ntypes = atom->ntypes;
+
+  // set function params directly from setfl file
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // setup frho arrays
+  // ------------------------------------------------------------------
+
+  // allocate frho arrays
+  // nfrho = # of setfl elements + 1 for zero array
+
+  nfrho = setfl->nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // copy each element's frho to global frho (entries 1..nrho)
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];
+
+  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
+  // then map it to last frho array of zeroes
+
+  for (i = 1; i <= ntypes; i++)
+    if (map[i] >= 0) type2frho[i] = map[i];
+    else type2frho[i] = nfrho-1;
+
+  // ------------------------------------------------------------------
+  // setup rhor arrays
+  // ------------------------------------------------------------------
+
+  // allocate rhor arrays
+  // nrhor = # of setfl elements
+
+  nrhor = setfl->nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // copy each element's rhor to global rhor (entries 1..nr)
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for setfl files, I,J mapping only depends on I
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (i = 1; i <= ntypes; i++)
+    for (j = 1; j <= ntypes; j++)
+      type2rhor[i][j] = map[i];
+
+  // ------------------------------------------------------------------
+  // setup z2r arrays
+  // ------------------------------------------------------------------
+
+  // allocate z2r arrays
+  // nz2r = N*(N+1)/2 where N = # of setfl elements
+
+  nz2r = setfl->nelements * (setfl->nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  n = 0;
+  for (i = 0; i < setfl->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
+      n++;
+    }
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = n = irow*(irow+1)/2 + icol (rows above irow, then column offset)
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  int irow,icol;
+  for (i = 1; i <= ntypes; i++) {
+    for (j = 1; j <= ntypes; j++) {
+      irow = map[i];
+      icol = map[j];
+      if (irow == -1 || icol == -1) {
+        type2z2r[i][j] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        irow = map[j];
+        icol = map[i];
+      }
+      n = 0;
+      for (m = 0; m < irow; m++) n += m + 1;
+      n += icol;
+      type2z2r[i][j] = n;
+    }
+  }
+}
diff --git a/src/USER-INTEL/pair_eam_alloy_intel.h b/src/USER-INTEL/pair_eam_alloy_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..4967c3709d5af20aaa78a1c1c4c4c9c7259d5586
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_alloy_intel.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/intel,PairEAMAlloyIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_ALLOY_INTEL_H
+#define LMP_PAIR_EAM_ALLOY_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/alloy/opt inherits from it
+
+class PairEAMAlloyIntel : virtual public PairEAMIntel {  // registered above as pair style eam/alloy/intel
+ public:
+  PairEAMAlloyIntel(class LAMMPS *);
+  virtual ~PairEAMAlloyIntel() {}
+  void coeff(int, char **);  // args: * * setfl-file, then one element name (or NULL) per atom type
+
+ protected:
+  void read_file(char *);  // reads the setfl file on proc 0 and broadcasts its contents
+  void file2array();  // copies the setfl data into frho/rhor/z2r and the type2* maps
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_eam_fs_intel.cpp b/src/USER-INTEL/pair_eam_fs_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cfcc8200cc751a8f3aafde06e79d4a6827a59900
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_fs_intel.cpp
@@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_eam_fs_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMFSIntel::PairEAMFSIntel(LAMMPS *lmp) : PairEAMIntel(lmp)
+{
+  one_coeff = 1;  // NOTE(review): presumably restricts the style to one "* *" pair_coeff line, as coeff() enforces - confirm
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for all type pairs from a single "* *" command
+   args = * * fs-file elem_1 ... elem_ntypes (an elem may be NULL)
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // ensure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // free any previously read fs data, then read EAM Finnis-Sinclair file
+
+  if (fs) {
+    for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);
+
+  // read args that map atom types to elements in potential file
+  // map[i-2] = element index for atom type i-2 (arg[3] is type 1), -1 if NULL
+
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < fs->nelements; j++)
+      if (strcmp(arg[i],fs->elements[j]) == 0) break;
+    if (j < fs->nelements) map[i-2] = j;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j; count = number of pairs that were set
+
+  int count = 0;
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(FLERR,i,fs->mass[map[i]]);
+        count++;
+      }
+      scale[i][j] = 1.0;  // assigned for every i <= j pair, mapped or not
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element EAM Finnis-Sinclair file (rhor per element pair)
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::read_file(char *filename)
+{
+  Fs *file = fs;
+
+  // open potential file (proc 0 only)
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = force->open_potential(filename);
+    if (fptr == NULL) {
+      char str[128];  // snprintf truncates an over-long filename instead of overflowing
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // first 3 lines are overwritten unread; 4th (nelements) line is broadcast
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];  // +1 slot for the terminating NULL token
+  nwords = 0;
+  strtok(line," \t\n\r\f");  // discard the leading element count
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,
+                                              "pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+
+  int i,j,tmp;  // tables are filled from index 1, hence the +1 sizes above
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);  // first field is parsed but unused
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {  // full row of nelements rhor tables per element i
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {  // only the lower triangle (j <= i) is read
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in fs potential into the frho/rhor/z2r arrays and type maps
+------------------------------------------------------------------------- */
+
+void PairEAMFSIntel::file2array()
+{
+  int i,j,m,n;
+  int ntypes = atom->ntypes;
+
+  // set function params directly from fs file
+
+  nrho = fs->nrho;
+  nr = fs->nr;
+  drho = fs->drho;
+  dr = fs->dr;
+  rhomax = (nrho-1) * drho;
+
+  // ------------------------------------------------------------------
+  // setup frho arrays
+  // ------------------------------------------------------------------
+
+  // allocate frho arrays
+  // nfrho = # of fs elements + 1 for zero array
+
+  nfrho = fs->nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // copy each element's frho to global frho (entries 1..nrho)
+
+  for (i = 0; i < fs->nelements; i++)
+    for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m];
+
+  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
+  // then map it to last frho array of zeroes
+
+  for (i = 1; i <= ntypes; i++)
+    if (map[i] >= 0) type2frho[i] = map[i];
+    else type2frho[i] = nfrho-1;
+
+  // ------------------------------------------------------------------
+  // setup rhor arrays
+  // ------------------------------------------------------------------
+
+  // allocate rhor arrays
+  // nrhor = square of # of fs elements
+
+  nrhor = fs->nelements * fs->nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // copy each element pair rhor to global rhor, row-major: n = i*nelements + j
+
+  n = 0;
+  for (i = 0; i < fs->nelements; i++)
+    for (j = 0; j < fs->nelements; j++) {
+      for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m];
+      n++;
+    }
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for fs files, there is a full NxN set of rhor arrays, indexed as copied above
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (i = 1; i <= ntypes; i++)
+    for (j = 1; j <= ntypes; j++)
+      type2rhor[i][j] = map[i] * fs->nelements + map[j];
+
+  // ------------------------------------------------------------------
+  // setup z2r arrays
+  // ------------------------------------------------------------------
+
+  // allocate z2r arrays
+  // nz2r = N*(N+1)/2 where N = # of fs elements
+
+  nz2r = fs->nelements * (fs->nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  n = 0;
+  for (i = 0; i < fs->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m];
+      n++;
+    }
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = n = irow*(irow+1)/2 + icol (rows above irow, then column offset)
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  int irow,icol;
+  for (i = 1; i <= ntypes; i++) {
+    for (j = 1; j <= ntypes; j++) {
+      irow = map[i];
+      icol = map[j];
+      if (irow == -1 || icol == -1) {
+        type2z2r[i][j] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        irow = map[j];
+        icol = map[i];
+      }
+      n = 0;
+      for (m = 0; m < irow; m++) n += m + 1;
+      n += icol;
+      type2z2r[i][j] = n;
+    }
+  }
+}
diff --git a/src/USER-INTEL/pair_eam_fs_intel.h b/src/USER-INTEL/pair_eam_fs_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..da2ab9d2d74dbc353066128f0ca459779369e4da
--- /dev/null
+++ b/src/USER-INTEL/pair_eam_fs_intel.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/intel,PairEAMFSIntel)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_INTEL_H
+#define LMP_PAIR_EAM_FS_INTEL_H
+
+#include "pair_eam_intel.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/fs/opt inherits from it
+
+class PairEAMFSIntel : virtual public PairEAMIntel {  // registered above as pair style eam/fs/intel
+ public:
+  PairEAMFSIntel(class LAMMPS *);
+  virtual ~PairEAMFSIntel() {}
+  void coeff(int, char **);  // args: * * fs-file, then one element name (or NULL) per atom type
+
+ protected:
+  void read_file(char *);  // reads the Finnis-Sinclair file on proc 0 and broadcasts its contents
+  void file2array();  // copies the fs data into frho/rhor/z2r and the type2* maps
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index ed7dd424af34e665cdc97b05726c2b8d0ded39d4..3fbb58308b0d37bfeea2f918cf44de0e7e67a8e5 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -428,7 +428,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           } else
             multiple_forms = true;
         }
-        const int edge = (packed_j % pad_width);
+        const int edge = packed_j & (pad_width - 1);
         if (edge) {
           const int packed_end = packed_j + (pad_width - edge);
           #if defined(LMP_SIMD_COMPILER)
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0dc2c275e87d3d583da1b42a4a493ae04cdd4469
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp
@@ -0,0 +1,595 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_charmm_coul_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::PairLJCharmmCoulCharmmIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp)
+{
+  suffix_flag |= Suffix::INTEL;  // add the INTEL bit without clearing bits already set
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmIntel::~PairLJCharmmCoulCharmmIntel()
+{  // empty body: no cleanup is performed in this destructor itself
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag)
+{  // dispatch to the templated compute() matching the precision mode reported by fix
+  if (fix->precision()==FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else  // any other mode: float for both template parameters
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;  // cleared unconditionally once the templated call has returned
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::compute(int eflag, int vflag,
+                                        IntelBuffers<flt_t,acc_t> *buffers,
+                                        const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {  // otherwise no packing is done here
+    fix->start_watch(TIME_PACK);
+
+    int packthreads;  // thread count used for the packing region below
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // -------------------- Regular version: select eval<EFLAG,NEWTON_PAIR>
+  int ovflag = 0;  // 0 = neither flag set, 1 = vflag set, 2 = vflag_fdotr set
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);  // offload=1 over [0, offload_end)
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);  // offload=0 over [host_start, inum)
+    } else {
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
+                                     IntelBuffers<flt_t,acc_t> *buffers,
+                                     const ForceConst<flt_t> &fc,
+                                     const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  flt_t * _noalias const q = buffers->get_q(offload);
+
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+
+  const flt_t * _noalias const special_coul = fc.special_coul;
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
+  const flt_t inv_denom_coul = (flt_t)1.0/denom_coul;
+
+  const flt_t * _noalias const cutsq = fc.cutsq[0];
+  const LJ_T * _noalias const lj = fc.lj[0];
+  const flt_t cut_ljsq = fc.cut_ljsq;
+  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
+  const flt_t cut_coul_innersq = fc.cut_coul_innersq;
+  const flt_t cut_coulsq = fc.cut_coulsq;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccache_stride,nthreads,qqrd2e,inum,nall,ntypes,cut_coulsq) \
+    in(vflag,eatom,f_stride,separate_flag,offload) \
+    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
+    in(inv_denom_coul,cut_coul_innersq) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, q);
+
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      flt_t cutboth = cut_coulsq;
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      for (int i = iifrom; i < iito; i += iip) {
+        //        const int i = ilist[ii];
+        const int itype = x[i].w;
+
+        const int ptr_off = itype * ntypes;
+        const flt_t * _noalias const cutsqi = cutsq + ptr_off;
+        const LJ_T * _noalias const lji = lj + ptr_off;
+
+        const int   * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+        acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+        int ej = 0;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          const int j = jlist[jj] & NEIGHMASK;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq < cut_coulsq) {
+            trsq[ej]=rsq;
+            tdelx[ej]=delx;
+            tdely[ej]=dely;
+            tdelz[ej]=delz;
+            tjtype[ej]=x[j].w;
+            tj[ej]=jlist[jj];
+            ej++;
+          }
+        }
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) {
+          flt_t forcecoul, forcelj, evdwl;
+          forcecoul = forcelj = evdwl = (flt_t)0.0;
+
+          const int j = tj[jj] & NEIGHMASK;
+          const int sbindex = tj[jj] >> SBBITS & 3;
+          const flt_t rsq = trsq[jj];
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+	  const flt_t r_inv = (flt_t)1.0 / sqrt(rsq);
+	  forcecoul = qqrd2e * qtmp * q[j] * r_inv;
+	  if (rsq > cut_coul_innersq) {
+	    const flt_t ccr = cut_coulsq - rsq;
+	    const flt_t switch1 = ccr * ccr * inv_denom_coul *
+              (cut_coulsq + (flt_t)2.0 * rsq - (flt_t)3.0 * cut_coul_innersq);
+            forcecoul *= switch1; 
+          }
+
+          #ifdef INTEL_VMASK
+          if (rsq < cut_ljsq) {
+          #endif
+	    const int jtype = tjtype[jj];
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
+            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
+
+            #ifdef INTEL_VMASK
+            if (rsq > cut_lj_innersq) {
+            #endif
+              const flt_t drsq = cut_ljsq - rsq;
+              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
+              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
+                  inv_denom_lj;
+              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
+              if (EFLAG) {
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq) {
+                #endif
+                  forcelj = forcelj * switch1 + evdwl * switch2;
+                  evdwl *= switch1;
+                #ifndef INTEL_VMASK
+                }
+                #endif
+              } else {
+                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
+                    lji[jtype].w);
+                #ifndef INTEL_VMASK
+                if (rsq > cut_lj_innersq)
+                #endif
+                  forcelj =  forcelj * switch1 + philj * switch2;
+              }
+            #ifdef INTEL_VMASK
+            }
+            #endif
+
+          #ifdef INTEL_VMASK
+          }
+          #else
+          if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+          #endif
+	  if (sbindex) {
+  	    const flt_t factor_coul = special_coul[sbindex];
+	    forcecoul *= factor_coul;
+	    const flt_t factor_lj = special_lj[sbindex];
+	    forcelj *= factor_lj;
+	    if (EFLAG) evdwl *= factor_lj;
+          }
+
+          const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+          if (EFLAG) {
+            sevdwl += evdwl;
+            secoul += forcecoul;
+            if (eatom) {
+              fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+              if (NEWTON_PAIR)
+                f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * forcecoul;
+            }
+          }
+          if (NEWTON_PAIR == 0)
+            IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+                                  fpx, fpy, fpz);
+        } // for jj
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+    } // end of omp parallel region
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) {
+        oevdwl *= (acc_t)0.5;
+        oecoul *= (acc_t)0.5;
+      }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmIntel::init_style()
+{
+  PairLJCharmmCoulCharmm::init_style();
+  const int ireq = neighbor->nrequest - 1;   // index of the last neighbor request
+  neighbor->requests[ireq]->intel = 1;
+  if (force->newton_pair == 0) {
+    neighbor->requests[ireq]->full = 1;
+    neighbor->requests[ireq]->half = 0;
+  }
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  fix->pair_init_check();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _cop = fix->coprocessor_number();
+  #endif
+
+  // pack the force constants into the buffers matching the precision mode
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+// Fill fc with the squared cutoffs, special-bond factors and per-type LJ
+// coefficients, then push the tables to the coprocessor in offload builds.
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
+  int tp1 = atom->ntypes + 1;   // table dimension: indices 0..ntypes
+
+  fc.set_ntypes(tp1, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+         "Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic");
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        const double cut = init_one(i, j);
+        const double cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_coulsq = cut_coulsq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coul_innersq = cut_coul_innersq;
+  fc.cut_lj_innersq = cut_lj_innersq;
+
+  fc.special_lj[0] = fc.special_coul[0] = 1.0;   // entry 0 is forced to 1.0
+  for (int i = 1; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_coul[i] = force->special_coul[i];
+  }
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.lj[i][j].x = lj1[i][j];
+      fc.lj[i][j].y = lj2[i][j];
+      fc.lj[i][j].z = lj3[i][j];
+      fc.lj[i][j].w = lj4[i][j];
+      fc.cutsq[i][j] = cutsq[i][j];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;   // no coprocessor selected: host tables are enough
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---- size the per-type tables of fc for ntypes types (0 releases them) ---- */
+
+template <class flt_t>
+void PairLJCharmmCoulCharmmIntel::ForceConst<flt_t>::set_ntypes(
+  const int ntypes, Memory *memory, const int cop) {
+  if (ntypes != _ntypes) {  // nothing is (re)allocated when the size is unchanged
+    if (_ntypes > 0) {  // tables from an earlier call exist: release them first
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {  // cop < 0: skip the device free
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+          nocopy(ocutsq, olj: alloc_if(0) free_if(1))
+      }
+      #endif
+
+      _memory->destroy(cutsq);  // uses the Memory pointer saved by the earlier call
+      _memory->destroy(lj);
+    }
+    if (ntypes > 0) {
+      _cop = cop;  // kept so the destructor can pass it back to set_ntypes()
+      memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+      memory->create(lj,ntypes,ntypes,"fc.lj");
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      flt_t * ocutsq = cutsq[0];
+      typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+      int tp1sq = ntypes*ntypes;  // element count of each ntypes x ntypes table
+      if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL &&
+          ospecial_coul != NULL && cop >= 0) {  // device allocation only (nocopy)
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;  // recorded on every call, even when nothing was resized
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..64d6077477137db534a4ff13ccb6be33d314bb37
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/intel,PairLJCharmmCoulCharmmIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_INTEL_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmIntel : public PairLJCharmmCoulCharmm {
+
+ public:
+  PairLJCharmmCoulCharmmIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulCharmmIntel();
+
+  virtual void compute(int, int);
+  void init_style();   // finds fix "package_intel" and packs the force constants
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;              // assigned in init_style()
+  int _cop, _ccache_stride;   // coprocessor number; per-thread ccache offset stride
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // Force constants in precision flt_t, filled by pack_force_const()
+  template <class flt_t>
+  class ForceConst {
+   public:
+    _alignvar(flt_t special_coul[4],64);
+    _alignvar(flt_t special_lj[4],64);
+    flt_t **cutsq;                           // allocated by set_ntypes()
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_coul_innersq, cut_lj_innersq;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;   // allocated by set_ntypes()
+    // _cop/_memory get defined values so the destructor never reads garbage
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,NULL,_cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;    // current table dimension (0 = none); coprocessor number
+    Memory *_memory;      // Memory object passed to the latest set_ntypes() call
+  };
+  ForceConst<float> force_const_single;    // used for single and mixed precision
+  ForceConst<double> force_const_double;   // used for double precision
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel varient of lj/charmm/coul/charmm expects lj cutoff<=coulombic
+
+The intel accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_rebo_intel.cpp b/src/USER-INTEL/pair_rebo_intel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..006830a5fa41e004a31bf2a92e8fad2a55703c63
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.cpp
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#include "pair_rebo_intel.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---- constructor: forwards lmp to PairAIREBOIntel, adds no state ---- */
+
+PairREBOIntel::PairREBOIntel(LAMMPS *lmp) : PairAIREBOIntel(lmp) {}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairREBOIntel::settings(int narg, char **arg)
+{
+  // this style accepts no arguments
+  if (narg != 0) error->all(FLERR,"Illegal pair_style command");
+  // zero the LJ cutoff and clear the ljflag/torflag switches
+  torflag = 0;
+  ljflag = 0;
+  cutlj = 0.0;
+  // this one parameter for C-C interactions is different in REBO vs AIREBO
+  // see Favata, Micheletti, Ryu, Pugno, Comp Phys Comm (2016)
+  PCCf_2_0 = 0.0;
+}
diff --git a/src/USER-INTEL/pair_rebo_intel.h b/src/USER-INTEL/pair_rebo_intel.h
new file mode 100644
index 0000000000000000000000000000000000000000..e76279a2487c224683ca0aacdd5a118850d3fddc
--- /dev/null
+++ b/src/USER-INTEL/pair_rebo_intel.h
@@ -0,0 +1,40 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Markus Hohnerbach (RWTH)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(rebo/intel,PairREBOIntel)
+
+#else
+
+#ifndef LMP_PAIR_REBO_INTEL_H
+#define LMP_PAIR_REBO_INTEL_H
+
+#include "pair_airebo_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairREBOIntel : public PairAIREBOIntel {   // registered as "rebo/intel"
+ public:
+  PairREBOIntel(class LAMMPS *);
+  virtual void settings(int, char **);   // errors out unless given zero arguments
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 7a6b7afd92197b22cd26125369aaed8dc8dee090..fff104f39b8467fed7f9280bffd7954d7b480748 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -345,16 +345,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
             if (jj < jnumhalf) ejnumhalf++;
           }
         }
-        int ejnum_pad = ejnum;
-
-        while ( (ejnum_pad % pad_width) != 0) {
-          tdelx[ejnum_pad] = (flt_t)0.0;
-          tdely[ejnum_pad] = (flt_t)0.0;
-          tdelz[ejnum_pad] = (flt_t)0.0;
-          trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
-          tj[ejnum_pad] = nall;
-          if (!ONETYPE) tjtype[ejnum_pad] = 0;
-          ejnum_pad++;
+
+	int ejrem = ejnum & (pad_width - 1);
+	if (ejrem) ejrem = pad_width - ejrem;
+	const int ejnum_pad = ejnum + ejrem;
+	for (int jj = ejnum; jj < ejnum_pad; jj++) {
+          tdelx[jj] = (flt_t)0.0;
+          tdely[jj] = (flt_t)0.0;
+          tdelz[jj] = (flt_t)0.0;
+          trsq[jj] = p2[3].cutsq + (flt_t)1.0;
+          tj[jj] = nall;
+          if (!ONETYPE) tjtype[jj] = 0;
         }
 
         #if defined(LMP_SIMD_COMPILER)