From 9b9f6d6fe2f4559c7123a017171b2698bd3837d5 Mon Sep 17 00:00:00 2001
From: Steve Plimpton <sjplimp@sandia.gov>
Date: Fri, 16 Jun 2017 16:56:28 -0600
Subject: [PATCH] USER-INTEL upgrade from M Brown

---
 doc/src/JPG/user_intel.png                    |  Bin 14684 -> 14487 bytes
 doc/src/accelerate_intel.txt                  |  112 +-
 doc/src/fix_neb.txt                           |  261 +-
 doc/src/kspace_modify.txt                     |    3 +-
 doc/src/kspace_style.txt                      |    4 +
 doc/src/pair_lj_long.txt                      |    1 +
 examples/neb/in.neb.hop1                      |    2 +-
 .../{in.neb.hop1freeend => in.neb.hop1.end}   |    4 +-
 .../{initial.hop1freeend => initial.hop1.end} |    0
 src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi  |    2 +-
 .../OPTIONS/Makefile.intel_knl_coprocessor    |    2 +-
 src/MAKE/OPTIONS/Makefile.knl                 |    2 +-
 src/REPLICA/fix_neb.cpp                       |  188 +-
 src/USER-INTEL/README                         |   13 +-
 src/USER-INTEL/TEST/README                    |   57 +-
 src/USER-INTEL/TEST/in.intel.eam              |    3 +
 src/USER-INTEL/TEST/in.intel.lc               |    2 +
 src/USER-INTEL/TEST/in.intel.lj               |    2 +
 src/USER-INTEL/TEST/in.intel.rhodo            |    4 +-
 src/USER-INTEL/TEST/in.intel.sw               |    2 +
 src/USER-INTEL/TEST/in.intel.tersoff          |    2 +
 src/USER-INTEL/TEST/in.intel.water            |    2 +
 src/USER-INTEL/TEST/in.lc_generate_restart    |   12 +-
 src/USER-INTEL/TEST/run_benchmarks.sh         |   86 +
 src/USER-INTEL/angle_charmm_intel.cpp         |  113 +-
 src/USER-INTEL/angle_harmonic_intel.cpp       |  113 +-
 src/USER-INTEL/bond_fene_intel.cpp            |  101 +-
 src/USER-INTEL/bond_harmonic_intel.cpp        |   98 +-
 src/USER-INTEL/dihedral_charmm_intel.cpp      |  270 +-
 src/USER-INTEL/dihedral_harmonic_intel.cpp    |   88 +-
 src/USER-INTEL/dihedral_opls_intel.cpp        |   88 +-
 src/USER-INTEL/fix_intel.cpp                  |  166 +-
 src/USER-INTEL/fix_intel.h                    |   26 +-
 src/USER-INTEL/improper_cvff_intel.cpp        |  176 +-
 src/USER-INTEL/improper_harmonic_intel.cpp    |  121 +-
 src/USER-INTEL/intel_buffers.cpp              |  132 +-
 src/USER-INTEL/intel_buffers.h                |   25 +-
 src/USER-INTEL/intel_preprocess.h             |  548 ++-
 src/USER-INTEL/intel_simd.h                   |    8 +-
 src/USER-INTEL/nbin_intel.cpp                 |   20 +-
 src/USER-INTEL/npair_full_bin_intel.cpp       |  505 +--
 src/USER-INTEL/npair_full_bin_intel.h         |    5 +-
 .../npair_half_bin_newtoff_intel.cpp          |  451 ---
 src/USER-INTEL/npair_half_bin_newtoff_intel.h |   52 -
 .../npair_half_bin_newton_intel.cpp           |  534 +--
 src/USER-INTEL/npair_half_bin_newton_intel.h  |    3 -
 .../npair_half_bin_newton_tri_intel.cpp       |  435 +--
 .../npair_half_bin_newton_tri_intel.h         |    3 -
 src/USER-INTEL/npair_intel.cpp                |  872 ++++-
 src/USER-INTEL/npair_intel.h                  |    8 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.cpp   |  195 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.h     |    2 +-
 src/USER-INTEL/pair_buck_coul_long_intel.cpp  |  364 +-
 src/USER-INTEL/pair_buck_coul_long_intel.h    |    4 +-
 src/USER-INTEL/pair_buck_intel.cpp            |  199 +-
 src/USER-INTEL/pair_buck_intel.h              |    2 +-
 src/USER-INTEL/pair_eam_intel.cpp             |  451 +--
 src/USER-INTEL/pair_eam_intel.h               |    4 +-
 src/USER-INTEL/pair_gayberne_intel.cpp        |  273 +-
 src/USER-INTEL/pair_gayberne_intel.h          |    2 +-
 .../pair_lj_charmm_coul_long_intel.cpp        |  209 +-
 .../pair_lj_charmm_coul_long_intel.h          |    2 +-
 .../pair_lj_cut_coul_long_intel.cpp           |  375 +-
 src/USER-INTEL/pair_lj_cut_coul_long_intel.h  |    4 +-
 src/USER-INTEL/pair_lj_cut_intel.cpp          |  250 +-
 src/USER-INTEL/pair_lj_cut_intel.h            |    3 +-
 .../pair_lj_long_coul_long_intel.cpp          |   50 +
 src/USER-INTEL/pair_lj_long_coul_long_intel.h |   39 +
 src/USER-INTEL/pair_sw_intel.cpp              |  371 +-
 src/USER-INTEL/pair_sw_intel.h                |    2 +-
 src/USER-INTEL/pair_tersoff_intel.cpp         |  223 +-
 src/USER-INTEL/pair_tersoff_intel.h           |    2 +-
 src/USER-INTEL/pppm_disp_intel.cpp            | 3034 +++++++++++++++++
 src/USER-INTEL/pppm_disp_intel.h              |  238 ++
 src/USER-INTEL/pppm_intel.cpp                 | 1072 ++++--
 src/USER-INTEL/pppm_intel.h                   |   67 +-
 src/USER-INTEL/verlet_lrt_intel.cpp           |    6 +-
 src/USER-INTEL/verlet_lrt_intel.h             |    2 +-
 src/atom.cpp                                  |   51 +
 79 files changed, 8553 insertions(+), 4670 deletions(-)
 rename examples/neb/{in.neb.hop1freeend => in.neb.hop1.end} (91%)
 rename examples/neb/{initial.hop1freeend => initial.hop1.end} (100%)
 create mode 100755 src/USER-INTEL/TEST/run_benchmarks.sh
 delete mode 100644 src/USER-INTEL/npair_half_bin_newtoff_intel.cpp
 delete mode 100644 src/USER-INTEL/npair_half_bin_newtoff_intel.h
 create mode 100644 src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
 create mode 100644 src/USER-INTEL/pair_lj_long_coul_long_intel.h
 create mode 100644 src/USER-INTEL/pppm_disp_intel.cpp
 create mode 100644 src/USER-INTEL/pppm_disp_intel.h

diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 0ebb2d1ae08cdd8ddd0d150f29d0da7b12e5520d..302b50124a0429d0f64df1a9979a5265051f8112 100755
GIT binary patch
(base85-encoded image data omitted)

diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index d629828f12..ed9e4ae833 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -30,8 +30,8 @@ Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
 Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne,
-charmm/coul/long, lj/cut, lj/cut/coul/long, sw, tersoff :l
-K-Space Styles: pppm :l
+charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l
+K-Space Styles: pppm, pppm/disp :l
 :ule
 
 [Speed-ups to expect:]
@@ -42,62 +42,88 @@ precision mode. Performance improvements are shown compared to
 LAMMPS {without using other acceleration packages} as these are
 under active development (and subject to performance changes). The
 measurements were performed using the input files available in
-the src/USER-INTEL/TEST directory. These are scalable in size; the
-results given are with 512K particles (524K for Liquid Crystal).
-Most of the simulations are standard LAMMPS benchmarks (indicated
-by the filename extension in parenthesis) with modifications to the
-run length and to add a warmup run (for use with offload
-benchmarks).
+the src/USER-INTEL/TEST directory with the provided run script. 
+These are scalable in size; the results given are with 512K 
+particles (524K for Liquid Crystal). Most of the simulations are 
+standard LAMMPS benchmarks (indicated by the filename extension in
+parenthesis) with modifications to the run length and to add a 
+warmup run (for use with offload benchmarks).
 
 :c,image(JPG/user_intel.png)
 
 Results are speedups obtained on Intel Xeon E5-2697v4 processors
 (code-named Broadwell) and Intel Xeon Phi 7250 processors
-(code-named Knights Landing) with "18 Jun 2016" LAMMPS built with
-Intel Parallel Studio 2016 update 3. Results are with 1 MPI task
+(code-named Knights Landing) with "June 2017" LAMMPS built with
+Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
 per physical core. See {src/USER-INTEL/TEST/README} for the raw
 simulation rates and instructions to reproduce.
 
 :line
 
+[Accuracy and order of operations:]
+
+In most molecular dynamics software, parallelization parameters (the
+number of MPI tasks, OpenMP threads, and vectorization) can change
+the results due to changing the order of operations with
+finite-precision calculations. The USER-INTEL package is
+deterministic: results should be reproducible from run to run with
+the {same} parallel configuration and when using deterministic
+libraries or library settings (MPI, OpenMP, FFT). However, there are
+differences in the USER-INTEL package that can change the order of
+operations compared to LAMMPS without acceleration:
+
+Neighbor lists can be created in a different order :ulb,l
+Bins used for sorting atoms can be oriented differently :l
+The default stencil order for PPPM is 7. By default, LAMMPS will
+calculate other PPPM parameters to fit the desired accuracy with
+this order :l
+The {newton} setting applies to all atoms, not just atoms shared
+between MPI tasks :l
+Vectorization can change the order for adding pairwise forces :l
+:ule
+
+The precision mode (described below) used with the USER-INTEL 
+package can change the {accuracy} of the calculations. For the 
+default {mixed} precision option, calculations between pairs or 
+triplets of atoms are performed in single precision, intended to 
+be within the inherent error of MD simulations. All accumulation
+is performed in double precision to prevent the error from growing 
+with the number of atoms in the simulation. {Single} precision
+mode should not be used without appropriate validation.
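+
+For example, to run the entire calculation in double precision when
+validating results, the precision can be selected at run time with
+the {mode} option of the "package intel"_package.html command (the
+thread count here is illustrative):
+
+-pk intel 0 omp 2 mode double -sf intel :pre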
+
+:line
+
 [Quick Start for Experienced Users:]
 
 LAMMPS should be built with the USER-INTEL package installed.
 Simulations should be run with 1 MPI task per physical {core},
 not {hardware thread}.
 
-For Intel Xeon CPUs:
-
 Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l
-If using {kspace_style pppm} in the input script, add "neigh_modify binsize cutoff" and "kspace_modify diff ad" to the input script for better
-performance.  Cutoff should be roughly the neighbor list cutoff.  By
-default the binsize is half the neighbor list cutoff.  :l
-"-pk intel 0 omp 2 -sf intel" added to LAMMPS command-line :l
+Set the environment variable KMP_BLOCKTIME=0 :l
+"-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l
+$t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
+For some of the simple 2-body potentials without long-range
+electrostatics, performance and scalability can be better with
+the "newton off" setting added to the input script :l
+If using {kspace_style pppm} in the input script, add 
+"kspace_modify diff ad" for better performance :l
 :ule
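+
+Putting the above together, a run on a single Xeon node might be
+launched as follows (the MPI launcher, executable name, and 36-core
+count are illustrative; substitute the values for your build and
+hardware):
+
+export KMP_BLOCKTIME=0
+mpirun -np 36 ./lmp_intel_cpu_intelmpi -pk intel 0 omp 2 -sf intel -in in.script :pre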
 
-For Intel Xeon Phi CPUs for simulations without {kspace_style
-pppm} in the input script :
+For Intel Xeon Phi CPUs:
 
-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-"-pk intel 0 omp 2 -sf intel" {or} "-pk intel 0 omp 4 -sf intel"
-should be added to the LAMMPS command-line. Choice for best
-performance will depend on the simulation. :l
+Runs should be performed using MCDRAM. :ulb,l
 :ule
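+
+For example, on a Xeon Phi node booted in flat memory mode, MCDRAM
+is typically exposed as a separate NUMA node and can be requested
+with numactl (the node number, core count, and executable name are
+assumptions; check "numactl -H" on your system):
+
+mpirun -np 68 numactl -p 1 ./lmp_knl -pk intel 0 omp 4 -sf intel -in in.script :pre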
 
-For Intel Xeon Phi CPUs for simulations with {kspace_style
-pppm} in the input script:
-
-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-Add "neigh_modify binsize 3" to the input script for better
-performance. :l
-Add "kspace_modify diff ad" to the input script for better
-performance. :l
-export KMP_AFFINITY=none :l
-"-pk intel 0 omp 3 lrt yes -sf intel" or "-pk intel 0 omp 1 lrt yes
--sf intel" added to LAMMPS command-line. Choice for best performance
-will depend on the simulation. :l
+For simulations using {kspace_style pppm} on Intel CPUs 
+supporting AVX-512:
+
+Add "kspace_modify diff ad" to the input script :ulb,l
+The command-line option should be changed to 
+"-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of 
+threads minus 1. :l
+Do not use thread affinity (set KMP_AFFINITY=none) :l
+The "newton off" setting may provide better scalability :l
 :ule
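+
+A minimal sketch of an LRT run on a 36-core node with 2 hardware
+threads per core (so $r = 1, reserving one thread per core for the
+long-range thread; all counts are illustrative):
+
+export KMP_AFFINITY=none
+mpirun -np 36 ./lmp_intel_cpu_intelmpi -pk intel 0 omp 1 lrt yes -sf intel -in in.script :pre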
 
 For Intel Xeon Phi coprocessors (Offload):
@@ -169,6 +195,10 @@ cat /proc/cpuinfo :pre
 
 [Building LAMMPS with the USER-INTEL package:]
 
+NOTE: See the src/USER-INTEL/README file for additional flags that
+might be needed for best performance on Intel server processors
+code-named "Skylake".
+
 The USER-INTEL package must be installed into the source directory:
 
 make yes-user-intel :pre
@@ -322,8 +352,8 @@ follow in the input script.
 
 NOTE: The USER-INTEL package will perform better with modifications
 to the input script when "PPPM"_kspace_style.html is used:
-"kspace_modify diff ad"_kspace_modify.html and "neigh_modify binsize
-3"_neigh_modify.html should be added to the input script.
+"kspace_modify diff ad"_kspace_modify.html should be added to the 
+input script.
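+
+For example, the relevant input script lines might be (the accuracy
+value is illustrative):
+
+kspace_style pppm 1e-4
+kspace_modify diff ad :pre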
 
 Long-Range Thread (LRT) mode is an option to the "package
 intel"_package.html command that can improve performance when using
@@ -342,6 +372,10 @@ would normally perform best with "-pk intel 0 omp 4", instead use
 environment variable "KMP_AFFINITY=none". LRT mode is not supported
 when using offload.
 
+NOTE: Changing the "newton"_newton.html setting to off can improve
+performance and/or scalability for simple 2-body potentials such as
+lj/cut or when using LRT mode on processors supporting AVX-512.
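+
+For example, near the top of the input script:
+
+newton off :pre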
+
 Not all styles are supported in the USER-INTEL package. You can mix
 the USER-INTEL package with styles from the "OPT"_accelerate_opt.html
 package or the "USER-OMP package"_accelerate_omp.html. Of course,
@@ -467,7 +501,7 @@ supported.
 
 Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakker, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l
 
-Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency. 2016 International Conference for High Performance Computing. In press. :l
+Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l
 
 Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. :l
 :ule
diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt
index 94c6ee84fd..7382e6024d 100644
--- a/doc/src/fix_neb.txt
+++ b/doc/src/fix_neb.txt
@@ -14,152 +14,178 @@ fix ID group-ID neb Kspring keyword value :pre
 
 ID, group-ID are documented in "fix"_fix.html command :ulb,l
 neb = style name of this fix command :l
-Kspring = parallel spring constant (force/distance units or force units) :l
+Kspring = parallel spring constant (force/distance units or force units, see nudge keyword) :l
 zero or more keyword/value pairs may be appended :l
-keyword = {nudg_style} or {perp} or {freend} or {freend_k_spring} :l
-  {nudg_style} value = {neigh} or {idealpos}
-    {neigh} = the parallel nudging force is calculated from the distances to neighbouring replicas (in this case, Kspring is in force/distance units)
-    {idealpos} = the parallel nudging force is proportional to the distance between the replica and its interpolated ideal position (in this case Kspring is in force units)
-  {perp} value {none} or kspring2
-    {none} = no perpendicular spring force is applied
-    {kspring2} = spring constant for the perpendicular nudging force (in force/distance units)
-  {freeend} value = {none} or {ini} or {final} or {finaleini} or {final2eini}
-    {none} = no nudging force is applied to the first and last replicas
-    {ini} = set the first replica to be a free end 
-    {final} = set the last replica to be a free end
-    {finaleini} = set the last replica to be a free end and set its target energy as that of the first replica
-    {final2eini} = same as {finaleini} plus prevent intermediate replicas to have a lower energy than the first replica
-  {freeend_kspring}  value = kspring3
-    kspring3 = spring constant of the perpendicular spring force (per distance units)    
-   :pre
+keyword = {nudge} or {perp} or {end} :l
+  {nudge} value = {neigh} or {ideal}
+    {neigh} = parallel nudging force based on distance to neighbor replicas (Kspring = force/distance units)
+    {ideal} = parallel nudging force based on interpolated ideal position (Kspring = force units)
+  {perp} value = {Kspring2}
+    {Kspring2} = spring constant for perpendicular nudging force (force/distance units)
+  {end} values = estyle Kspring3
+    {estyle} = {first} or {last} or {last/efirst} or {last/efirst/middle}
+      {first} = apply force to first replica
+      {last} = apply force to last replica
+      {last/efirst} = apply force to last replica and set its target energy to that of first replica
+      {last/efirst/middle} = same as {last/efirst} plus prevent middle replicas having lower energy than first replica
+    {Kspring3} = spring constant for target energy term (1/distance units) :pre
 
 [Examples:]
 
 fix 1 active neb 10.0
-fix 2 all neb 1.0 perp 1.0 freeend final
-fix 1 all neb 1.0 nudg_style idealpos freeend final2eini freend_kspring 1:pre
+fix 2 all neb 1.0 perp 1.0 end last
+fix 2 all neb 1.0 perp 1.0 end first end last
+fix 1 all neb 1.0 nudge ideal end last/efirst 1 :pre
 
 [Description:]
 
-Add a nudging force to atoms in the group for a multi-replica
+Add nudging forces to atoms in the group for a multi-replica
 simulation run via the "neb"_neb.html command to perform a nudged
 elastic band (NEB) calculation for finding the transition state.
 Hi-level explanations of NEB are given with the "neb"_neb.html command
 and in "Section_howto 5"_Section_howto.html#howto_5 of the manual.
 The fix neb command must be used with the "neb" command and defines
-how nudging inter-replica forces are computed.  A NEB calculation is
+how inter-replica nudging forces are computed.  A NEB calculation is
 divided in two stages. In the first stage n replicas are relaxed
-toward a MEP and in a second stage, the climbing image scheme (see
-"(Henkelman2)"_#Henkelman2) is turned on so that the replica having
-the highest energy relaxes toward the saddle point (i.e. the point of
-highest energy along the MEP).
-
-One purpose of the nudging forces is to keep the replicas equally
-spaced.  During the NEB, the 3N-length vector of interatomic force Fi
-= -Grad(V) of replicas i is altered. For all intermediate replicas
-(i.e. for 1<i<n) but the climbing replica the force vector
-becomes:
-
-Fi = -Grad(V) + (Grad(V) dot That) That + Fnudgparallel + Fspringperp :pre
-
-That is the unit "tangent" vector for replica i and is a function of
-Ri, Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
-roughly in the direction of (Ri+i - Ri-1) (see the
-"(Henkelman1)"_#Henkelman1 paper for details).  Ri are the atomic
-coordinates of replica i; Ri-1 and Ri+1 are the coordinates of its
-neighbor replicas.  The term (Grad(V) dot That) is used to remove the
+toward a MEP until convergence.  In the second stage, the climbing
+image scheme (see "(Henkelman2)"_#Henkelman2) is enabled, so that the
+replica having the highest energy relaxes toward the saddle point
+(i.e. the point of highest energy along the MEP), and a second
+relaxation is performed.
+
+A key purpose of the nudging forces is to keep the replicas equally
+spaced.  During the NEB calculation, the 3N-length vector of
+interatomic force Fi = -Grad(V) for each replica I is altered.  For
+all intermediate replicas (i.e. for 1 < I < N, except the climbing
+replica) the force vector becomes:
+
+Fi = -Grad(V) + (Grad(V) dot T') T' + Fnudge_parallel + Fspring_perp :pre
+
+T' is the unit "tangent" vector for replica I and is a function of Ri,
+Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
+roughly in the direction of (Ri+1 - Ri-1); see the
+"(Henkelman1)"_#Henkelman1 paper for details.  Ri are the atomic
+coordinates of replica I; Ri-1 and Ri+1 are the coordinates of its
+neighbor replicas.  The term (Grad(V) dot T') is used to remove the
 component of the gradient parallel to the path which would tend to
-distribute the replica unevenly along the path.  Fnudgparallel is an
-artificial nudging force which is applied only in the tangent direction
-and which maintains the replicas equally spaced (see below for more
-information).  Fspringperp is an optinal artificial spring which is
-applied only perpendicular to the tangent and which prevent the paths
-from forming too acute kinks (see below for more information).
-
-The keyword {nudg_style} allow to specify how to parallel
-nudging force is computed. With a value of idealpos, the spring 
-force is computed as suggested in "(E)"_#E :
-   
-Fnudgparallel=-{Kspring}* (RD-RDideal)/(2 meanDist) :pre
-
-where RD is the "reaction coordinate" see "neb"_neb.html section, and
-RDideal is the ideal RD for which all the images are equally spaced
-(i.e. RDideal = (i-1)*meanDist when the climbing image is off, where i
-is the replica number). The meanDist is the average distance between
-replicas.
+distribute the replica unevenly along the path.  Fnudge_parallel is an
+artificial nudging force which is applied only in the tangent
+direction and which maintains the equal spacing between replicas (see
+below for more information).  Fspring_perp is an optional artificial
+spring which is applied only perpendicular to the tangent and which
+prevents the paths from forming acute kinks (see below for more
+information).
 
-When {nudg_style} has a value of neigh (or by default), the parallel 
-nudging force is computed as in "(Henkelman1)"_#Henkelman1 by 
-connecting each intermediate replica with the previous and the next 
-image:
+In the second stage of the NEB calculation, the interatomic force Fi
+for the climbing replica (the replica of highest energy after the
+first stage) is changed to:
 
-Fnudgparallel= {Kspring}* (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
+Fi = -Grad(V) + 2 (Grad(V) dot T') T' :pre
 
-The parallel nudging force associated with the key word idealpos should
-usually be more efficient at keeping the images equally spaced.
+and the relaxation procedure is continued to a new converged MEP.
 
 :line
 
-The keyword {perp} allows to add a spring force perpendicular to the
-path in order to prevent the path from becoming too kinky. It can
-improve significantly the convergence of the NEB when the resolution
-is poor (i.e. when too few images are used) (see "(Maras)"_#Maras1).
-The perpendicular spring force is given by
+The keyword {nudge} specifies how the parallel nudging force is
+computed.  With a value of {neigh}, the parallel nudging force is
+computed as in "(Henkelman1)"_#Henkelman1 by connecting each
+intermediate replica with the previous and the next image:
+
+Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
+
+Note that in this case the specified {Kspring} is in force/distance
+units.
+
+With a value of {ideal}, the spring force is computed as suggested in
+"(WeinenE)"_#WeinenE :
+   
+Fnudge_parallel = -{Kspring} * (RD-RDideal) / (2 * meanDist) :pre
 
-Fspringperp = {Kspringperp} * f(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
+where RD is the "reaction coordinate" (see the "neb"_neb.html doc
+page), and RDideal is the ideal RD for which all the replicas are
+equally spaced, i.e. RDideal = (I-1)*meanDist when the climbing
+replica is off, where I is the replica number.  The meanDist is the
+average distance between replicas.  Note that in this case the
+specified {Kspring} is in force units.
 
-f(Ri-1 Ri R+1) is a smooth scalar function of the angle Ri-1 Ri
-Ri+1. It is equal to 0 when the path is straight and is equal to 1
-when the angle Ri-1 Ri Ri+1 is accute. f(Ri-1 Ri R+1) is defined in
-"(Jonsson)"_#Jonsson
+Note that the {ideal} form of nudging can often be more effective at
+keeping the replicas equally spaced.
 
 :line
 
-By default, the force acting on the first and last replicas is not
-altered so that during the NEB relaxation, these ending replicas relax
-toward local minima. However it is possible to use the key word
-{freeend} to allow either the initial or the final replica to relax
-toward a MEP while constraining its energy.  The interatomic force Fi
-for the free end image becomes :
+The keyword {perp} adds a spring force perpendicular to the path in
+order to prevent the path from becoming too kinky.  It can
+significantly improve the convergence of the NEB calculation when the
+resolution is poor, i.e. when too few replicas are used; see
+"(Maras)"_#Maras1 for details.
 
-Fi = -Grad(V)+ (Grad(V) dot That + (E-ETarget)*kspring3) That,  {when} Grad(V) dot That < 0
-Fi = -Grad(V)+ (Grad(V) dot That + (ETarget- E)*kspring3) That, {when} Grad(V) dot That > 0
-:pre
+The perpendicular spring force is given by
 
-where E is the energy of the free end replica and ETarget is the
-target energy.
-
-When the value {ini} ({final}) is used after the keyword {freeend},
-the first (last) replica is considered as a free end. The target
-energy is set to the energy of the replica at starting of the NEB
-calculation. When the value {finaleini} or {final2eini} is used the
-last image is considered as a free end and the target energy is equal
-to the energy of the first replica (which can evolve during the NEB
-relaxation).  With the value {finaleini}, when the initial path is too
-far from the MEP, an intermediate repilica might relax "faster" and
-get a lower energy than the last replica. The benefit of the free end
-is then lost since this intermediate replica will relax toward a local
-minima. This behavior can be prevented by using the value {final2eini}
-which remove entirely the contribution of the gradient for all
-intermediate replica which have a lower energy than the initial one
-thus preventing these replicae to over-relax.  After converging a NEB
-with the {final2eini} value it is recommended to check that all
-intermediate replica have a larger energy than the initial
-replica. Finally note that if the last replica converges toward a
-local minimum with a larger energy than the energy of the first
-replica, a free end neb calculation with the value {finaleini} or
-{final2eini} cannot reach the convergence criteria.
+Fspring_perp = {Kspring2} * F(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
 
-:line
+where {Kspring2} is the specified value.  F(Ri-1,Ri,Ri+1) is a smooth
+scalar function of the angle Ri-1 Ri Ri+1.  It is equal to 0.0 when
+the path is straight and is equal to 1.0 when the angle Ri-1 Ri Ri+1
+is acute.  F(Ri-1,Ri,Ri+1) is defined in "(Jonsson)"_#Jonsson.
 
+If {Kspring2} is set to 0.0 (the default) then no perpendicular spring
+force is added.
 
+:line
 
-In the second stage of the NEB, the interatomic force Fi for the
-climbing replica (which is the replica of highest energy) becomes:
+By default, no forces act on the first and last replicas during the
+NEB relaxation, so these replicas simply relax toward their respective
+local minima.  By using the keyword {end}, additional forces can be
+applied to the first or last replica, to enable them to relax toward a
+MEP while constraining their energy.
 
-Fi = -Grad(V) + 2 (Grad(V) dot That) That :pre
+The interatomic force Fi for the specified replica becomes:
 
+Fi = -Grad(V) + (Grad(V) dot T' + (E-ETarget)*Kspring3) T',  {when} Grad(V) dot T' < 0
+Fi = -Grad(V) + (Grad(V) dot T' + (ETarget - E)*Kspring3) T', {when} Grad(V) dot T' > 0
+:pre
 
+where E is the current energy of the replica and ETarget is the target
+energy.  The "spring" constant on the difference in energies is the
+specified {Kspring3} value.
+
+When {estyle} is specified as {first}, the force is applied to the
+first replica.  When {estyle} is specified as {last}, the force is
+applied to the last replica.  Note that the {end} keyword can be used
+twice to add forces to both the first and last replicas.
+
+For both these {estyle} settings, the target energy {ETarget} is set
+to the initial energy of the replica (at the start of the NEB
+calculation).
+
+If the {estyle} is specified as {last/efirst} or {last/efirst/middle},
+force is applied to the last replica, but the target energy {ETarget}
+is continuously set to the energy of the first replica, as it evolves
+during the NEB relaxation.
+
+The difference between these two {estyle} options is as follows.  When
+{estyle} is specified as {last/efirst}, no change is made to the
+inter-replica force applied to the intermediate replicas (neither
+first nor last).  If the initial path is too far from the MEP, an
+intermediate replica may relax "faster" and reach a lower energy than
+the last replica.  In this case the intermediate replica will be
+relaxing toward its own local minimum.  This behavior can be prevented
+by specifying {estyle} as {last/efirst/middle}, which alters the
+inter-replica force applied to intermediate replicas by removing the
+contribution of the gradient to the inter-replica force.  This is only
+done for intermediate replicas with a lower energy than the first
+replica, which effectively prevents these replicas from over-relaxing.
+
+After converging a NEB calculation using an {estyle} of {last/efirst},
+you should check that all intermediate replicas have a larger energy
+than the first replica.  If not, then repeat the calculation with an
+{estyle} of {last/efirst/middle}.
+
+Finally, note that if the last replica converges toward a local
+minimum which has a larger energy than the energy of the first
+replica, a NEB calculation using an {estyle} of {last/efirst} or
+{last/efirst/middle} cannot reach final convergence.
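+
+For example, a hypothetical command that applies end forces to both
+the first and last replicas, each with a {Kspring3} value of 1.0,
+would be:
+
+fix 2 nebatoms neb 1.0 end first 1.0 end last 1.0 :pre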
 
 [Restart, fix_modify, output, run start/stop, minimize info:]
 
@@ -186,7 +212,8 @@ for more info on packages.
 
 [Default:]
 
-The option defaults are nudg_style = neigh, perp = none, freeend = none and freend_kspring = 1.
+The option defaults are nudge = neigh, perp = 0.0, and end is not
+specified (no inter-replica force on the end replicas).
 
 :line
 
@@ -197,14 +224,14 @@ The option defaults are nudg_style = neigh, perp = none, freeend = none and free
 [(Henkelman2)] Henkelman, Uberuaga, Jonsson, J Chem Phys, 113,
 9901-9904 (2000).
 
-:link(E)
-[(E)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002)
+:link(WeinenE)
+[(WeinenE)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002).
 
 :link(Jonsson)
 [(Jonsson)] Jonsson, Mills and Jacobsen, in Classical and Quantum
-Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, and Coker
-World Scientific, Singapore, 1998, p. 385
+Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti,
+and Coker, World Scientific, Singapore, 1998, p 385.
 
 :link(Maras1)
 [(Maras)] Maras, Trushin, Stukowski, Ala-Nissila, Jonsson,
-Comp Phys Comm, 205, 13-21 (2016)
+Comp Phys Comm, 205, 13-21 (2016).
diff --git a/doc/src/kspace_modify.txt b/doc/src/kspace_modify.txt
index b488df9627..66091f4973 100644
--- a/doc/src/kspace_modify.txt
+++ b/doc/src/kspace_modify.txt
@@ -308,7 +308,8 @@ The option defaults are mesh = mesh/disp = 0 0 0, order = order/disp =
 gewald = gewald/disp = 0.0, slab = 1.0, compute = yes, cutoff/adjust =
 yes (MSM), pressure/scalar = yes (MSM), fftbench = yes (PPPM), diff = ik
 (PPPM), mix/disp = pair, force/disp/real = -1.0, force/disp/kspace = -1.0,
-split = 0, tol = 1.0e-6, and disp/auto = no.
+split = 0, tol = 1.0e-6, and disp/auto = no. For pppm/intel, order =
+order/disp = 7.
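+
+As with the other PPPM variants, this default can be overridden with
+the {order} keyword; for example, a hypothetical setting of a smaller
+interpolation order:
+
+kspace_modify order 5 :pre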
 
 :line
 
diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt
index 371540bd68..4f27c9aa78 100644
--- a/doc/src/kspace_style.txt
+++ b/doc/src/kspace_style.txt
@@ -33,12 +33,16 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
     accuracy = desired relative error in forces
   {pppm/gpu} value = accuracy
     accuracy = desired relative error in forces
+  {pppm/intel} value = accuracy
+    accuracy = desired relative error in forces
   {pppm/kk} value = accuracy
     accuracy = desired relative error in forces
   {pppm/omp} value = accuracy
     accuracy = desired relative error in forces
   {pppm/cg/omp} value = accuracy
     accuracy = desired relative error in forces
+  {pppm/disp/intel} value = accuracy
+    accuracy = desired relative error in forces
   {pppm/tip4p/omp} value = accuracy
     accuracy = desired relative error in forces
   {pppm/stagger} value = accuracy
diff --git a/doc/src/pair_lj_long.txt b/doc/src/pair_lj_long.txt
index d559871f9d..da9f37b9c3 100644
--- a/doc/src/pair_lj_long.txt
+++ b/doc/src/pair_lj_long.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style lj/long/coul/long command :h3
+pair_style lj/long/coul/long/intel command :h3
 pair_style lj/long/coul/long/omp command :h3
 pair_style lj/long/coul/long/opt command :h3
 pair_style lj/long/tip4p/long command :h3
diff --git a/examples/neb/in.neb.hop1 b/examples/neb/in.neb.hop1
index b874d1ba32..9b5dcb07ec 100644
--- a/examples/neb/in.neb.hop1
+++ b/examples/neb/in.neb.hop1
@@ -51,7 +51,7 @@ set		group nebatoms type 3
 group		nonneb subtract all nebatoms
 
 fix		1 lower setforce 0.0 0.0 0.0
-fix		2 nebatoms neb 1.0 nudg_style idealpos
+fix		2 nebatoms neb 1.0 #nudge ideal
 fix		3 all enforce2d
 
 thermo		100
diff --git a/examples/neb/in.neb.hop1freeend b/examples/neb/in.neb.hop1.end
similarity index 91%
rename from examples/neb/in.neb.hop1freeend
rename to examples/neb/in.neb.hop1.end
index fa90e9a98c..2f4ba526d8 100644
--- a/examples/neb/in.neb.hop1freeend
+++ b/examples/neb/in.neb.hop1.end
@@ -15,7 +15,7 @@ variable	u uloop 20
 lattice		hex 0.9
 region		box block 0 20 0 10 -0.25 0.25
 
-read_data        initial.hop1freeend
+read_data        initial.hop1.end
 
 # LJ potentials
 
@@ -41,7 +41,7 @@ set		group nebatoms type 3
 group		nonneb subtract all nebatoms
 
 fix		1 lower setforce 0.0 0.0 0.0
-fix		2 nebatoms neb 1.0 nudg_style idealpos freeend ini
+fix		2 nebatoms neb 1.0 nudge ideal end first 1.0
 fix		3 all enforce2d
 
 thermo		100
diff --git a/examples/neb/initial.hop1freeend b/examples/neb/initial.hop1.end
similarity index 100%
rename from examples/neb/initial.hop1freeend
rename to examples/neb/initial.hop1.end
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index 2cb37ed9fe..ac8279949a 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -8,7 +8,7 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
index b7f3cd6846..db5de83a06 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
@@ -8,7 +8,7 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 MIC_OPT =       -qoffload-arch=mic-avx512 -fp-model fast=2
-CCFLAGS =	-g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
+CCFLAGS =	-O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
                 -xHost -fno-alias -ansi-alias -restrict \
                 -qoverride-limits $(MIC_OPT)
 SHFLAGS =	-fPIC
diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl
index 3bc777592e..881c51f0e4 100644
--- a/src/MAKE/OPTIONS/Makefile.knl
+++ b/src/MAKE/OPTIONS/Makefile.knl
@@ -8,7 +8,7 @@ SHELL = /bin/sh
 
 CC =		mpiicpc
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
+CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
                 -fno-alias -ansi-alias -restrict $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
diff --git a/src/REPLICA/fix_neb.cpp b/src/REPLICA/fix_neb.cpp
index b17315ca0d..297c101234 100644
--- a/src/REPLICA/fix_neb.cpp
+++ b/src/REPLICA/fix_neb.cpp
@@ -34,6 +34,9 @@ using namespace FixConst;
 using namespace MathConst;
 
 enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC};
+
+#define BUFSIZE 8
+
 /* ---------------------------------------------------------------------- */
 
 FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
@@ -45,56 +48,62 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
   tagsendall(NULL), tagrecvall(NULL), counts(NULL),
   displacements(NULL)
 {
+  if (narg < 4) error->all(FLERR,"Illegal fix neb command");
 
-  NEBLongRange=false;
-  StandardNEB=true;
-  PerpSpring=FreeEndIni=FreeEndFinal=false;
-  FreeEndFinalWithRespToEIni=FinalAndInterWithRespToEIni=false;
+  kspring = force->numeric(FLERR,arg[3]);
+  if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command");
 
-  kspringPerp=0.0;
-  kspring2=1.0;
-  if (narg < 4)
-    error->all(FLERR,"Illegal fix neb command, argument missing");
+  // optional params
 
-  kspring = force->numeric(FLERR,arg[3]);
-  if (kspring <= 0.0)
-    error->all(FLERR,"Illegal fix neb command."
-               " The spring force was not provided properly");
+  NEBLongRange = false;
+  StandardNEB = true;
+  PerpSpring = FreeEndIni = FreeEndFinal = false;
+  FreeEndFinalWithRespToEIni = FinalAndInterWithRespToEIni = false;
+  kspringPerp = 0.0;
+  kspring2 = 1.0;
 
-  int iarg =4;
+  int iarg = 4;
   while (iarg < narg) {
-    if (strcmp (arg[iarg],"nudg_style")==0) {
-          if (strcmp (arg[iarg+1],"idealpos")==0) {
-	    NEBLongRange = true;
-	    iarg+=2;}
-	  else if (strcmp (arg[iarg+1],"neigh")==0) {
-	    NEBLongRange = false;
-	    StandardNEB = true;
-	    iarg+=2;}
-	  else error->all(FLERR,"Illegal fix neb command. Unknown keyword");}
-    else if (strcmp (arg[iarg],"perp")==0) {
-      PerpSpring=true;
+    if (strcmp(arg[iarg],"nudge") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
+      if (strcmp(arg[iarg+1],"ideal") == 0) {
+        NEBLongRange = true;
+        StandardNEB = false;
+      } else if (strcmp(arg[iarg+1],"neigh") == 0) {
+        NEBLongRange = false;
+        StandardNEB = true;
+      } else error->all(FLERR,"Illegal fix neb command");
+      iarg += 2;
+
+    } else if (strcmp(arg[iarg],"perp") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command");
+      PerpSpring = true;
       kspringPerp = force->numeric(FLERR,arg[iarg+1]);
-      if (kspringPerp < 0.0)
-        error->all(FLERR,"Illegal fix neb command. "
-                   "The perpendicular spring force was not provided properly");
-      iarg+=2;} 
-    else if (strcmp (arg[iarg],"freeend")==0) {
-      if (strcmp (arg[iarg+1],"ini")==0)
-        FreeEndIni=true;
-      else if (strcmp (arg[iarg+1],"final")==0)
-        FreeEndFinal=true;
-      else if (strcmp (arg[iarg+1],"finaleini")==0)
-        FreeEndFinalWithRespToEIni=true;
-      else if (strcmp (arg[iarg+1],"final2eini")==0) {
-        FinalAndInterWithRespToEIni=true;
-        FreeEndFinalWithRespToEIni=true;}
-      else if (strcmp (arg[iarg+1],"none")!=0) error->all(FLERR,"Illegal fix neb command. Unknown keyword");
-      iarg+=2;} 
-    else if (strcmp (arg[iarg],"freeend_kspring")==0) {
-      kspring2=force->numeric(FLERR,arg[iarg+1]);
-      iarg+=2; }
-    else error->all(FLERR,"Illegal fix neb command. Unknown keyword");
+      if (kspringPerp == 0.0) PerpSpring = false;
+      if (kspringPerp < 0.0) error->all(FLERR,"Illegal fix neb command");
+      iarg += 2;
+
+    } else if (strcmp (arg[iarg],"end") == 0) {
+      if (iarg+3 > narg) error->all(FLERR,"Illegal fix neb command");
+      if (strcmp(arg[iarg+1],"first") == 0) {
+        FreeEndIni = true;
+      } else if (strcmp(arg[iarg+1],"last") == 0) {
+        FreeEndFinal = true;
+        FinalAndInterWithRespToEIni = false;
+        FreeEndFinalWithRespToEIni = false;
+      } else if (strcmp(arg[iarg+1],"last/efirst") == 0) {
+        FreeEndFinal = false;
+        FinalAndInterWithRespToEIni = false;
+        FreeEndFinalWithRespToEIni = true;
+      } else if (strcmp(arg[iarg+1],"last/efirst/middle") == 0) {
+        FreeEndFinal = false;
+        FinalAndInterWithRespToEIni = true;
+        FreeEndFinalWithRespToEIni = true;
+      } else error->all(FLERR,"Illegal fix neb command");
+      kspring2 = force->numeric(FLERR,arg[iarg+2]);
+      iarg += 3;
+    
+    } else error->all(FLERR,"Illegal fix neb command");
   }
 
   // nreplica = number of partitions
@@ -119,12 +128,12 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) :
   MPI_Group uworldgroup,rootgroup;
   if (NEBLongRange) {
     for (int i=0; i<nreplica; i++)
-      iroots[i]=universe->root_proc[i];
+      iroots[i] = universe->root_proc[i];
     MPI_Comm_group(uworld, &uworldgroup);
     MPI_Group_incl(uworldgroup, nreplica, iroots, &rootgroup);
     MPI_Comm_create(uworld, rootgroup, &rootworld);
   }
-  delete[] iroots;
+  delete [] iroots;
 
   // create a new compute pe style
   // id = fix-ID + pe, compute group = all
@@ -256,11 +265,11 @@ void FixNEB::min_post_force(int vflag)
   double delxp,delyp,delzp,delxn,delyn,delzn;
   double vIni=0.0;
 
-  vprev=vnext=veng=pe->compute_scalar();
+  vprev = vnext = veng = pe->compute_scalar();
 
-  if (ireplica < nreplica-1 && me ==0)
+  if (ireplica < nreplica-1 && me == 0)
     MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld);
-  if (ireplica > 0 && me ==0)
+  if (ireplica > 0 && me == 0)
     MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE);
 
   if (ireplica > 0 && me == 0)
@@ -297,6 +306,7 @@ void FixNEB::min_post_force(int vflag)
   }
 
   // communicate atoms to/from adjacent replicas to fill xprev,xnext
+
   inter_replica_comm();
 
   // trigger potential energy computation on next timestep
@@ -335,10 +345,10 @@ void FixNEB::min_post_force(int vflag)
           tangent[i][0]=delxp;
           tangent[i][1]=delyp;
           tangent[i][2]=delzp;
-          tlen += tangent[i][0]*tangent[i][0]
-            + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
-          dot += f[i][0]*tangent[i][0]
-            + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
+          tlen += tangent[i][0]*tangent[i][0] +
+            tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+          dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + 
+            f[i][2]*tangent[i][2];
         }
       }
 
@@ -360,10 +370,10 @@ void FixNEB::min_post_force(int vflag)
           tangent[i][0]=delxn;
           tangent[i][1]=delyn;
           tangent[i][2]=delzn;
-          tlen += tangent[i][0]*tangent[i][0]
-            + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
-          dot += f[i][0]*tangent[i][0]
-            + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
+          tlen += tangent[i][0]*tangent[i][0] + 
+            tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+          dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + 
+            f[i][2]*tangent[i][2];
         }
       }
   } else {
@@ -388,13 +398,13 @@ void FixNEB::min_post_force(int vflag)
         domain->minimum_image(delxn,delyn,delzn);
 
         if (vnext > veng && veng > vprev) {
-          tangent[i][0]=delxn;
-          tangent[i][1]=delyn;
-          tangent[i][2]=delzn;
+          tangent[i][0] = delxn;
+          tangent[i][1] = delyn;
+          tangent[i][2] = delzn;
         } else if (vnext < veng && veng < vprev) {
-          tangent[i][0]=delxp;
-          tangent[i][1]=delyp;
-          tangent[i][2]=delzp;
+          tangent[i][0] = delxp;
+          tangent[i][1] = delyp;
+          tangent[i][2] = delzp;
         } else {
           if (vnext > vprev) {
             tangent[i][0] = vmax*delxn + vmin*delxp;
@@ -408,24 +418,23 @@ void FixNEB::min_post_force(int vflag)
         }
 
         nlen += delxn*delxn + delyn*delyn + delzn*delzn;
-        tlen += tangent[i][0]*tangent[i][0]
-          + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
+        tlen += tangent[i][0]*tangent[i][0] + 
+          tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2];
         gradlen += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2];
         dotpath += delxp*delxn + delyp*delyn + delzp*delzn;
-        dottangrad += tangent[i][0]* f[i][0]
-          + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
-        gradnextlen += fnext[i][0]*fnext[i][0]
-          + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
-        dotgrad += f[i][0]*fnext[i][0]
-          + f[i][1]*fnext[i][1] + f[i][2]*fnext[i][2];
-
-        springF[i][0]=kspringPerp*(delxn-delxp);
-        springF[i][1]=kspringPerp*(delyn-delyp);
-        springF[i][2]=kspringPerp*(delzn-delzp);
+        dottangrad += tangent[i][0]*f[i][0] + 
+          tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2];
+        gradnextlen += fnext[i][0]*fnext[i][0] + 
+          fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2];
+        dotgrad += f[i][0]*fnext[i][0] + f[i][1]*fnext[i][1] + 
+          f[i][2]*fnext[i][2];
+
+        springF[i][0] = kspringPerp*(delxn-delxp);
+        springF[i][1] = kspringPerp*(delyn-delyp);
+        springF[i][2] = kspringPerp*(delzn-delzp);
       }
   }
 
-#define BUFSIZE 8
   double bufin[BUFSIZE], bufout[BUFSIZE];
   bufin[0] = nlen;
   bufin[1] = plen;
@@ -459,7 +468,7 @@ void FixNEB::min_post_force(int vflag)
 
   // first or last replica has no change to forces, just return
 
-  if(ireplica>0 && ireplica<nreplica-1)
+  if (ireplica > 0 && ireplica < nreplica-1)
     dottangrad = dottangrad/(tlen*gradlen);
   if (ireplica == 0)
     dottangrad = dottangrad/(nlen*gradlen);
@@ -468,7 +477,6 @@ void FixNEB::min_post_force(int vflag)
   if (ireplica < nreplica-1)
     dotgrad = dotgrad /(gradlen*gradnextlen);
 
-
   if (FreeEndIni && ireplica == 0) {
     if (tlen > 0.0) {
       double dotall;
@@ -568,14 +576,15 @@ void FixNEB::min_post_force(int vflag)
 
   for (int i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit) {
-      dot += f[i][0]*tangent[i][0]
-        + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2];
-      dotSpringTangent += springF[i][0]*tangent[i][0]
-        +springF[i][1]*tangent[i][1]+springF[i][2]*tangent[i][2];}
+      dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + 
+        f[i][2]*tangent[i][2];
+      dotSpringTangent += springF[i][0]*tangent[i][0] +
+        springF[i][1]*tangent[i][1] + springF[i][2]*tangent[i][2];}
   }
 
   double dotSpringTangentall;
-  MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,
+                MPI_DOUBLE,MPI_SUM,world);
   dotSpringTangent=dotSpringTangentall;
   double dotall;
   MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world);
@@ -603,12 +612,12 @@ void FixNEB::min_post_force(int vflag)
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
-      f[i][0] += prefactor*tangent[i][0]
-        +AngularContr*(springF[i][0] -dotSpringTangent*tangent[i][0]);
-      f[i][1] += prefactor*tangent[i][1]
-        + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
-      f[i][2] += prefactor*tangent[i][2]
-        + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
+      f[i][0] += prefactor*tangent[i][0] + 
+        AngularContr*(springF[i][0] - dotSpringTangent*tangent[i][0]);
+      f[i][1] += prefactor*tangent[i][1] + 
+        AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]);
+      f[i][2] += prefactor*tangent[i][2] + 
+        AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]);
     }
 }
 
@@ -827,7 +836,6 @@ void FixNEB::inter_replica_comm()
   }
 }
 
-
 /* ----------------------------------------------------------------------
    reallocate xprev,xnext,tangent arrays if necessary
    reallocate communication arrays if necessary
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index e32a09c45c..c02014d0ce 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -4,6 +4,7 @@
                      --------------------------------
                      
              W. Michael Brown (Intel) michael.w.brown at intel.com
+                   William McDoniel (RWTH Aachen University)
                    Rodrigo Canales (RWTH Aachen University)
                   Markus H�hnerbach (RWTH Aachen University)
                            Stan Moore (Sandia)
@@ -14,15 +15,25 @@
 
 -----------------------------------------------------------------------------
 
-This package is based on the USER-OMP package and provides LAMMPS styles that:
+This package provides LAMMPS styles that:
 
    1. include support for single and mixed precision in addition to double.
    2. include modifications to support vectorization for key routines
+   3. include modifications for data layouts to improve cache efficiency
-   3. include modifications to support offload to Intel(R) Xeon Phi(TM) 
+   4. include modifications to support offload to Intel(R) Xeon Phi(TM) 
       coprocessors
 
 -----------------------------------------------------------------------------
 
+For Intel server processors codenamed "Skylake", the following flags should
+be added or changed in the Makefile depending on the version:
+
+2017 update 2         - No changes needed
+2017 updates 3 or 4   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
+2018 or newer         - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
+
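+For example (a sketch only; adapt the OPTFLAGS line in
+src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi), a build with a 2018 or
+newer compiler might use:
+
+OPTFLAGS = -xCORE-AVX512 -qopt-zmm-usage=high -O2 -fp-model fast=2 \
+           -no-prec-div -qoverride-limits
+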
+-----------------------------------------------------------------------------
+
 When using the suffix command with "intel", intel styles will be used if they
 exist. If the suffix command is used with "hybrid intel omp", the
 USER-OMP styles will be used whenever USER-INTEL styles are not available. This
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README
index cf14fb3237..758c37bf56 100644
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@@ -4,6 +4,7 @@
 # in.intel.lj -	        Atomic fluid (LJ Benchmark)
 # in.intel.rhodo -      Protein (Rhodopsin Benchmark)
 # in.intel.lc -	        Liquid Crystal w/ Gay-Berne potential
+# in.intel.eam -	Copper benchmark with Embedded Atom Method
 # in.intel.sw -	        Silicon benchmark with Stillinger-Weber
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
@@ -11,19 +12,26 @@
 #############################################################################
 
 #############################################################################
-# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016
+# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
+#  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
 #
 #                     Xeon E5-2697v4     Xeon Phi 7250
 #                    
-# in.intel.lj -          162.764             179.148
-# in.intel.rhodo -        11.633              13.668
-# in.intel.lc -	          19.136              24.863
-# in.intel.sw -	         139.048             152.026
-# in.intel.tersoff -      82.663              92.985
-# in.intel.water -        59.838              85.704
+# in.intel.lj -            199.5               282.3
+# in.intel.rhodo -          12.4                17.5
+# in.intel.lc -	            19.0                25.7
+# in.intel.eam -            59.4                92.8
+# in.intel.sw -	           132.4               161.9
+# in.intel.tersoff -        83.3               101.1
+# in.intel.water -          53.4                90.3
 #
 #############################################################################
 
+#############################################################################
+# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README
+# for build flags that should be used. 
+#############################################################################
+
 #############################################################################
 # For Haswell (Xeon v3) architectures, depending on the compiler version, 
 # it may give better performance to compile for an AVX target (with -xAVX 
@@ -42,7 +50,18 @@
 # -v m 0.5		# Run for half as long
 #############################################################################
 
-#	Example for running benchmarks:
+#############################################################################
+# The LAMMPS newton setting can be controlled from the commandline for the
+# benchmarks with the N variable:
+#
+# -v N on		# newton on
+# -v N off		# newton off
+#
+# The default is on for all of the benchmarks except for LJ where the off
+# setting performs best with the USER-INTEL package
+#############################################################################
+
+#	Example for running benchmarks (see run_benchmarks.sh for script):
 
 # 	Number of physical cores per node not including hyperthreads
 export LMP_CORES=28
@@ -57,26 +76,35 @@ export LMP_BIN=../../lmp_intel_cpu
 #      LAMMPS root directory
 export LMP_ROOT=../../../
                
-source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh
+source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
+export KMP_BLOCKTIME=0
 export I_MPI_PIN_DOMAIN=core
 export I_MPI_FABRICS=shm		# For single node
 
+# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS
+export I_MPI_SHM_LMT=shm
+
 #      Generate the restart file for use with liquid crystal benchmark
 mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none
 
 #      Benchmark to run
 export bench=in.intel.lj
 
+#############################################################################
+# For Intel Xeon Phi x200 series processors best performance is achieved by
+# using MCDRAM. In flat mode, this can be achieved with numactl,
+# MPI environment variables, or other options provided by batch schedulers
+#############################################################################
 
 #############################################################################
 # To run without an optimization package
 #############################################################################
-mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none
+mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on
 
 #############################################################################
 # To run with USER-OMP package
 #############################################################################
-mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp
+mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on
 
 #############################################################################
 # To run with USER-INTEL package and no coprocessor
@@ -89,6 +117,9 @@ mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel
 mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel
 
 #############################################################################
-# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors
+# If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series 
+#   or Skylake processors
 #############################################################################
-mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel
+export KMP_AFFINITY=none
+rthreads=$((OMP_NUM_THREADS-1))
+mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel
diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam
index e9523a5dd1..5a3b3064af 100644
--- a/src/USER-INTEL/TEST/in.intel.eam
+++ b/src/USER-INTEL/TEST/in.intel.eam
@@ -1,4 +1,6 @@
 # bulk Cu lattice
+
+variable        N index on      # Newton Setting
 variable	w index 10      # Warmup Timesteps
 variable	t index 3100    # Main Run Timesteps
 variable	m index 1       # Main Run Timestep Multiplier
@@ -13,6 +15,7 @@ variable	z index 2
 variable	rr equal floor($t*$m)
 variable	root getenv LMP_ROOT
 
+newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 variable	xx equal 20*$x
diff --git a/src/USER-INTEL/TEST/in.intel.lc b/src/USER-INTEL/TEST/in.intel.lc
index 0172ba3b4d..411f5d830d 100644
--- a/src/USER-INTEL/TEST/in.intel.lc
+++ b/src/USER-INTEL/TEST/in.intel.lc
@@ -3,6 +3,7 @@
 # shape: 2 1.5 1
 # cutoff 4.0 with skin 0.8
 
+variable        N index on      # Newton Setting
 variable        w index 10	# Warmup Timesteps
 variable        t index 840	# Main Run Timesteps
 variable        m index 1	# Main Run Timestep Multiplier
@@ -15,6 +16,7 @@ variable        z index 2
 
 variable        rr equal floor($t*$m)
 
+newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 units		lj
diff --git a/src/USER-INTEL/TEST/in.intel.lj b/src/USER-INTEL/TEST/in.intel.lj
index 8931ca24bc..2b724f6014 100644
--- a/src/USER-INTEL/TEST/in.intel.lj
+++ b/src/USER-INTEL/TEST/in.intel.lj
@@ -1,5 +1,6 @@
 # 3d Lennard-Jones melt
 
+variable        N index off     # Newton Setting
 variable	w index 10	# Warmup Timesteps
 variable	t index 7900	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
@@ -15,6 +16,7 @@ variable	yy equal 20*$y
 variable	zz equal 20*$z
 variable	rr equal floor($t*$m)
 
+newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 units		lj
diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo
index 7b3b092607..05145d79c0 100644
--- a/src/USER-INTEL/TEST/in.intel.rhodo
+++ b/src/USER-INTEL/TEST/in.intel.rhodo
@@ -1,5 +1,6 @@
 # Rhodopsin model
 
+variable        N index on      # Newton Setting
 variable	w index 10	# Warmup Timesteps
 variable	t index 520	# Main Run Timesteps
 variable	m index 1	# Main Run Timestep Multiplier
@@ -16,10 +17,11 @@ variable	z index 2
 variable	rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 
+newton          $N
 if "$n > 0"	then "processors * * * grid numa"
 
 units           real  
-neigh_modify    delay 5 every 1 binsize $b
+neigh_modify    delay 5 every 1
 
 atom_style      full  
 bond_style      harmonic 
diff --git a/src/USER-INTEL/TEST/in.intel.sw b/src/USER-INTEL/TEST/in.intel.sw
index 077c9bb4fb..494f58dea3 100644
--- a/src/USER-INTEL/TEST/in.intel.sw
+++ b/src/USER-INTEL/TEST/in.intel.sw
@@ -1,5 +1,6 @@
 # bulk Si via Stillinger-Weber
 
+variable        N index on      # Newton Setting
 variable        w index 10      # Warmup Timesteps
 variable        t index 6200	# Main Run Timesteps
 variable        m index 1       # Main Run Timestep Multiplier
@@ -16,6 +17,7 @@ variable	zz equal 10*$z
 variable        rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 
+newton          $N
 if "$n > 0"     then "processors * * * grid numa"
 
 units		metal
diff --git a/src/USER-INTEL/TEST/in.intel.tersoff b/src/USER-INTEL/TEST/in.intel.tersoff
index f0c6a88f75..574b29f674 100644
--- a/src/USER-INTEL/TEST/in.intel.tersoff
+++ b/src/USER-INTEL/TEST/in.intel.tersoff
@@ -1,5 +1,6 @@
 # bulk Si via Tersoff
 
+variable        N index on      # Newton Setting
 variable        w index 10      # Warmup Timesteps
 variable        t index 2420	# Main Run Timesteps
 variable        m index 1       # Main Run Timestep Multiplier
@@ -16,6 +17,7 @@ variable        zz equal 10*$z
 variable        rr equal floor($t*$m)
 variable        root getenv LMP_ROOT
 
+newton          $N
 if "$n > 0"     then "processors * * * grid numa"
 
 units		metal
diff --git a/src/USER-INTEL/TEST/in.intel.water b/src/USER-INTEL/TEST/in.intel.water
index 1c1fca311f..0643def19e 100644
--- a/src/USER-INTEL/TEST/in.intel.water
+++ b/src/USER-INTEL/TEST/in.intel.water
@@ -1,5 +1,6 @@
 # Coarse-grain water simulation using Stillinger-Weber
 
+variable        N index on      # Newton Setting
 variable        w index 10      # Warmup Timesteps
 variable        t index 2600	# Main Run Timesteps
 variable        m index 1       # Main Run Timestep Multiplier
@@ -11,6 +12,7 @@ variable	y index 2
 variable	z index 2
 variable        rr equal floor($t*$m)
 
+newton          $N
 if "$n > 0"     then "processors * * * grid numa"
 
 units		real
diff --git a/src/USER-INTEL/TEST/in.lc_generate_restart b/src/USER-INTEL/TEST/in.lc_generate_restart
index 8ae53c5c8e..30d593f2cd 100644
--- a/src/USER-INTEL/TEST/in.lc_generate_restart
+++ b/src/USER-INTEL/TEST/in.lc_generate_restart
@@ -4,13 +4,13 @@
 # cutoff 4.0 with skin 0.8
 # NPT, T=2.4, P=8.0
 
-variable	x index 1
-variable	y index 1
-variable	z index 1
+variable	xt index 1
+variable	yt index 1
+variable	zt index 1
 
-variable	i equal $x*32
-variable	j equal $y*32
-variable	k equal $z*32
+variable	i equal ${xt}*32
+variable	j equal ${yt}*32
+variable	k equal ${zt}*32
 
 units		lj
 atom_style	ellipsoid
diff --git a/src/USER-INTEL/TEST/run_benchmarks.sh b/src/USER-INTEL/TEST/run_benchmarks.sh
new file mode 100755
index 0000000000..10bd79e0d1
--- /dev/null
+++ b/src/USER-INTEL/TEST/run_benchmarks.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+#########################################################################
+# Adjust settings below for your system
+#########################################################################
+
+# --------------------- MPI Launch Command
+
+export MPI="mpirun"           
+#export MPI="numactl -p 1 mpirun"    # -- Systems w/ MCDRAM in flat mode
+
+# ------------- Name and location of the LAMMPS binary
+
+export LMP_BIN=../../lmp_intel_cpu_intelmpi
+#export LMP_BIN=../../lmp_knl
+
+# ------------- Directory containing the LAMMPS installation
+
+export LMP_ROOT=../../../
+
+# ------------- Number of physical cores (not HW threads)
+
+export LMP_CORES=36            # -- For Intel Xeon E5-2697v4 SKU
+#export LMP_CORES=68           # -- For Intel Xeon Phi x200 7250 SKU
+
+# ------------- Number of HW threads to use in tests
+
+export LMP_THREAD_LIST="2"     # -- For 2 threads per core w/ HT enabled
+#export LMP_THREAD_LIST="2 4"  # -- For Intel Xeon Phi x200 (4 per core)
+
+# ------------- MPI Tuning Parameters
+
+#export I_MPI_SHM_LMT=shm      # -- Uncomment for Xeon Phi x200 series
+
+# ------------- Library locations for build
+
+#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh
+
+#########################################################################
+# End settings for your system
+#########################################################################
+
+export WORKLOADS="lj rhodo rhodo_lrt lc sw water eam"
+export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1"
+export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1"
+
+export LOG_DIR_HEADER=`echo $LMP_BIN | sed 's/\.\.\///g' | sed 's/\.\///g'`
+export LOG_DIR_HOST=`hostname`
+export DATE_STRING=`date +%s`
+export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING
+mkdir $LOG_DIR
+
+export I_MPI_PIN_DOMAIN=core
+export I_MPI_FABRICS=shm
+export KMP_BLOCKTIME=0
+
+echo -n "Creating restart file...."
+$MPI -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none $LMP_ARGS
+echo "Done."
+for threads in $LMP_THREAD_LIST
+do
+  export OMP_NUM_THREADS=$threads
+  for workload in $WORKLOADS
+  do
+    export LOGFILE=$LOG_DIR/$workload.$LMP_CORES"c"$threads"t".log
+    echo "Running $LOGFILE"
+    cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $LMP_ARGS";
+    rthreads=$threads
+    unset KMP_AFFINITY
+    $cmd
+
+    # - For benchmarks with PPPM, also try LRT mode
+    if [ $workload = "rhodo" ]; then
+      export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log
+      cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS";
+      rthreads=$((threads-1))
+      export KMP_AFFINITY=none
+      export OMP_NUM_THREADS=$rthreads
+      echo "  $cmd" >> $LOG_DIR/commands.info
+      $cmd
+    fi
+  done
+done
+
+# Performance reported by LAMMPS (Timesteps/second ignoring warm-up run)
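+# (takes the second "Performance:" line from each log -- the main run,
+#  not the warm-up -- strips the units, and prints
+#  "<workload>.<cores>c<threads>t <timesteps/sec>" for each benchmark)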
+grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}'
diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp
index aafc765c6b..0c493646e3 100644
--- a/src/USER-INTEL/angle_charmm_intel.cpp
+++ b/src/USER-INTEL/angle_charmm_intel.cpp
@@ -81,16 +81,16 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -102,7 +102,7 @@ void AngleCharmmIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void AngleCharmmIntel::eval(const int vflag, 
 			    IntelBuffers<flt_t,acc_t> *buffers,
 			    const ForceConst<flt_t> &fc)
@@ -126,12 +126,9 @@ void AngleCharmmIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeangle = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oeangle = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -140,8 +137,12 @@ void AngleCharmmIntel::eval(const int vflag,
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -150,7 +151,17 @@ void AngleCharmmIntel::eval(const int vflag,
     const int4_t * _noalias const anglelist = 
       (int4_t *) neighbor->anglelist[0];
 
-    for (int n = nfrom; n < nto; n++) {
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seangle = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = anglelist[n].a;
       const int i2 = anglelist[n].b;
       const int i3 = anglelist[n].c;
@@ -229,40 +240,58 @@ void AngleCharmmIntel::eval(const int vflag,
 
       // apply force to each of 3 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-	f[i1].y += f1y;
-	f[i1].z += f1z;
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+	  f[i1].y += f1y;
+	  f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x -= f1x + f3x;
+	  f[i2].y -= f1y + f3y;
+	  f[i2].z -= f1z + f3z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+	  f[i3].y += f3y;
+	  f[i3].z += f3z;
+        }
       }
 
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x -= f1x + f3x;
-        f[i2].y -= f1y + f3y;
-        f[i2].z -= f1z + f3z;
-      }
-
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
-
-      if (EVFLAG) {
-	IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, 
-                              f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, 
-                              delx2, dely2, delz2, oeangle, f, NEWTON_BOND, 
-                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, 
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, 
+                              dely1, delz1, delx2, dely2, delz2, seangle, 
+                              f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, 
+                              sv4, sv5);
+	#else
+        IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, 
+                              i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, 
+                              dely1, delz1, delx2, dely2, delz2, oeangle, 
+                              f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, 
+                              ov4, ov5);
+        #endif
       }
     } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oeangle += seangle;
+    if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
+	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeangle;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
-    }
+  if (EFLAG) energy += oeangle;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp
index f101fd9e1f..198431d552 100644
--- a/src/USER-INTEL/angle_harmonic_intel.cpp
+++ b/src/USER-INTEL/angle_harmonic_intel.cpp
@@ -81,16 +81,16 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -102,7 +102,7 @@ void AngleHarmonicIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void AngleHarmonicIntel::eval(const int vflag, 
 			    IntelBuffers<flt_t,acc_t> *buffers,
 			    const ForceConst<flt_t> &fc)
@@ -126,12 +126,9 @@ void AngleHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeangle = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oeangle = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -140,8 +137,12 @@ void AngleHarmonicIntel::eval(const int vflag,
     reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -150,7 +151,17 @@ void AngleHarmonicIntel::eval(const int vflag,
     const int4_t * _noalias const anglelist = 
       (int4_t *) neighbor->anglelist[0];
 
-    for (int n = nfrom; n < nto; n++) {
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seangle = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = anglelist[n].a;
       const int i2 = anglelist[n].b;
       const int i3 = anglelist[n].c;
@@ -211,40 +222,58 @@ void AngleHarmonicIntel::eval(const int vflag,
 
       // apply force to each of 3 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-	f[i1].y += f1y;
-	f[i1].z += f1z;
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+	  f[i1].y += f1y;
+	  f[i1].z += f1z;
+        }
+
+	if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x -= f1x + f3x;
+	  f[i2].y -= f1y + f3y;
+	  f[i2].z -= f1z + f3z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+	  f[i3].y += f3y;
+	  f[i3].z += f3z;
+        }
       }
 
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x -= f1x + f3x;
-        f[i2].y -= f1y + f3y;
-        f[i2].z -= f1z + f3z;
-      }
-
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
-
-      if (EVFLAG) {
-	IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, 
-                              f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, 
-                              delx2, dely2, delz2, oeangle, f, NEWTON_BOND, 
-                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+	IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
+                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, 
+                              delz1, delx2, dely2, delz2, seangle, f, 
+                              NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, 
+                              sv5);
+        #else
+	IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3,
+                              f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, 
+                              delz1, delx2, dely2, delz2, oeangle, f, 
+                              NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, 
+                              ov5);
+        #endif
       }
     } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oeangle += seangle;
+    if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2; 
+	ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeangle;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
-    }
+  if (EFLAG) energy += oeangle;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp
index e61ab9be84..430142a72a 100644
--- a/src/USER-INTEL/bond_fene_intel.cpp
+++ b/src/USER-INTEL/bond_fene_intel.cpp
@@ -77,16 +77,16 @@ void BondFENEIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -96,10 +96,10 @@ void BondFENEIntel::compute(int eflag, int vflag,
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void BondFENEIntel::eval(const int vflag, 
-			     IntelBuffers<flt_t,acc_t> *buffers,
-			     const ForceConst<flt_t> &fc)
+			 IntelBuffers<flt_t,acc_t> *buffers,
+			 const ForceConst<flt_t> &fc)
 {
   const int inum = neighbor->nbondlist;
   if (inum == 0) return;
@@ -119,23 +119,23 @@ void BondFENEIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oebond = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oebond = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
-
   #if defined(_OPENMP)
   #pragma omp parallel default(none) \
     shared(f_start,f_stride,fc)		  \
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -144,7 +144,17 @@ void BondFENEIntel::eval(const int vflag,
     const int3_t * _noalias const bondlist = 
       (int3_t *) neighbor->bondlist[0];
 
-    for (int n = nfrom; n < nto; n++) {
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) sebond = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = bondlist[n].a;
       const int i2 = bondlist[n].b;
       const int type = bondlist[n].t;
@@ -199,33 +209,48 @@ void BondFENEIntel::eval(const int vflag,
 
       // apply force to each of 2 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += delx*fbond;
-        f[i1].y += dely*fbond;
-        f[i1].z += delz*fbond;
-      }
-
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x -= delx*fbond;
-        f[i2].y -= dely*fbond;
-        f[i2].z -= delz*fbond;
-      }
-
-      if (EVFLAG) {
-	IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, 
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += delx*fbond;
+	  f[i1].y += dely*fbond;
+	  f[i1].z += delz*fbond;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x -= delx*fbond;
+	  f[i2].y -= dely*fbond;
+	  f[i2].z -= delz*fbond;
+        }
+      } 
+
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+	IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, 
+                             delx, dely, delz, sebond, f, NEWTON_BOND, 
+                             nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
+	#else
+	IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, 
                              delx, dely, delz, oebond, f, NEWTON_BOND, 
                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+	#endif
       }
     } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oebond += sebond;
+    if (VFLAG && vflag) {
+       ov0 += sv0; ov1 += sv1; ov2 += sv2;
+       ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oebond;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
-    }
+  if (EFLAG) energy += oebond;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp
index 51a33b1cc3..1cccf5fe54 100644
--- a/src/USER-INTEL/bond_harmonic_intel.cpp
+++ b/src/USER-INTEL/bond_harmonic_intel.cpp
@@ -77,16 +77,16 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -96,7 +96,7 @@ void BondHarmonicIntel::compute(int eflag, int vflag,
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void BondHarmonicIntel::eval(const int vflag, 
 			     IntelBuffers<flt_t,acc_t> *buffers,
 			     const ForceConst<flt_t> &fc)
@@ -119,12 +119,9 @@ void BondHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oebond = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oebond = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -133,8 +130,12 @@ void BondHarmonicIntel::eval(const int vflag,
     reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -143,7 +144,17 @@ void BondHarmonicIntel::eval(const int vflag,
     const int3_t * _noalias const bondlist = 
       (int3_t *) neighbor->bondlist[0];
 
-    for (int n = nfrom; n < nto; n++) {
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) sebond = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = bondlist[n].a;
       const int i2 = bondlist[n].b;
       const int type = bondlist[n].t;
@@ -167,33 +178,50 @@ void BondHarmonicIntel::eval(const int vflag,
       if (EFLAG) ebond = rk*dr;
 
       // apply force to each of 2 atoms
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += delx*fbond;
-        f[i1].y += dely*fbond;
-        f[i1].z += delz*fbond;
-      }
-
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x -= delx*fbond;
-        f[i2].y -= dely*fbond;
-        f[i2].z -= delz*fbond;
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += delx*fbond;
+	  f[i1].y += dely*fbond;
+	  f[i1].z += delz*fbond;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x -= delx*fbond;
+	  f[i2].y -= dely*fbond;
+	  f[i2].z -= delz*fbond;
+        }
       }
 
-      if (EVFLAG) {
-	IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, 
-                             delx, dely, delz, oebond, f, NEWTON_BOND, 
-                             nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, 
+                             fbond, delx, dely, delz, sebond, f, 
+                             NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, 
+                             sv4, sv5);
+	#else
+        IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, 
+                             fbond, delx, dely, delz, oebond, f, 
+                             NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, 
+                             ov4, ov5);
+	#endif
       }
     } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oebond += sebond;
+    if (VFLAG && vflag) {
+       ov0 += sv0; ov1 += sv1; ov2 += sv2;
+       ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oebond;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
-    }
+  if (EFLAG) energy += oebond;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
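
The hunks above retire the single EVFLAG template parameter in favor of separate EFLAG/VFLAG flags, so energy and virial tallies become dead code in specializations that do not request them. Note that the energy-only case still instantiates eval<1,1,...> and relies on the runtime "VFLAG && vflag" guards for the virial. A minimal sketch of the dispatch idiom, with a hypothetical accumulate() kernel standing in for eval():

    // Sketch only: compile-time tally flags with a runtime vflag fallback,
    // mirroring the compute()/eval() dispatch in this patch; accumulate()
    // is a placeholder, not a LAMMPS function.
    template <int EFLAG, int VFLAG>
    void accumulate(int vflag, double &energy, double *virial) {
      if (EFLAG) energy += 1.0;            // compiled out when EFLAG == 0
      if (VFLAG && vflag)                  // VFLAG == 1 still honors runtime vflag
        for (int i = 0; i < 6; i++) virial[i] += 0.5;
    }

    void dispatch(int eflag, int vflag, double &e, double *v) {
      if (eflag || vflag) {
        if (vflag && !eflag) accumulate<0,1>(vflag, e, v);
        else accumulate<1,1>(vflag, e, v);
      } else accumulate<0,0>(vflag, e, v);
    }
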
diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp
index c07c226611..df8834c283 100644
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@@ -93,16 +93,16 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
     force->pair->vflag_either = force->pair->vflag_global = 1;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -114,7 +114,7 @@ void DihedralCharmmIntel::compute(int eflag, int vflag,
 
 #ifndef LMP_USE_AVXCD_DHC
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
@@ -140,13 +140,10 @@ void DihedralCharmmIntel::eval(const int vflag,
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oevdwl = oecoul = oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-      opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
-    }
+  if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -156,8 +153,13 @@ void DihedralCharmmIntel::eval(const int vflag,
 	      opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
+    #if defined(LMP_SIMD_COMPILER_TEST)
     int nfrom, nto, tid;
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    int nfrom, npl, nto, tid;
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -169,21 +171,19 @@ void DihedralCharmmIntel::eval(const int vflag,
 
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
-    if (EVFLAG) {
-      if (EFLAG)
-	sevdwl = secoul = sedihedral = (acc_t)0.0;
-      if (vflag) {
-	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-	spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
-      }
+    if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+      spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
     }
 
     #if defined(LMP_SIMD_COMPILER_TEST)
     #pragma vector aligned
     #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \
                            sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) 
-    #endif
     for (int n = nfrom; n < nto; n++) {
+    #endif
+    for (int n = nfrom; n < nto; n += npl) {
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
@@ -333,14 +333,14 @@ void DihedralCharmmIntel::eval(const int vflag,
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
 	flt_t deng;
 	if (EFLAG) deng = tk * p;
-	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, 
-			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
-			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
-			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
-			      sv0, sv1, sv2, sv3, sv4, sv5);
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, 
+                              i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm,
+                              vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, 
+                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
       }
 
 
@@ -387,7 +387,7 @@ void DihedralCharmmIntel::eval(const int vflag,
 	f4z -= delz*fpair;
       }
 
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
 	flt_t ev_pre = (flt_t)0;
 	if (NEWTON_BOND || i1 < nlocal)
 	  ev_pre += (flt_t)0.5;
@@ -412,13 +412,13 @@ void DihedralCharmmIntel::eval(const int vflag,
 	}
 	//	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
 	//				   delx, dely, delz);
-	if (vflag) {                                                    
-	  spv0 += ev_pre * delx * delx * fpair;                               
-	  spv1 += ev_pre * dely * dely * fpair;                               
-	  spv2 += ev_pre * delz * delz * fpair;                               
-	  spv3 += ev_pre * delx * dely * fpair;                               
-	  spv4 += ev_pre * delx * delz * fpair;                               
-	  spv5 += ev_pre * dely * delz * fpair;                               
+	if (VFLAG && vflag) {
+	  spv0 += ev_pre * delx * delx * fpair;
+	  spv1 += ev_pre * dely * dely * fpair;
+	  spv2 += ev_pre * delz * delz * fpair;
+	  spv3 += ev_pre * delx * dely * fpair;
+	  spv4 += ev_pre * delx * delz * fpair;
+	  spv5 += ev_pre * dely * delz * fpair;
 	}                                                                    
       }
 
@@ -440,36 +440,32 @@ void DihedralCharmmIntel::eval(const int vflag,
         }
       }
     } // for n
-    if (EVFLAG) {
-      if (EFLAG) {
-	oedihedral += sedihedral;
-	oecoul += secoul;
-	oevdwl += sevdwl;
-      }
-      if (vflag) {
-	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-	opv0 += spv0; opv1 += spv1; opv2 += spv2; 
-	opv3 += spv3; opv4 += spv4; opv5 += spv5;
-      }
-    }
-  } // omp parallel
-
-  if (EVFLAG) {
     if (EFLAG) {
-      energy += oedihedral;
-      force->pair->eng_vdwl += oevdwl;
-      force->pair->eng_coul += oecoul;
+      oedihedral += sedihedral;
+      oecoul += secoul;
+      oevdwl += sevdwl;
     }
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-      force->pair->virial[0] += opv0;
-      force->pair->virial[1] += opv1;
-      force->pair->virial[2] += opv2;
-      force->pair->virial[3] += opv3;
-      force->pair->virial[4] += opv4;
-      force->pair->virial[5] += opv5;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
+      opv0 += spv0; opv1 += spv1; opv2 += spv2; 
+      opv3 += spv3; opv4 += spv4; opv5 += spv5;
     }
+  } // omp parallel
+
+  if (EFLAG) {
+    energy += oedihedral;
+    force->pair->eng_vdwl += oevdwl;
+    force->pair->eng_coul += oecoul;
+  }
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+    force->pair->virial[0] += opv0;
+    force->pair->virial[1] += opv1;
+    force->pair->virial[2] += opv2;
+    force->pair->virial[3] += opv3;
+    force->pair->virial[4] += opv4;
+    force->pair->virial[5] += opv5;
   }
 
   fix->set_reduce_flag();
@@ -488,7 +484,7 @@ authors for more details.
 
 ------------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralCharmmIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
@@ -518,13 +514,10 @@ void DihedralCharmmIntel::eval(const int vflag,
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
   acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oevdwl = oecoul = oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-      opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
-    }
+  if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -534,8 +527,9 @@ void DihedralCharmmIntel::eval(const int vflag,
 	      opv0,opv1,opv2,opv3,opv4,opv5)
   #endif
   {
-    int nfrom, nto, tid;
-    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    int nfrom, npl, nto, tid;
+    IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads,
+			     swidth);
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -559,26 +553,24 @@ void DihedralCharmmIntel::eval(const int vflag,
 
     SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
     SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
-    if (EVFLAG) {
-      if (EFLAG) {
-	sevdwl = SIMD_set((acc_t)0.0);
-	secoul = SIMD_set((acc_t)0.0);
-	sedihedral = SIMD_set((acc_t)0.0);
-      }
-      if (vflag) {
-	sv0 = SIMD_set((acc_t)0.0);
-	sv1 = SIMD_set((acc_t)0.0);
-	sv2 = SIMD_set((acc_t)0.0);
-	sv3 = SIMD_set((acc_t)0.0);
-	sv4 = SIMD_set((acc_t)0.0);
-	sv5 = SIMD_set((acc_t)0.0);
-	spv0 = SIMD_set((acc_t)0.0);
-	spv1 = SIMD_set((acc_t)0.0);
-	spv2 = SIMD_set((acc_t)0.0);
-	spv3 = SIMD_set((acc_t)0.0);
-	spv4 = SIMD_set((acc_t)0.0);
-	spv5 = SIMD_set((acc_t)0.0);
-      }
+    if (EFLAG) {
+      sevdwl = SIMD_set((acc_t)0.0);
+      secoul = SIMD_set((acc_t)0.0);
+      sedihedral = SIMD_set((acc_t)0.0);
+    }
+    if (VFLAG && vflag) {
+      sv0 = SIMD_set((acc_t)0.0);
+      sv1 = SIMD_set((acc_t)0.0);
+      sv2 = SIMD_set((acc_t)0.0);
+      sv3 = SIMD_set((acc_t)0.0);
+      sv4 = SIMD_set((acc_t)0.0);
+      sv5 = SIMD_set((acc_t)0.0);
+      spv0 = SIMD_set((acc_t)0.0);
+      spv1 = SIMD_set((acc_t)0.0);
+      spv2 = SIMD_set((acc_t)0.0);
+      spv3 = SIMD_set((acc_t)0.0);
+      spv4 = SIMD_set((acc_t)0.0);
+      spv5 = SIMD_set((acc_t)0.0);
     }
 
     SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50,
@@ -588,7 +580,7 @@ void DihedralCharmmIntel::eval(const int vflag,
     const SIMD_int simd_nlocals4 = SIMD_set(nlocals4);
     const int ntypes = atom->ntypes + 1;
 
-    for (int n = nfrom; n < nto; n += swidth) {
+    for (int n = nfrom; n < nto; n += npl) {
       SIMD_mask nmask = n_offset < nto5;
       SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset);
       const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1);
@@ -601,7 +593,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset);
       const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type);
       type = type << 2;
-      n_offset = n_offset + swidth * 5;
+      n_offset = n_offset + npl * 5;
 
       // 1st bond
 
@@ -747,7 +739,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_flt_t f3z = -sz2 - f4z;
 
       SIMD_flt_t qdeng;
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
 	SIMD_flt_t ev_pre;
 	if (NEWTON_BOND) ev_pre = one;
 	else {
@@ -774,7 +766,7 @@ void DihedralCharmmIntel::eval(const int vflag,
 	    SIMD_jeng_update(newton_mask, featom, i3, ieng);
 	  }
 	}
-	if (vflag) {
+	if (VFLAG && vflag) {
           sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x));
 	  sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y));
 	  sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z));
@@ -816,7 +808,7 @@ void DihedralCharmmIntel::eval(const int vflag,
       f4y = f4y - dely * fpair;
       f4z = f4z - delz * fpair;
 
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
 	SIMD_flt_t ev_pre;
 	if (NEWTON_BOND) ev_pre = one;
 	else {
@@ -848,7 +840,7 @@ void DihedralCharmmIntel::eval(const int vflag,
             SIMD_jeng_update(newton_mask, featom, i4, ieng);
 	  }
 	}
-	if (vflag) {                                                    
+	if (VFLAG && vflag) {
           spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair);
 	  spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair);
 	  spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair);
@@ -865,45 +857,41 @@ void DihedralCharmmIntel::eval(const int vflag,
       SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z);
     } // for n
 
-    if (EVFLAG) {
-      if (EFLAG) {
-	oedihedral += SIMD_sum(sedihedral);
-	oecoul += SIMD_sum(secoul);
-	oevdwl += SIMD_sum(sevdwl);
-      }
-      if (vflag) {
-	ov0 += SIMD_sum(sv0); 
-	ov1 += SIMD_sum(sv1); 
-	ov2 += SIMD_sum(sv2); 
-	ov3 += SIMD_sum(sv3); 
-	ov4 += SIMD_sum(sv4); 
-	ov5 += SIMD_sum(sv5);
-	opv0 += SIMD_sum(spv0); 
-	opv1 += SIMD_sum(spv1); 
-	opv2 += SIMD_sum(spv2); 
-	opv3 += SIMD_sum(spv3); 
-	opv4 += SIMD_sum(spv4); 
-	opv5 += SIMD_sum(spv5);
-      }
-    }
-  } // omp parallel
-
-  if (EVFLAG) {
     if (EFLAG) {
-      energy += oedihedral;
-      force->pair->eng_vdwl += oevdwl;
-      force->pair->eng_coul += oecoul;
+      oedihedral += SIMD_sum(sedihedral);
+      oecoul += SIMD_sum(secoul);
+      oevdwl += SIMD_sum(sevdwl);
     }
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-      force->pair->virial[0] += opv0;
-      force->pair->virial[1] += opv1;
-      force->pair->virial[2] += opv2;
-      force->pair->virial[3] += opv3;
-      force->pair->virial[4] += opv4;
-      force->pair->virial[5] += opv5;
+    if (VFLAG && vflag) {
+      ov0 += SIMD_sum(sv0); 
+      ov1 += SIMD_sum(sv1); 
+      ov2 += SIMD_sum(sv2); 
+      ov3 += SIMD_sum(sv3); 
+      ov4 += SIMD_sum(sv4); 
+      ov5 += SIMD_sum(sv5);
+      opv0 += SIMD_sum(spv0); 
+      opv1 += SIMD_sum(spv1); 
+      opv2 += SIMD_sum(spv2); 
+      opv3 += SIMD_sum(spv3); 
+      opv4 += SIMD_sum(spv4); 
+      opv5 += SIMD_sum(spv5);
     }
+  } // omp parallel
+
+  if (EFLAG) {
+    energy += oedihedral;
+    force->pair->eng_vdwl += oevdwl;
+    force->pair->eng_coul += oecoul;
+  }
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+    force->pair->virial[0] += opv0;
+    force->pair->virial[1] += opv1;
+    force->pair->virial[2] += opv2;
+    force->pair->virial[3] += opv3;
+    force->pair->virial[4] += opv4;
+    force->pair->virial[5] += opv5;
   }
 
   fix->set_reduce_flag();
@@ -953,12 +941,14 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
   fc.set_ntypes(tp1,bp1,memory);
   buffers->set_ntypes(tp1);
 
-  for (int i = 0; i < tp1; i++) {
-    for (int j = 0; j < tp1; j++) {
-      fc.ljp[i][j].lj1 = lj14_1[i][j];
-      fc.ljp[i][j].lj2 = lj14_2[i][j];
-      fc.ljp[i][j].lj3 = lj14_3[i][j];
-      fc.ljp[i][j].lj4 = lj14_4[i][j];
+  if (weightflag) {
+    for (int i = 0; i < tp1; i++) {
+      for (int j = 0; j < tp1; j++) {
+	fc.ljp[i][j].lj1 = lj14_1[i][j];
+	fc.ljp[i][j].lj2 = lj14_2[i][j];
+	fc.ljp[i][j].lj3 = lj14_3[i][j];
+	fc.ljp[i][j].lj4 = lj14_4[i][j];
+      }
     }
   }
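
Throughout these kernels the thread decomposition switches between two forms: SIMD builds keep contiguous per-thread ranges (IP_PRE_omp_range_id) so the pragma simd loop can vectorize, while the default path interleaves iterations with a stride equal to the thread count (IP_PRE_omp_stride_id), which is why the loop increment changes from n++ to n += npl. (The LMP_SIMD_COMPILER_TEST variant above appears to be compiler-testing scaffolding and is not compiled in normal builds.) The macro internals are not part of this patch, so the sketch below only illustrates the semantics their call sites imply:

    // Sketch of the two partitionings implied by the call sites; the real
    // IP_PRE_omp_range_id / IP_PRE_omp_stride_id live in intel_preprocess.h.
    #include <algorithm>

    inline void range_id(int &nfrom, int &nto, int tid, int inum, int nthreads) {
      const int chunk = (inum + nthreads - 1) / nthreads;
      nfrom = tid * chunk;                  // contiguous block per thread
      nto = std::min(nfrom + chunk, inum);
    }

    inline void stride_id(int &nfrom, int &npl, int &nto, int tid,
                          int inum, int nthreads) {
      nfrom = tid;                          // interleaved: thread tid starts at tid
      npl = nthreads;                       // and advances by n += npl
      nto = inum;
    }
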
 
diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp
index 03ab152f49..94130f4355 100644
--- a/src/USER-INTEL/dihedral_harmonic_intel.cpp
+++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp
@@ -77,16 +77,16 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
   } else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -96,7 +96,7 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag,
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralHarmonicIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
@@ -120,12 +120,9 @@ void DihedralHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -134,8 +131,12 @@ void DihedralHarmonicIntel::eval(const int vflag,
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -144,16 +145,17 @@ void DihedralHarmonicIntel::eval(const int vflag,
     const int5_t * _noalias const dihedrallist = 
       (int5_t *) neighbor->dihedrallist[0];
 
+    #ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
-    if (EVFLAG) {
-      if (EFLAG)
-	sedihedral = (acc_t)0.0;
-      if (vflag) {
-	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-      }
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
-
-    for (int n = nfrom; n < nto; n++) {
+    #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
@@ -203,6 +205,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
 
       // error check
+      #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
 	int me = comm->me;
 
@@ -224,6 +227,7 @@ void DihedralHarmonicIntel::eval(const int vflag,
 		  me,x[i4].x,x[i4].y,x[i4].z);
 	}
       }
+      #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -292,16 +296,27 @@ void DihedralHarmonicIntel::eval(const int vflag,
       const flt_t f3y = -sy2 - f4y;
       const flt_t f3z = -sz2 - f4z;
 
-      if (EVFLAG) {
+      if (EFLAG || VFLAG) {
 	flt_t deng;
 	if (EFLAG) deng = tk * p;
-	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, 
-			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
-			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
-			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+	#ifdef LMP_INTEL_USE_SIMDOFF
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+	                      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
+	                      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
+	                      vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
 			      sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+	                      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
+	                      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
+	                      vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+			      ov0, ov1, ov2, ov3, ov4, ov5);
+        #endif
       }
 
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
 	  f[i1].x += f1x;
@@ -328,20 +343,19 @@ void DihedralHarmonicIntel::eval(const int vflag,
         }
       }
     } // for n
-    if (EVFLAG) {
-      if (EFLAG) oedihedral += sedihedral;
-      if (vflag) {
-	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-      }
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oedihedral += sedihedral;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG) energy += oedihedral;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
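
The accumulators above form a two-level reduction: SIMD-private partials (sedihedral, sv0..sv5) are combined by the pragma simd reduction clause within each thread, folded once into the o* totals, and those are merged across threads by the OpenMP reduction clause. A compact sketch of the same structure, with a placeholder loop body:

    // Sketch of the two-level reduction used in these kernels.
    double reduce_example() {
      double total = 0.0;
      #if defined(_OPENMP)
      #pragma omp parallel reduction(+:total)
      #endif
      {
        double partial = 0.0;               // private to this thread
        #pragma simd reduction(+:partial)   // Intel-compiler pragma, as above
        for (int n = 0; n < 1024; n++)
          partial += 0.001 * n;             // stands in for a per-term energy
        total += partial;                   // folded once per thread
      }
      return total;
    }
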
diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp
index bfd5a53956..3248a8bfc7 100644
--- a/src/USER-INTEL/dihedral_opls_intel.cpp
+++ b/src/USER-INTEL/dihedral_opls_intel.cpp
@@ -81,16 +81,16 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
   } else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -100,7 +100,7 @@ void DihedralOPLSIntel::compute(int eflag, int vflag,
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void DihedralOPLSIntel::eval(const int vflag, 
 			       IntelBuffers<flt_t,acc_t> *buffers,
 			       const ForceConst<flt_t> &fc)
@@ -124,12 +124,9 @@ void DihedralOPLSIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oedihedral = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -138,8 +135,12 @@ void DihedralOPLSIntel::eval(const int vflag,
     reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -148,16 +149,17 @@ void DihedralOPLSIntel::eval(const int vflag,
     const int5_t * _noalias const dihedrallist = 
       (int5_t *) neighbor->dihedrallist[0];
 
+    #ifdef LMP_INTEL_USE_SIMDOFF
     acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
-    if (EVFLAG) {
-      if (EFLAG)
-	sedihedral = (acc_t)0.0;
-      if (vflag) {
-	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-      }
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
     }
-
-    for (int n = nfrom; n < nto; n++) {
+    #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = dihedrallist[n].a;
       const int i2 = dihedrallist[n].b;
       const int i3 = dihedrallist[n].c;
@@ -236,6 +238,7 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3;
 
       // error check
+      #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
 	int me = comm->me;
 
@@ -257,6 +260,7 @@ void DihedralOPLSIntel::eval(const int vflag,
 		  me,x[i4].x,x[i4].y,x[i4].z);
 	}
       }
+      #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -321,14 +325,25 @@ void DihedralOPLSIntel::eval(const int vflag,
       const flt_t f3y = sy2 - f4y;
       const flt_t f3z = sz2 - f4z;
 
-      if (EVFLAG) {
-	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x, 
-			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
-			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
-			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, 
+			      i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
+			      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
+			      vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
 			      sv0, sv1, sv2, sv3, sv4, sv5);
+	#else
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, 
+			      i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
+			      vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
+			      vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+			      ov0, ov1, ov2, ov3, ov4, ov5);
+	#endif
       }
 
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
       {
         if (NEWTON_BOND || i1 < nlocal) {
 	  f[i1].x += f1x;
@@ -355,20 +370,19 @@ void DihedralOPLSIntel::eval(const int vflag,
         }
       }
     } // for n
-    if (EVFLAG) {
-      if (EFLAG) oedihedral += sedihedral;
-      if (vflag) {
-	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
-      }
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oedihedral += sedihedral;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
+    #endif
   } // omp parallel
 
-  if (EVFLAG) {
-    if (EFLAG) energy += oedihedral;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
-    }
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
   }
 
   fix->set_reduce_flag();
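
Two SIMDOFF-related moves recur in the dihedral files: the tolerance warning (printf calls that would defeat vectorization) is compiled out of SIMD builds, and the indexed force updates sit in a pragma simdoff region so only the scatter, where SIMD lanes may collide on a shared atom, runs scalar. A schematic of that idiom with illustrative names:

    // Schematic of the simdoff idiom used above: the arithmetic vectorizes,
    // the indexed scatter stays scalar because lanes may share an atom index.
    struct ForceVec { double x, y, z; };

    void scatter_add(const int * const idx, const double * const fx,
                     ForceVec * const f, const int nfrom, const int nto) {
      #pragma simd
      for (int n = nfrom; n < nto; n++) {
        const int i = idx[n];
        #pragma simdoff
        {
          f[i].x += fx[n];
        }
      }
    }
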
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index edd33eb72b..e132947750 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   int ncops = force->inumeric(FLERR,arg[3]);
 
   _nbor_pack_width = 1;
+  _three_body_neighbor = 0;
 
   _precision_mode = PREC_MODE_MIXED;
   _offload_balance = -1.0;
@@ -95,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
   _allow_separate_buffers = 1;
   _offload_ghost = -1;
   _lrt = 0;
-
+  
   int iarg = 4;
   while (iarg < narg) {
     if (strcmp(arg[iarg],"omp") == 0) {
@@ -140,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
       else error->all(FLERR,"Illegal package intel command");
       iarg += 2;
     }
-
+  
     // undocumented options
 
     else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) {
@@ -326,12 +327,18 @@ void FixIntel::init()
 	       "Currently, cannot use more than one intel style with hybrid.");
 
   check_neighbor_intel();
-  if (_precision_mode == PREC_MODE_SINGLE)
+  int off_mode = 0;
+  if (_offload_balance != 0.0) off_mode = 1;
+  if (_precision_mode == PREC_MODE_SINGLE) {
     _single_buffers->zero_ev();
-  else if (_precision_mode == PREC_MODE_MIXED)
+    _single_buffers->grow_ncache(off_mode,_nthreads);
+  } else if (_precision_mode == PREC_MODE_MIXED) {
     _mixed_buffers->zero_ev();
-  else
+    _mixed_buffers->grow_ncache(off_mode,_nthreads);
+  } else {
     _double_buffers->zero_ev();
+    _double_buffers->grow_ncache(off_mode,_nthreads);
+  }
 
   _need_reduce = 0;
 }
@@ -367,8 +374,6 @@ void FixIntel::pair_init_check(const bool cdmessage)
 {
   #ifdef INTEL_VMASK
   atom->sortfreq = 1;
-  if (neighbor->binsizeflag && atom->userbinsize <= 0.0)
-    atom->userbinsize = neighbor->binsize_user;
   #endif
 
   _nbor_pack_width = 1;
@@ -376,9 +381,8 @@ void FixIntel::pair_init_check(const bool cdmessage)
   #ifdef _LMP_INTEL_OFFLOAD
   if (_offload_balance != 0.0) atom->sortfreq = 1;
 
-  if (force->newton_pair == 0)
-    _offload_noghost = 0;
-  else if (_offload_ghost == 0)
+  _offload_noghost = 0;
+  if (force->newton_pair && _offload_ghost == 0)
     _offload_noghost = 1;
 
   set_offload_affinity();
@@ -535,24 +539,24 @@ void FixIntel::pre_reverse(int eflag, int vflag)
 {
   if (_force_array_m != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_m);
+      reduce_results(&_force_array_m[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
     _force_array_m = 0;
   } else if (_force_array_d != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_d);
+      reduce_results(&_force_array_d[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
     _force_array_d = 0;
   } else if (_force_array_s != 0) {
     if (_need_reduce) {
-      reduce_results(_force_array_s);
+      reduce_results(&_force_array_s[0].x);
       _need_reduce = 0;
     }
-    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0);
+    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
     _force_array_s = 0;
   }
 
@@ -563,47 +567,56 @@ void FixIntel::pre_reverse(int eflag, int vflag)
 
 /* ---------------------------------------------------------------------- */
 
-template <class ft>
-void FixIntel::reduce_results(ft * _noalias const f_start)
+template <class acc_t>
+void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
 {
   int o_range, f_stride;
   if (force->newton_pair)
     o_range = atom->nlocal + atom->nghost;
   else		
     o_range = atom->nlocal;
-  IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque);
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none) shared(o_range, f_stride)
-  #endif
-  {
-    int iifrom, iito, tid;
-    IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
-  			      sizeof(ft));
-
-    int t_off = f_stride;
-    if (_results_eatom) {
-      for (int t = 1; t < _nthreads; t++) {
-        _use_simd_pragma("vector nontemporal")
-        _use_simd_pragma("novector")
-        for (int n = iifrom; n < iito; n++) {
-	  f_start[n].x += f_start[n + t_off].x;
-	  f_start[n].y += f_start[n + t_off].y;
-	  f_start[n].z += f_start[n + t_off].z;
-	  f_start[n].w += f_start[n + t_off].w;
-	}
-	t_off += f_stride;
-      }
+  IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque);
+
+  o_range *= 4;
+  const int f_stride4 = f_stride * 4;
+
+  if (_nthreads <= INTEL_HTHREADS) {
+    acc_t *f_scalar2 = f_scalar + f_stride4;
+    if (_nthreads == 4) {
+      acc_t *f_scalar3 = f_scalar2 + f_stride4;
+      acc_t *f_scalar4 = f_scalar3 + f_stride4;
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+	f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];
+    } else if (_nthreads == 2) {
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+	f_scalar[n] += f_scalar2[n];
     } else {
+      acc_t *f_scalar3 = f_scalar2 + f_stride4;
+      _use_simd_pragma("vector aligned")
+      _use_simd_pragma("simd")
+      for (int n = 0; n < o_range; n++)
+	f_scalar[n] += f_scalar2[n] + f_scalar3[n];
+    }
+  } else {
+    #if defined(_OPENMP)
+    #pragma omp parallel
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
+				sizeof(acc_t));
+
+      acc_t *f_scalar2 = f_scalar + f_stride4;
       for (int t = 1; t < _nthreads; t++) {
-	_use_simd_pragma("vector nontemporal")
-	_use_simd_pragma("novector")
-	for (int n = iifrom; n < iito; n++) {
-	  f_start[n].x += f_start[n + t_off].x;
-	  f_start[n].y += f_start[n + t_off].y;
-	  f_start[n].z += f_start[n + t_off].z;
-	}
-	t_off += f_stride;
+	_use_simd_pragma("vector aligned")
+	_use_simd_pragma("simd")
+	for (int n = iifrom; n < iito; n++)
+          f_scalar[n] += f_scalar2[n];
+        f_scalar2 += f_stride4;
       }
     }
   }
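
reduce_results() now takes the first scalar of the force array instead of a typed vec3 pointer and sums each thread's copy as a flat run of 4*o_range values (x, y, z, and the fourth packed slot), letting the compiler emit one aligned SIMD sum rather than per-component struct updates; thread counts up to INTEL_HTHREADS get unrolled 2/3/4-way fast paths. A sketch of the flattened fold, assuming thread copies sit f_stride4 scalars apart:

    // Sketch of the flattened reduction, assuming 4 scalars per force record.
    template <class acc_t>
    void reduce_flat(acc_t * const f, const int range4, const int f_stride4,
                     const int nthreads) {
      for (int t = 1; t < nthreads; t++) {
        const acc_t * const ft = f + t * f_stride4;  // thread t's private copy
        #pragma simd
        for (int n = 0; n < range4; n++)
          f[n] += ft[n];                             // component-wise, branch-free
      }
    }
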
@@ -641,40 +654,59 @@ void FixIntel::add_results(const ft * _noalias const f_in,
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
     if (offload) {
-      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
       if (force->newton_pair) {
+	add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
 	const acc_t * _noalias const enull = 0;
 	int offset = _offload_nlocal;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom, vatom,
 		     _offload_min_ghost, _offload_nghost);
-      }
+      } else
+	add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair());
     } else {
-      add_oresults(f_in, ev_global, eatom, vatom,
-		   _host_min_local, _host_used_local);
       if (force->newton_pair) {
+	add_oresults(f_in, ev_global, eatom, vatom,
+		     _host_min_local, _host_used_local);
 	const acc_t * _noalias const enull = 0;
 	int offset = _host_used_local;
 	if (atom->torque) offset *= 2;
 	add_oresults(f_in + offset, enull, eatom,
 		     vatom, _host_min_ghost, _host_used_ghost);
+      } else {
+	int start = host_start_pair();
+	add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start);
       }
     }
     stop_watch(TIME_PACK);
     return;
   }
-  if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
-    f_length = atom->nlocal + atom->nghost;
-  else
-    f_length = atom->nlocal;
+  int start;
+  if (offload) {
+    start = 0;
+    if (force->newton_pair) {
+      if (_offload_noghost == 0)
+	f_length = atom->nlocal + atom->nghost;
+      else
+	f_length = atom->nlocal;
+    } else
+      f_length = offload_end_pair();
+  } else {
+    if (force->newton_pair) {
+      start = 0;
+      f_length = atom->nlocal + atom->nghost;
+    } else {
+      start = host_start_pair();
+      f_length = atom->nlocal - start;
+    }
+  }
+  add_oresults(f_in, ev_global, eatom, vatom, start, f_length);
   #else
   if (force->newton_pair)
     f_length = atom->nlocal + atom->nghost;
   else
     f_length = atom->nlocal;
-  #endif
-
   add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
+  #endif
   stop_watch(TIME_PACK);
 }
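
With Newton's third law off, a thread never accumulates forces for atoms it does not own, so add_results() can pass add_oresults() a (start, length) window instead of always copying 0..nlocal+nghost; host_start_pair() marks the first host-owned atom when pair work is split with a coprocessor. A sketch of the window selection (ignoring the _offload_noghost refinement; the struct is illustrative):

    // Sketch of the copy window chosen in add_results() above.
    struct Window { int start, length; };

    Window result_window(const bool offload, const bool newton_pair,
                         const int nlocal, const int nghost,
                         const int host_start, const int offload_end) {
      if (offload)
        return { 0, newton_pair ? nlocal + nghost : offload_end };
      if (newton_pair)
        return { 0, nlocal + nghost };             // fold ghost forces too
      return { host_start, nlocal - host_start };  // host-owned atoms only
    }
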
 
@@ -695,8 +727,11 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
                    "Sphere particles not yet supported for gayberne/intel");
   }
 
+  int packthreads;
+  if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads;
+  else packthreads = 1;
   #if defined(_OPENMP)
-  #pragma omp parallel default(none)
+  #pragma omp parallel if(packthreads > 1)
   #endif
   {
     #if defined(_OPENMP)
@@ -705,7 +740,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in,
     const int tid = 0;
     #endif
     int ifrom, ito;
-    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
+    IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t));
     if (atom->torque) {
       int ii = ifrom * 2;
       lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
@@ -833,6 +868,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
       _offload_nlocal;
   }
 
+  if (atom->torque)
+    if (f_in[1].w < 0.0)
+      error->all(FLERR, "Bad matrix inversion in mldivide3");
+  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
+
   // Load balance?
   if (_offload_balance < 0.0) {
     if (neighbor->ago == 0)
@@ -860,10 +900,6 @@ void FixIntel::add_off_results(const ft * _noalias const f_in,
   stop_watch(TIME_IMBALANCE);
   #endif
   acc_timers();
-  if (atom->torque)
-    if (f_in[1].w < 0.0)
-      error->all(FLERR, "Bad matrix inversion in mldivide3");
-  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index f4c02b37b5..92d1311256 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -70,23 +70,32 @@ class FixIntel : public Fix {
 
   inline int nbor_pack_width() const { return _nbor_pack_width; }
   inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
-
+  inline int three_body_neighbor() { return _three_body_neighbor; }
+  inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; }
+  
   inline int need_zero(const int tid) {
     if (_need_reduce == 0 && tid > 0) return 1;
     return 0;
   }
-  inline void set_reduce_flag() { _need_reduce = 1; }
+  inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
   inline int lrt() {
     if (force->kspace_match("pppm/intel", 0)) return _lrt;
     else return 0;
   }
+  inline int pppm_table() {
+    if (force->kspace_match("pppm/intel", 0) ||
+	force->kspace_match("pppm/disp/intel",0)) 
+      return INTEL_P3M_TABLE;
+    else return 0;
+  }
+  
 
  protected:
   IntelBuffers<float,float> *_single_buffers;
   IntelBuffers<float,double> *_mixed_buffers;
   IntelBuffers<double,double> *_double_buffers;
 
-  int _precision_mode, _nthreads, _nbor_pack_width;
+  int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
 
  public:
   inline int* get_overflow_flag() { return _overflow_flag; }
@@ -241,7 +250,10 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
     } else {
       nlocal = atom->nlocal;
       nall = _host_nall;
-      minlocal = _host_min_local;
+      if (force->newton)
+        minlocal = _host_min_local;
+      else
+	minlocal = host_start_pair();
     }
     return;
   }
@@ -275,7 +287,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
   #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
   #endif
 
   if (_overflow_flag[LMP_OVERFLOW])
@@ -303,7 +315,7 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
   #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
   #endif
 
   if (_overflow_flag[LMP_OVERFLOW])
@@ -331,7 +343,7 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
   _results_eatom = eatom;
   _results_vatom = vatom;
   #ifndef _LMP_INTEL_OFFLOAD
-  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1;
   #endif
 
   if (_overflow_flag[LMP_OVERFLOW])
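
The header side completes the same idea: _need_reduce is raised only when multiple threads accumulate into private force copies, which this patch ties to force->newton, and set_reduce_flag() becomes a no-op on a single thread. The condition, as one predicate:

    // Sketch of the reduction gating added in add_result_array() above.
    inline bool needs_force_reduce(const int rflag, const int nthreads,
                                   const bool newton) {
      return rflag != 2 && nthreads > 1 && newton;  // otherwise one copy suffices
    }
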
diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp
index 0fb02420b9..df13cd5d66 100644
--- a/src/USER-INTEL/improper_cvff_intel.cpp
+++ b/src/USER-INTEL/improper_cvff_intel.cpp
@@ -87,16 +87,16 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -108,7 +108,7 @@ void ImproperCvffIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperCvffIntel::eval(const int vflag, 
 				 IntelBuffers<flt_t,acc_t> *buffers,
 				 const ForceConst<flt_t> &fc)
@@ -131,12 +131,9 @@ void ImproperCvffIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeimproper = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oeimproper = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -145,8 +142,12 @@ void ImproperCvffIntel::eval(const int vflag,
     reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF_FIX
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -155,7 +156,17 @@ void ImproperCvffIntel::eval(const int vflag,
     const int5_t * _noalias const improperlist = 
       (int5_t *) neighbor->improperlist[0];
 
+    #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+    acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seimproper = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
@@ -216,7 +227,7 @@ void ImproperCvffIntel::eval(const int vflag,
       flt_t c = (c0 + c1mag*c2mag) * s12;
 
       // error check
-
+      #ifndef LMP_INTEL_USE_SIMDOFF_FIX
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
 	MPI_Comm_rank(world,&me);
@@ -238,6 +249,7 @@ void ImproperCvffIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+      #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -250,31 +262,36 @@ void ImproperCvffIntel::eval(const int vflag,
       const int m = fc.fc[type].multiplicity;
 
       flt_t p, pd;
-      if (m == 2) {
-	p = (flt_t)2.0*c*c;
-	pd = (flt_t)2.0*c;
-      } else if (m == 3) {
-	const flt_t rc2 = c*c;
-	p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
-	pd = (flt_t)6.0*rc2 - (flt_t)1.5;
-      } else if (m == 4) {
-	const flt_t rc2 = c*c;
-	p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
-	pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
-      } else if (m == 6) {
-	const flt_t rc2 = c*c;
-	p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
-	pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
-      } else if (m == 1) {
-	p = c + (flt_t)1.0;
-	pd = (flt_t)0.5;
-      } else if (m == 5) {
-	const flt_t rc2 = c*c;
-	p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
-	pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
-      } else if (m == 0) {
-	p = (flt_t)2.0;
-	pd = (flt_t)0.0;
+      #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+      #pragma simdoff
+      #endif
+      {
+        if (m == 2) {
+          p = (flt_t)2.0*c*c;
+	  pd = (flt_t)2.0*c;
+        } else if (m == 3) {
+	  const flt_t rc2 = c*c;
+	  p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0;
+	  pd = (flt_t)6.0*rc2 - (flt_t)1.5;
+        } else if (m == 4) {
+          const flt_t rc2 = c*c;
+	  p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0;
+	  pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c;
+        } else if (m == 6) {
+          const flt_t rc2 = c*c;
+	  p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2;
+	  pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c;
+        } else if (m == 1) {
+	  p = c + (flt_t)1.0;
+	  pd = (flt_t)0.5;
+        } else if (m == 5) {
+	  const flt_t rc2 = c*c;
+	  p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0;
+	  pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5;
+        } else if (m == 0) {
+          p = (flt_t)2.0;
+	  pd = (flt_t)0.0;
+        }
       }
 
       if (fc.fc[type].sign == -1) {
@@ -317,46 +334,63 @@ void ImproperCvffIntel::eval(const int vflag,
 
       // apply force to each of 4 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-	f[i1].y += f1y;
-	f[i1].z += f1z;
-      }
+      #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+	  f[i1].y += f1y;
+	  f[i1].z += f1z;
+        }
 
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x += f2x;
-	f[i2].y += f2y;
-	f[i2].z += f2z;
-      }
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+	  f[i2].y += f2y;
+	  f[i2].z += f2z;
+        }
 
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
+	if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+	  f[i3].y += f3y;
+	  f[i3].z += f3z;
+        }
 
-      if (NEWTON_BOND || i4 < nlocal) {
-        f[i4].x += f4x;
-	f[i4].y += f4y;
-	f[i4].z += f4z;
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+	  f[i4].y += f4y;
+	  f[i4].z += f4z;
+        }
       }
 
-      if (EVFLAG) {
-	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, 
-                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, 
-			      vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal,
-			      ov0, ov1, ov2, ov3, ov4, ov5);
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, 
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, 
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, 
+                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
+	#else
+	IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, 
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, 
+                              f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, 
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, 
+                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+	#endif
       }
     } // for n
-  } // omp parallel
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeimproper;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    #ifdef LMP_INTEL_USE_SIMDOFF_FIX
+    if (EFLAG) oeimproper += seimproper;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
+    #endif
+  } // omp parallel
+  if (EFLAG) energy += oeimproper;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
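
The p/pd table guarded above evaluates p = 1 + cos(m*phi) as a polynomial in c = cos(phi), with pd = (1/2) dp/dc; the branch chain over the multiplicity m is data-dependent, so it is kept scalar inside the vector loop. A quick numeric check of that reading for the m = 2 entry:

    // Check that the m == 2 entry (p = 2c^2, pd = 2c) matches
    // p = 1 + cos(2*phi) with c = cos(phi) and pd = 0.5 * dp/dc.
    #include <cmath>
    #include <cstdio>

    int main() {
      const double c = 0.3, phi = std::acos(c);
      std::printf("p : %g vs %g\n", 2.0 * c * c, 1.0 + std::cos(2.0 * phi));
      std::printf("pd: %g vs %g\n", 2.0 * c, 0.5 * (4.0 * c));
      return 0;
    }
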
diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp
index 071ff548ea..cc854091f5 100644
--- a/src/USER-INTEL/improper_harmonic_intel.cpp
+++ b/src/USER-INTEL/improper_harmonic_intel.cpp
@@ -88,16 +88,16 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
   else evflag = 0;
 
   if (evflag) {
-    if (eflag) {
+    if (vflag && !eflag) {
       if (force->newton_bond)
-	eval<1,1,1>(vflag, buffers, fc);
+	eval<0,1,1>(vflag, buffers, fc);
       else
-	eval<1,1,0>(vflag, buffers, fc);
+	eval<0,1,0>(vflag, buffers, fc);
     } else {
       if (force->newton_bond)
-	eval<1,0,1>(vflag, buffers, fc);
+	eval<1,1,1>(vflag, buffers, fc);
       else
-	eval<1,0,0>(vflag, buffers, fc);
+	eval<1,1,0>(vflag, buffers, fc);
     }
   } else {
     if (force->newton_bond)
@@ -109,7 +109,7 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
 void ImproperHarmonicIntel::eval(const int vflag, 
 				 IntelBuffers<flt_t,acc_t> *buffers,
 				 const ForceConst<flt_t> &fc)
@@ -132,12 +132,9 @@ void ImproperHarmonicIntel::eval(const int vflag,
   const int nthreads = tc;
 
   acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
-  if (EVFLAG) {
-    if (EFLAG)
-      oeimproper = (acc_t)0.0;
-    if (vflag) {
-      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+  if (EFLAG) oeimproper = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
   }
 
   #if defined(_OPENMP)
@@ -146,8 +143,12 @@ void ImproperHarmonicIntel::eval(const int vflag,
     reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
   #endif
   {
-    int nfrom, nto, tid;
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
     IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
 
     FORCE_T * _noalias const f = f_start + (tid * f_stride);
     if (fix->need_zero(tid))
@@ -156,7 +157,17 @@ void ImproperHarmonicIntel::eval(const int vflag,
     const int5_t * _noalias const improperlist = 
       (int5_t *) neighbor->improperlist[0];
 
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) seimproper = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5)
     for (int n = nfrom; n < nto; n++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
       const int i1 = improperlist[n].a;
       const int i2 = improperlist[n].b;
       const int i3 = improperlist[n].c;
@@ -207,7 +218,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
       flt_t c = (c1*c2 + c0) * s12;
 
       // error check
-
+      #ifndef LMP_INTEL_USE_SIMDOFF
       if (c > PTOLERANCE || c < MTOLERANCE) {
         int me;
 	MPI_Comm_rank(world,&me);
@@ -229,6 +240,7 @@ void ImproperHarmonicIntel::eval(const int vflag,
                   me,x[i4].x,x[i4].y,x[i4].z);
         }
       }
+      #endif
 
       if (c > (flt_t)1.0) c = (flt_t)1.0;
       if (c < (flt_t)-1.0) c = (flt_t)-1.0;
@@ -278,46 +290,63 @@ void ImproperHarmonicIntel::eval(const int vflag,
 
       // apply force to each of 4 atoms
 
-      if (NEWTON_BOND || i1 < nlocal) {
-        f[i1].x += f1x;
-	f[i1].y += f1y;
-	f[i1].z += f1z;
-      }
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+	  f[i1].y += f1y;
+	  f[i1].z += f1z;
+        }
 
-      if (NEWTON_BOND || i2 < nlocal) {
-        f[i2].x += f2x;
-	f[i2].y += f2y;
-	f[i2].z += f2z;
-      }
+	if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+	  f[i2].y += f2y;
+	  f[i2].z += f2z;
+        }
 
-      if (NEWTON_BOND || i3 < nlocal) {
-        f[i3].x += f3x;
-        f[i3].y += f3y;
-        f[i3].z += f3z;
-      }
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+	  f[i3].y += f3y;
+	  f[i3].z += f3z;
+        }
 
-      if (NEWTON_BOND || i4 < nlocal) {
-        f[i4].x += f4x;
-	f[i4].y += f4y;
-	f[i4].z += f4z;
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+	  f[i4].y += f4y;
+	  f[i4].z += f4z;
+        }
       }
 
-      if (EVFLAG) {
-	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, 
-                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
-                              vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, 
-                              vb3z, oeimproper, f, NEWTON_BOND, nlocal,
-			      ov0, ov1, ov2, ov3, ov4, ov5);
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, 
+                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
+                              vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, 
+                              nlocal, sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2,
+                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, 
+                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,
+                              vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, 
+                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+	#endif
       }
     } // for n
-  } // omp parallel
-  if (EVFLAG) {
-    if (EFLAG)
-      energy += oeimproper;
-    if (vflag) {
-      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
-      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oeimproper += seimproper;
+    if (VFLAG && vflag) {
+      ov0 += sv0; ov1 += sv1; ov2 += sv2;
+      ov3 += sv3; ov4 += sv4; ov5 += sv5;
     }
+    #endif
+  } // omp parallel
+  if (EFLAG) energy += oeimproper;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
   }
 
   fix->set_reduce_flag();
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index c81dffec83..bacc8a8bad 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -12,6 +12,7 @@
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
+#include <math.h>
 #include "intel_buffers.h"
 #include "force.h"
 #include "memory.h"
@@ -28,6 +29,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
   _ntypes = 0;
   _off_map_listlocal = 0;
   _ccachex = 0;
+  _ncache_alloc = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   _separate_buffers = 0;
   _off_f = 0;
@@ -36,6 +38,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
   _off_list_alloc = false;
   _off_threads = 0;
   _off_ccache = 0;
+  _off_ncache = 0;
   _host_nmax = 0;
   #endif
 }
@@ -111,15 +114,20 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
     _buf_local_size = _buf_size;
   else
     _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
-  if (lmp->atom->torque)
-    _buf_local_size *= 2;
   const int f_stride = get_stride(_buf_local_size);
   lmp->memory->create(_x, _buf_size,"intel_x");
   if (lmp->atom->q != NULL)
     lmp->memory->create(_q, _buf_size, "intel_q");
   if (lmp->atom->ellipsoid != NULL)
     lmp->memory->create(_quat, _buf_size, "intel_quat");
-  lmp->memory->create(_f, f_stride * nthreads, "intel_f");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (lmp->force->newton_pair)
+  #else
+  if (lmp->force->newton_pair || lmp->atom->molecular)
+  #endif
+    lmp->memory->create(_f, f_stride * nthreads, "intel_f");
+  else
+    lmp->memory->create(_f, f_stride, "intel_f");
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_separate_buffers) {
@@ -131,7 +139,10 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
   }
 
   if (offload_end > 0) {
-    lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
+    int fm;
+    if (lmp->force->newton_pair) fm = _off_threads;
+    else fm = 1;
+    lmp->memory->create(_off_f, f_stride * fm, "intel_off_f");
     const atom_t * const x = get_x();
     const flt_t * const q = get_q();
     const vec3_acc_t * f_start = get_off_f();
@@ -140,14 +151,14 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
       if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
-	  nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+	  nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
 	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     } else {
       if (x != NULL && f_start != NULL && ev_global != NULL) {
         #pragma offload_transfer target(mic:_cop) \
           nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
-          nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+          nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\
 	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
       }
     }
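
The two sizing branches above implement one rule: replicate the force buffer per thread only when two threads can accumulate into the same atom's force (newton_pair on, or, on the host, bonded terms present); otherwise a single f_stride-sized array suffices and the cross-thread reduction is skipped entirely. As a standalone sketch (hypothetical helper mirroring the conditions above):

    #include <cstddef>

    // Hypothetical helper mirroring the allocation rule above: per-thread
    // force copies are only needed when cross-thread accumulation occurs.
    size_t force_buffer_elems(size_t f_stride, int nthreads,
                              bool newton_pair, bool molecular,
                              bool offload_build) {
      const bool replicate =
        offload_build ? newton_pair : (newton_pair || molecular);
      return f_stride * (replicate ? nthreads : 1);
    }
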
@@ -427,6 +438,115 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
 
 /* ---------------------------------------------------------------------- */
 
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_ncache()
+{
+  if (_ncache_alloc) {
+    flt_t *ncachex = _ncachex;
+    flt_t *ncachey = _ncachey;
+    flt_t *ncachez = _ncachez;
+    int *ncachej = _ncachej;
+    int *ncachejtype = _ncachejtype;
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_ncache) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \
+        nocopy(ncachejtype:alloc_if(0) free_if(1))
+    }
+    _off_ncache = 0;
+    #endif
+
+    lmp->memory->destroy(ncachex);
+    lmp->memory->destroy(ncachey);
+    lmp->memory->destroy(ncachez);
+    lmp->memory->destroy(ncachej);
+    lmp->memory->destroy(ncachejtype);
+
+    _ncache_alloc = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
+					     const int nthreads)
+{
+  const int nsize = get_max_nbors() * 3;
+  int esize = MIN(sizeof(int), sizeof(flt_t));
+  IP_PRE_get_stride(_ncache_stride, nsize, esize, 0);
+  int nt = MAX(nthreads, _off_threads);
+  const int vsize = _ncache_stride * nt;
+
+  if (_ncache_alloc) {
+    if (vsize > _ncache_alloc)
+      free_ncache();
+    #ifdef _LMP_INTEL_OFFLOAD
+    else if (off_flag && _off_ncache == 0)
+      free_ncache();
+    #endif
+    else
+      return;
+  }
+
+  lmp->memory->create(_ncachex, vsize, "_ncachex");
+  lmp->memory->create(_ncachey, vsize, "_ncachey");
+  lmp->memory->create(_ncachez, vsize, "_ncachez");
+  lmp->memory->create(_ncachej, vsize, "_ncachej");
+  lmp->memory->create(_ncachejtype, vsize, "_ncachejtype");
+
+  _ncache_alloc = vsize;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (off_flag) {
+    flt_t *ncachex = _ncachex;
+    flt_t *ncachey = _ncachey;
+    flt_t *ncachez = _ncachez;
+    int *ncachej = _ncachej;
+    int *ncachejtype = _ncachejtype;
+
+    if (ncachex != NULL && ncachey != NULL && ncachez != NULL &&
+	ncachej != NULL && ncachejtype != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \
+        nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \
+        nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0))
+    }
+    _off_ncache = 1;
+  }
+  #endif
+}
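
The new per-thread neighbor cache holds three flt_t entries plus an index and a type per neighbor; _ncache_stride is rounded up so that each thread's slice of the shared slab starts on an INTEL_DATA_ALIGN boundary, and taking the smaller of sizeof(int) and sizeof(flt_t) as the element size yields a count that aligns both the coordinate and index arrays. A standalone sketch of the rounding:

    #include <cstdio>

    // Rounding as in grow_ncache(): pad the per-thread element count so
    // each thread's slice of the slab begins on a 64-byte boundary.
    int ncache_stride(int max_nbors, int esize, int align = 64) {
      const int nsize = max_nbors * 3;          // x, y, z per neighbor
      const int per_block = align / esize;      // elements per aligned block
      return (nsize + per_block - 1) / per_block * per_block;
    }

    int main() {                                // 1000 neighbors, 4-byte elems
      std::printf("%d\n", ncache_stride(1000, 4));  // 3000 -> 3008
    }
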
+
+/* ---------------------------------------------------------------------- */
+
+#ifndef _LMP_INTEL_OFFLOAD
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, 
+    const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1,
+    acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) 
+{
+  IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0,
+                            ov1, ov2, ov3, ov4, ov5);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#ifndef _LMP_INTEL_OFFLOAD
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, 
+    const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, 
+    acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5)
+{
+  int iifrom, iito, tid;
+  IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2,
+			 ov0, ov1, ov2, ov3, ov4, ov5);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
 {
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index 3462d013a1..9b73a65f60 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -78,6 +78,7 @@ class IntelBuffers {
     free_nbor_list();
     free_nmax();
     free_list_local();
+    free_ncache();
   }
 
   inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
@@ -106,6 +107,15 @@ class IntelBuffers {
   inline acc_t * get_ccachef() { return _ccachef; }
   #endif
 
+  void free_ncache();
+  void grow_ncache(const int off_flag, const int nthreads);
+  inline int ncache_stride() { return _ncache_stride; }
+  inline flt_t * get_ncachex() { return _ncachex; }
+  inline flt_t * get_ncachey() { return _ncachey; }
+  inline flt_t * get_ncachez() { return _ncachez; }
+  inline int * get_ncachej() { return _ncachej; }
+  inline int * get_ncachejtype() { return _ncachejtype; }
+
   inline int get_max_nbors() {
     int mn = lmp->neighbor->oneatom * sizeof(int) /
         (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
@@ -180,6 +190,15 @@ class IntelBuffers {
     }
   }
 
+  #ifndef _LMP_INTEL_OFFLOAD
+  void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, 
+		       const int f_stride, acc_t &ov0, acc_t &ov1,
+		       acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5);
+  void fdotr_reduce(const int nall, const int nthreads, const int f_stride, 
+		    acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, 
+		    acc_t &ov4, acc_t &ov5);
+  #endif
+
   #ifdef _LMP_INTEL_OFFLOAD
   inline void thr_pack_cop(const int ifrom, const int ito,
 			   const int offset, const bool dotype = false) {
@@ -263,6 +282,10 @@ class IntelBuffers {
   int _ccache_stride;
   flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
   int *_ccachei, *_ccachej;
+
+  int _ncache_stride, _ncache_alloc;
+  flt_t *_ncachex, *_ncachey, *_ncachez;
+  int *_ncachej, *_ncachejtype;
   #ifdef LMP_USE_AVXCD
   int _ccache_stride3;
   acc_t * _ccachef;
@@ -274,7 +297,7 @@ class IntelBuffers {
   flt_t *_host_q;
   quat_t *_host_quat;
   vec3_acc_t *_off_f;
-  int _off_map_nmax, _cop, _off_ccache;
+  int _off_map_nmax, _cop, _off_ccache, _off_ncache;
   int *_off_map_ilist;
   int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
   int *_off_map_numneigh;
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index ad07dfd7c2..93787cd6c8 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -17,6 +17,9 @@
 
 #ifdef __INTEL_COMPILER
 #define LMP_SIMD_COMPILER
+#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
+#define LMP_INTEL_USE_SIMDOFF
+#endif
 #endif
 
 #ifdef __INTEL_OFFLOAD
@@ -65,7 +68,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_STENCIL 256
 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
 #define INTEL_MAX_STENCIL_CHECK 4096
-#define INTEL_P3M_MAXORDER 5
+#define INTEL_P3M_MAXORDER 7
+#define INTEL_P3M_ALIGNED_MAXORDER 8
+// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
+#define INTEL_P3M_TABLE 1
 
 #ifdef __INTEL_COMPILER
 #ifdef __AVX__
@@ -87,7 +93,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #ifdef __MIC__
 #define INTEL_V512 1
 #define INTEL_VMASK 1
+#define INTEL_HTHREADS 4
+#endif
 #endif
+
+#ifdef __AVX512ER__
+#define INTEL_HTHREADS 4
 #endif
 
 #ifdef __AVX512CD__
@@ -96,15 +107,22 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #endif
 #endif
 
+#ifdef __MIC__
+#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
+#else
+#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
+#endif
+
 #else
 
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 1
+#define INTEL_COMPILE_WIDTH 1
 
 #endif
 
 #define INTEL_DATA_ALIGN 64
-#define INTEL_ONEATOM_FACTOR 2
+#define INTEL_ONEATOM_FACTOR 1
 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
 #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
 #define INTEL_LB_MEAN_WEIGHT 0.1
@@ -112,6 +130,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 #define INTEL_MAX_HOST_CORE_COUNT 512
 #define INTEL_MAX_COI_CORES 36
 
+#ifndef INTEL_HTHREADS
+#define INTEL_HTHREADS 2
+#endif
+
 #define IP_PRE_get_stride(stride, n, datasize, torque)	\
   {								\
     int blength = n;						\
@@ -125,9 +147,17 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 
 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) 	\
   {								\
-    const int idelta = 1 + inum/nthreads;			\
+    int idelta = inum/nthreads;					\
+    const int imod = inum % nthreads;				\
     ifrom = tid * idelta;					\
-    ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;	\
+    ito = ifrom + idelta;					\
+    if (tid < imod) {						\
+      ito += tid + 1;						\
+      ifrom += tid;						\
+    } else {							\
+      ito += imod;						\
+      ifrom += imod;						\
+    }								\
   }
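
The reworked IP_PRE_omp_range spreads the remainder of inum/nthreads one item at a time over the lowest-ranked threads instead of giving every thread 1 + inum/nthreads items and leaving the last thread short or idle. For example, inum = 10 over nthreads = 4 now yields ranges of 3, 3, 2, 2 rather than 3, 3, 3, 1. A standalone check:

    #include <cstdio>

    // Standalone version of the balanced split in IP_PRE_omp_range.
    void omp_range(int tid, int inum, int nthreads, int &ifrom, int &ito) {
      const int idelta = inum / nthreads;
      const int imod = inum % nthreads;
      ifrom = tid * idelta;
      ito = ifrom + idelta;
      if (tid < imod) { ito += tid + 1; ifrom += tid; }
      else { ito += imod; ifrom += imod; }
    }

    int main() {
      int ifrom, ito;
      for (int tid = 0; tid < 4; tid++) {
        omp_range(tid, 10, 4, ifrom, ito);
        std::printf("tid %d: [%d,%d)\n", tid, ifrom, ito);  // 0-3, 3-6, 6-8, 8-10
      }
    }
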
 
 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
@@ -136,12 +166,37 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);		\
   }
 
+#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)	\
+  {								\
+    if (nthr <= INTEL_HTHREADS) {				\
+      ifrom = tid;						\
+      ito = inum;					      	\
+      ip = nthr;						\
+    } else if (nthr % INTEL_HTHREADS == 0) {			\
+      int nd = nthr / INTEL_HTHREADS;				\
+      int td = tid / INTEL_HTHREADS;				\
+      int tm = tid % INTEL_HTHREADS;				\
+      IP_PRE_omp_range(ifrom, ito, td, inum, nd);		\
+      ifrom += tm;						\
+      ip = INTEL_HTHREADS;					\
+    } else {							\
+      IP_PRE_omp_range(ifrom, ito, tid, inum, nthr);		\
+      ip = 1;							\
+    }								\
+  }
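
IP_PRE_omp_stride targets hyperthreaded cores: with at most INTEL_HTHREADS threads, every thread strides through the full range with step nthr; when nthr is a multiple of INTEL_HTHREADS, the range is first block-partitioned across cores and the INTEL_HTHREADS threads sharing a core interleave within their block, so co-resident threads touch the same cache lines instead of distant ones. A standalone sketch of that middle case (hypothetical helper, reusing the balanced split shown above):

    #include <cstdio>

    // Sketch of IP_PRE_omp_stride: with few threads everyone strides the
    // whole range; otherwise block-partition across cores, then interleave
    // the hthreads sharing each core.
    void omp_stride(int tid, int inum, int nthr, int hthreads,
                    int &ifrom, int &ip, int &ito) {
      if (nthr <= hthreads) {
        ifrom = tid; ito = inum; ip = nthr;
      } else {                          // assumes nthr % hthreads == 0
        const int nd = nthr / hthreads; // cores
        const int td = tid / hthreads;  // core index
        const int tm = tid % hthreads;  // rank within the core
        const int idelta = inum / nd, imod = inum % nd;
        ifrom = td * idelta + (td < imod ? td : imod);
        ito = ifrom + idelta + (td < imod ? 1 : 0);
        ifrom += tm;                    // hyperthreads interleave by rank
        ip = hthreads;
      }
    }

    int main() {
      int ifrom, ip, ito;               // 8 threads on 2 cores x 4 hyperthreads
      for (int tid = 0; tid < 8; tid++) {
        omp_stride(tid, 100, 8, 4, ifrom, ip, ito);
        std::printf("tid %d: from %d step %d to %d\n", tid, ifrom, ip, ito);
      }
    }
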
+
+#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)	\
+  {								\
+    tid = omp_get_thread_num();         			\
+    IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr);		\
+  }
+
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
   int chunk_size = INTEL_DATA_ALIGN / datasize;                 \
-  int idelta = static_cast<int>(static_cast<float>(inum)	\
-				/chunk_size/nthreads) + 1;	\
+  int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
+				     /chunk_size/nthreads));	\
   idelta *= chunk_size;						\
   ifrom = tid*idelta;                                           \
   ito = ifrom + idelta;                                         \
@@ -168,6 +223,29 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     if (ito > inum) ito = inum;					\
   }
 
+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)			\
+  {								\
+    tid = omp_get_thread_num();					\
+    if (nthr <= INTEL_HTHREADS) {				\
+      ifrom = tid*vecsize;					\
+      ito = inum;					      	\
+      ip = nthr*vecsize;					\
+    } else if (nthr % INTEL_HTHREADS == 0) {			\
+      int nd = nthr / INTEL_HTHREADS;				\
+      int td = tid / INTEL_HTHREADS;				\
+      int tm = tid % INTEL_HTHREADS;				\
+      IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd,         \
+	vecsize);						\
+      ifrom += tm * vecsize;					\
+      ip = INTEL_HTHREADS * vecsize;				\
+    } else {							\
+      IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr,      \
+			      vecsize);				\
+      ip = vecsize;						\
+    }								\
+  }
+
 #else
 
 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
@@ -183,6 +261,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     ito = inum;							\
   }
 
+#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthreads)	\
+  {								\
+    ifrom = 0;							\
+    ito = inum;						        \
+    ip = 1;							\
+  }
+
+#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)	\
+  {								\
+    tid = 0;							\
+    ifrom = 0;							\
+    ito = inum;							\
+    ip = 1;							\
+  }
+
 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                              datasize)                          \
 {                                                               \
@@ -202,14 +295,215 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 				nthreads, vecsize)		\
   {								\
     tid = 0;                            			\
-    int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
-				       /vecsize));         	\
     ifrom = 0;							\
     ito = inum;							\
   }
 
+#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum,	\
+				nthreads, vecsize)		\
+  {								\
+    tid = 0;                            			\
+    ifrom = 0;							\
+    ito = inum;							\
+    ip = vecsize;						\
+  }
+
 #endif
 
+#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,	\
+				  f_stride, pos, ov0, ov1, ov2,		\
+				  ov3, ov4, ov5)			\
+{									\
+  acc_t *f_scalar = &f_start[0].x;					\
+  flt_t *x_scalar = &pos[minlocal].x;					\
+  int f_stride4 = f_stride * 4;						\
+  _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64);				\
+  int vwidth;								\
+  if (sizeof(acc_t) == sizeof(double))					\
+    vwidth = INTEL_COMPILE_WIDTH/2;					\
+  else									\
+    vwidth = INTEL_COMPILE_WIDTH;					\
+  if (vwidth < 4) vwidth = 4;						\
+  _use_simd_pragma("vector aligned")          				\ 
+  _use_simd_pragma("simd")					        \
+  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;			\
+  int remainder = lt % vwidth;						\
+  if (lf > lt) remainder = 0;						\
+  const int v_range = lt - remainder;					\
+  if (nthreads == 2) {							\
+    acc_t *f_scalar2 = f_scalar + f_stride4;				\
+    for (int n = lf; n < v_range; n += vwidth) {			\
+      _use_simd_pragma("vector aligned")				\
+      _use_simd_pragma("simd")					        \
+      for (int v = 0; v < vwidth; v++) {				\
+	f_scalar[n+v] += f_scalar2[n+v];				\
+	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
+      }									\
+      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
+      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
+      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
+      if (vwidth > 4) {							\
+	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
+	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
+	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
+      }									\
+      if (vwidth > 8) {							\
+        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
+        ov3 += f_scalar[n+13] * x_scalar[n+12];				\
+	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
+	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
+	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
+	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
+      }									\
+    }									\
+    _use_simd_pragma("vector aligned")				        \
+    _use_simd_pragma("ivdep")						\
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
+    for (int n = v_range; n < lt; n++)					\
+      f_scalar[n] += f_scalar2[n];					\
+  } else if (nthreads==4) {						\
+    acc_t *f_scalar2 = f_scalar + f_stride4;				\
+    acc_t *f_scalar3 = f_scalar2 + f_stride4;				\
+    acc_t *f_scalar4 = f_scalar3 + f_stride4;				\
+    for (int n = lf; n < v_range; n += vwidth) {			\
+      _use_simd_pragma("vector aligned")				\
+      _use_simd_pragma("simd")						\
+      for (int v = 0; v < vwidth; v++) {				\
+	f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +		\
+	  f_scalar4[n+v];						\
+	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
+      }									\
+      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
+      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
+      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
+      if (vwidth > 4) {							\
+	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
+	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
+	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
+      }									\
+      if (vwidth > 8) {							\
+        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
+	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
+	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
+	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
+	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
+	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
+      }									\
+    }									\
+    _use_simd_pragma("vector aligned")				        \
+    _use_simd_pragma("ivdep")						\
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
+    for (int n = v_range; n < lt; n++)				        \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];	\
+  } else if (nthreads==1) {						\
+    for (int n = lf; n < v_range; n += vwidth) {			\
+      _use_simd_pragma("vector aligned")				\
+      _use_simd_pragma("simd")						\
+      for (int v = 0; v < vwidth; v++) 				        \
+	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
+      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
+      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
+      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
+      if (vwidth > 4) {							\
+        ov3 += f_scalar[n+5] * x_scalar[n+4];				\
+	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
+	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
+      }									\
+      if (vwidth > 8) {							\
+	ov3 += f_scalar[n+9] * x_scalar[n+8];				\
+	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
+	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
+	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
+	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
+	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
+      }									\
+    }									\
+  } else if (nthreads==3) {						\
+    acc_t *f_scalar2 = f_scalar + f_stride4;				\
+    acc_t *f_scalar3 = f_scalar2 + f_stride4;				\
+    for (int n = lf; n < v_range; n += vwidth) {			\
+      _use_simd_pragma("vector aligned")				\
+      _use_simd_pragma("simd")						\
+      for (int v = 0; v < vwidth; v++) {				\
+	f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];		\
+	ovv[v] += f_scalar[n+v] * x_scalar[n+v];			\
+      }									\
+      ov3 += f_scalar[n+1] * x_scalar[n+0];				\
+      ov4 += f_scalar[n+2] * x_scalar[n+0];				\
+      ov5 += f_scalar[n+2] * x_scalar[n+1];				\
+      if (vwidth > 4) {							\
+	ov3 += f_scalar[n+5] * x_scalar[n+4];				\
+	ov4 += f_scalar[n+6] * x_scalar[n+4];				\
+	ov5 += f_scalar[n+6] * x_scalar[n+5];				\
+      }									\
+      if (vwidth > 8) {							\
+        ov3 += f_scalar[n+9] * x_scalar[n+8];				\
+	ov3 += f_scalar[n+13] * x_scalar[n+12];				\
+	ov4 += f_scalar[n+10] * x_scalar[n+8];				\
+	ov4 += f_scalar[n+14] * x_scalar[n+12];				\
+	ov5 += f_scalar[n+10] * x_scalar[n+9];				\
+	ov5 += f_scalar[n+14] * x_scalar[n+13];				\
+      }									\
+    }									\
+    _use_simd_pragma("vector aligned")				        \
+    _use_simd_pragma("ivdep")						\
+    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")	\
+    for (int n = v_range; n < lt; n++)				        \
+      f_scalar[n] += f_scalar2[n] + f_scalar3[n];			\
+  }									\
+  for (int n = v_range; n < lt; n += 4) {				\
+    _use_simd_pragma("vector aligned")				        \
+    _use_simd_pragma("ivdep")						\
+    for (int v = 0; v < 4; v++) 				        \
+      ovv[v] += f_scalar[n+v] * x_scalar[n+v];				\
+    ov3 += f_scalar[n+1] * x_scalar[n+0];				\
+    ov4 += f_scalar[n+2] * x_scalar[n+0];				\
+    ov5 += f_scalar[n+2] * x_scalar[n+1];				\
+  }									\
+  ov0 += ovv[0];							\
+  ov1 += ovv[1];						       	\
+  ov2 += ovv[2];							\
+  if (vwidth > 4) {							\
+    ov0 += ovv[4];							\
+    ov1 += ovv[5];							\
+    ov2 += ovv[6];							\
+  }									\
+  if (vwidth > 8) {							\
+    ov0 += ovv[8] + ovv[12];						\
+    ov1 += ovv[9] + ovv[13];						\
+    ov2 += ovv[10] + ovv[14];						\
+  }									\
+}
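
IP_PRE_fdotr_acc_force_l5 fuses the cross-thread force reduction with the f.r virial over arrays packed as x,y,z,w quadruplets: the aligned ovv lanes collect the diagonal components (each force component times its own coordinate, with the w lanes discarded), while the explicit n+1/n+2 products supply the off-diagonal terms for each quadruplet in the vector. Stated scalar-wise, as a hypothetical free function over the same layout:

    // Scalar reference for the virial part of IP_PRE_fdotr_acc_force_l5.
    // f and x hold natoms packed x,y,z,w quadruplets (w is padding here).
    void fdotr_virial(const double *f, const double *x, int natoms,
                      double v[6]) {
      for (int i = 0; i < natoms; i++) {
        const double fx = f[4*i], fy = f[4*i+1], fz = f[4*i+2];
        const double xx = x[4*i], xy = x[4*i+1], xz = x[4*i+2];
        v[0] += fx * xx;  // xx (diagonal: the ovv[] lanes above)
        v[1] += fy * xy;  // yy
        v[2] += fz * xz;  // zz
        v[3] += fy * xx;  // xy (off-diagonal: the explicit n+1/n+2 products)
        v[4] += fz * xx;  // xz
        v[5] += fz * xy;  // yz
      }
    }
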
+
+#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,	\
+                               f_stride, pos, offload, vflag, ov0, ov1,	\
+                               ov2, ov3, ov4, ov5)			\
+{									\
+  int o_range = (nall - minlocal) * 4;					\
+  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,	\
+			    sizeof(acc_t));				\
+									\
+  acc_t *f_scalar = &f_start[0].x;					\
+  int f_stride4 = f_stride * 4;						\
+  int t;								\
+  if (vflag == 2) t = 4; else t = 1;					\
+  acc_t *f_scalar2 = f_scalar + f_stride4 * t;				\
+  for ( ; t < nthreads; t++) {						\
+    _use_simd_pragma("vector aligned")					\
+    _use_simd_pragma("simd") 					        \
+    for (int n = iifrom; n < iito; n++)				        \
+      f_scalar[n] += f_scalar2[n];					\
+    f_scalar2 += f_stride4;						\
+  }									\
+									\
+  if (vflag == 2) {							\
+    int nt_min = MIN(4,nthreads);					\
+    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,	\
+			      f_stride, pos, ov0, ov1, ov2, ov3, ov4,	\
+			      ov5);					\
+  }									\
+}
+
 #ifdef _LMP_INTEL_OFFLOAD
 #include <sys/time.h>
 
@@ -229,17 +523,19 @@ inline double MIC_Wtime() {
     if (fix->separate_buffers() && ago != 0) {				\
     fix->start_watch(TIME_PACK);					\
     if (offload) {							\
-      _use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)")	\
+      int packthreads;							\
+      if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
+      else packthreads = 1;						\
+      _use_omp_pragma("omp parallel if(packthreads > 1)")		\
       {									\
         int ifrom, ito, tid;						\
-	int nthreads = comm->nthreads;					\
 	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		\
-				nthreads, sizeof(flt_t));		\
+				  packthreads, sizeof(flt_t));		\
 	buffers->thr_pack_cop(ifrom, ito, 0);				\
 	int nghost = nall - nlocal;					\
 	if (nghost) {							\
 	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,	\
-				 nthreads, sizeof(flt_t));		\
+				 packthreads, sizeof(flt_t));		\
 	  buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,		\
 				fix->offload_min_ghost() - nlocal,	\
 				ago == 1);				\
@@ -254,7 +550,7 @@ inline double MIC_Wtime() {
   }									\
 }
 
-#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+#define IP_PRE_get_transfern(ago, newton, eflag, vflag,			\
 			     buffers, offload, fix, separate_flag,	\
 			     x_size, q_size, ev_size, f_stride)		\
 {									\
@@ -276,17 +572,12 @@ inline double MIC_Wtime() {
     q_size = 0;								\
   }									\
   ev_size = 0;								\
-  if (evflag) {								\
-    if (eflag) ev_size = 2;						\
-    if (vflag) ev_size = 8;						\
-  }									\
-  int f_length;								\
+  if (eflag) ev_size = 2;						\
+  if (vflag) ev_size = 8;						\
   if (newton)								\
-    f_length = nall;							\
+    f_stride = buffers->get_stride(nall);				\
   else									\
-    f_length = nlocal;							\
-  f_length -= minlocal;							\
-  f_stride = buffers->get_stride(f_length);				\
+    f_stride = buffers->get_stride(inum);				\
 }
 
 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
@@ -337,6 +628,20 @@ inline double MIC_Wtime() {
   }									\
 }
 
+#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,	\
+				f_start, f_stride, x, offload, vflag,	\
+				ov0, ov1, ov2, ov3, ov4, ov5)		\
+{								        \
+  if (newton) {								\
+    _use_omp_pragma("omp barrier");					\
+    IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,		\
+			   f_stride, x, offload, vflag, ov0, ov1, ov2,	\
+			   ov3, ov4, ov5);				\
+  }									\
+}
+
+#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,	\
+			    ov0, ov1, ov2, ov3, ov4, ov5)
 
 #else
 
@@ -344,7 +649,7 @@ inline double MIC_Wtime() {
 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                      nlocal, nall)
 
-#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+#define IP_PRE_get_transfern(ago, newton, eflag, vflag,			\
 			     buffers, offload, fix, separate_flag,	\
 			     x_size, q_size, ev_size, f_stride)		\
 {                                                                       \
@@ -369,18 +674,54 @@ inline double MIC_Wtime() {
 #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
 				  f_stride, x, q)
 
+#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,	\
+				f_start, f_stride, x, offload, vflag,	\
+				ov0, ov1, ov2, ov3, ov4, ov5)		\
+{								        \
+  if (newton) {								\
+    if (vflag == 2 && nthreads > INTEL_HTHREADS) {			\
+      _use_omp_pragma("omp barrier");					\
+      buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2,	\
+			    ov3, ov4, ov5);				\
+    }									\
+  }									\
+}
+
+#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,	\
+			    ov0, ov1, ov2, ov3, ov4, ov5)		\
+{								        \
+  if (newton) {								\
+    if (vflag == 2 && nthreads <= INTEL_HTHREADS) {			\
+      int lt = nall * 4;						\
+      buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1,	\
+			       ov2, ov3, ov4, ov5);			\
+    }									\
+  }									\
+}
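
Host-side, the two macros partition one job by thread count: IP_PRE_fdotr_reduce_omp does the reduction inside the parallel region (after a barrier) when more than INTEL_HTHREADS threads are running, while IP_PRE_fdotr_reduce picks up the small-thread-count case serially afterwards, where the vectorized l5 kernel alone beats a fork. A sketch of the expected call pattern in a pair style, with hypothetical surrounding variables and newton on:

    // Hypothetical call site: exactly one of the two macros performs the
    // reduction, selected by the nthreads vs. INTEL_HTHREADS comparison.
    #if defined(_OPENMP)
    #pragma omp parallel
    #endif
    {
      // ... per-thread force kernel ...
      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
                              f_stride, x, 0 /*offload*/, vflag,
                              ov0, ov1, ov2, ov3, ov4, ov5);
    }
    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
                        ov0, ov1, ov2, ov3, ov4, ov5);
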
 
 #endif
 
-#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz)    \
+#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz)		\
 {                                                                       \
   if (vflag == 1) {                                                     \
-    sv0 += ev_pre * delx * delx * fpair;                                \
-    sv1 += ev_pre * dely * dely * fpair;                                \
-    sv2 += ev_pre * delz * delz * fpair;                                \
-    sv3 += ev_pre * delx * dely * fpair;                                \
-    sv4 += ev_pre * delx * delz * fpair;                                \
-    sv5 += ev_pre * dely * delz * fpair;                                \
+    sv0 += delx * delx * fpair;						\
+    sv1 += dely * dely * fpair;						\
+    sv2 += delz * delz * fpair;						\
+    sv3 += delx * dely * fpair;						\
+    sv4 += delx * delz * fpair;						\
+    sv5 += dely * delz * fpair;						\
+  }                                                                     \
+}
+
+#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz)		\
+{                                                                       \
+  if (vflag == 1) {                                                     \
+    sv0 += dx * fpx;							\
+    sv1 += dy * fpy;							\
+    sv2 += dz * fpz;							\
+    sv3 += dx * fpy;							\
+    sv4 += dx * fpz;							\
+    sv5 += dy * fpz;							\
   }                                                                     \
 }
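
IP_PRE_ev_tally_nborv generalizes IP_PRE_ev_tally_nbor to forces that are not collinear with the separation vector (as in anisotropic styles such as Gay-Berne): the virial becomes the outer product of the displacement with the per-component force instead of fpair times products of displacement components. Both tallies in scalar form:

    // Scalar form of the two tallies above: isotropic (force = fpair * r)
    // versus per-component force for anisotropic pair styles.
    void tally_nbor(double v[6], double dx, double dy, double dz,
                    double fpair) {
      v[0] += dx * dx * fpair; v[1] += dy * dy * fpair;
      v[2] += dz * dz * fpair; v[3] += dx * dy * fpair;
      v[4] += dx * dz * fpair; v[5] += dy * dz * fpair;
    }
    void tally_nborv(double v[6], double dx, double dy, double dz,
                     double fpx, double fpy, double fpz) {
      v[0] += dx * fpx; v[1] += dy * fpy; v[2] += dz * fpz;
      v[3] += dx * fpy; v[4] += dx * fpz; v[5] += dy * fpz;
    }
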
 
@@ -408,9 +749,10 @@ inline double MIC_Wtime() {
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond,	\
-			     delx, dely, delz, obond, force, newton,	\
-			     nlocal, ov0, ov1, ov2, ov3, ov4, ov5)	\
+#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
+			     fbond, delx, dely, delz, obond, force,	\
+			     newton, nlocal, ov0, ov1, ov2, ov3, ov4,	\
+			     ov5)					\
 {                                                                       \
   flt_t ev_pre;								\
   if (newton) ev_pre = (flt_t)1.0;					\
@@ -421,7 +763,7 @@ inline double MIC_Wtime() {
   }									\
 									\
   if (eflag) {								\
-    oebond += ev_pre * ebond;						\
+    obond += ev_pre * ebond;						\
     if (eatom) {							\
       flt_t halfeng = ebond * (flt_t)0.5;				\
       if (newton || i1 < nlocal) f[i1].w += halfeng;			\
@@ -429,7 +771,7 @@ inline double MIC_Wtime() {
     }									\
   }									\
 									\
-  if (vflag) {								\
+  if (VFLAG && vflag) {							\
     ov0 += ev_pre * (delx * delx * fbond);				\
     ov1 += ev_pre * (dely * dely * fbond);				\
     ov2 += ev_pre * (delz * delz * fbond);				\
@@ -439,9 +781,9 @@ inline double MIC_Wtime() {
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, 	\
-			      f1x, f1y, f1z, f3x, f3y, f3z, delx1,	\
-			      dely1, delz1, delx2, dely2, delz2,	\
+#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1,   \
+			      i2, i3, f1x, f1y, f1z, f3x, f3y, f3z,	\
+			      delx1, dely1, delz1, delx2, dely2, delz2,	\
 			      oeangle, force, newton, nlocal, ov0, ov1, \
 			      ov2, ov3, ov4, ov5)			\
 {                                                                       \
@@ -464,20 +806,20 @@ inline double MIC_Wtime() {
     }									\
   }									\
 									\
-  if (vflag) {								\
-    ov0 += ev_pre * (delx1 * f1x + delx2 * f3x);			\
-    ov1 += ev_pre * (dely1 * f1y + dely2 * f3y);			\
-    ov2 += ev_pre * (delz1 * f1z + delz2 * f3z);			\
-    ov3 += ev_pre * (delx1 * f1y + delx2 * f3y);			\
-    ov4 += ev_pre * (delx1 * f1z + delx2 * f3z);			\
-    ov5 += ev_pre * (dely1 * f1z + dely2 * f3z);			\
+  if (VFLAG && vflag) {							\
+    ov0 += ev_pre * (delx1 * f1x + delx2 * f3x);                        \
+    ov1 += ev_pre * (dely1 * f1y + dely2 * f3y);                        \
+    ov2 += ev_pre * (delz1 * f1z + delz2 * f3z);                        \
+    ov3 += ev_pre * (delx1 * f1y + delx2 * f3y);                        \
+    ov4 += ev_pre * (delx1 * f1z + delx2 * f3z);                        \
+    ov5 += ev_pre * (dely1 * f1z + dely2 * f3z);                        \
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
-			      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,	\
-			      f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,	\
-			      vb3x, vb3y, vb3z,oedihedral, force,	\
+#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
+			      i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
+			      f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y,	\
+			      vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
 			      newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
 			      ov5)					\
 {                                                                       \
@@ -502,7 +844,7 @@ inline double MIC_Wtime() {
     }									\
   }									\
 									\
-  if (vflag) {								\
+  if (VFLAG && vflag) {							\
     ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
     ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
     ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
@@ -512,96 +854,36 @@ inline double MIC_Wtime() {
   }                                                                     \
 }
 
-#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
+#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp)    	\
 {									\
-  if (evflag) {								\
-    if (eflag) {							\
-      f[i].w += fwtmp;							\
-      oevdwl += sevdwl;							\
-    }									\
-    if (vflag == 1) {							\
-      ov0 += sv0;							\
-      ov1 += sv1;							\
-      ov2 += sv2;							\
-      ov3 += sv3;							\
-      ov4 += sv4;							\
-      ov5 += sv5;							\
-    }									\
+  if (eflag) {								\
+    f[i].w += fwtmp;							\
+    oevdwl += sevdwl;							\
   }									\
-}
-
-#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp)    	\
-{									\
-  if (evflag) {								\
-    if (eflag) {							\
-      f[i].w += fwtmp;							\
-      oevdwl += sevdwl;							\
-      oecoul += secoul;							\
-    }									\
-    if (vflag == 1) {							\
-      ov0 += sv0;							\
-      ov1 += sv1;							\
-      ov2 += sv2;							\
-      ov3 += sv3;							\
-      ov4 += sv4;							\
-      ov5 += sv5;							\
-    }									\
+  if (newton == 0 && vflag == 1) {					\
+    ov0 += sv0;								\
+    ov1 += sv1;								\
+    ov2 += sv2;								\
+    ov3 += sv3;								\
+    ov4 += sv4;								\
+    ov5 += sv5;								\
   }									\
 }
 
-#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom,	\
-			       nall, nlocal, minlocal, nthreads,	\
-			       f_start, f_stride, x, offload)		\
+#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp)    	\
 {									\
-  int o_range;								\
-  if (newton)								\
-    o_range = nall;							\
-  else									\
-    o_range = nlocal;							\
-  if (offload == 0) o_range -= minlocal;				\
-    IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,	\
-			 sizeof(acc_t));				\
-									\
-  int t_off = f_stride;						        \
-  if (eflag && eatom) {							\
-    for (int t = 1; t < nthreads; t++) {				\
-      _use_simd_pragma("vector nontemporal")				\
-      _use_simd_pragma("novector")					\
-      for (int n = iifrom; n < iito; n++) {				\
-        f_start[n].x += f_start[n + t_off].x;				\
-        f_start[n].y += f_start[n + t_off].y;				\
-	f_start[n].z += f_start[n + t_off].z;				\
-	f_start[n].w += f_start[n + t_off].w;				\
-      }									\
-      t_off += f_stride;						\
-    }									\
-  } else {								\
-    for (int t = 1; t < nthreads; t++) {				\
-      _use_simd_pragma("vector nontemporal")  				\
-      _use_simd_pragma("novector")					\
-      for (int n = iifrom; n < iito; n++) {                             \
-	f_start[n].x += f_start[n + t_off].x;                  	        \
-        f_start[n].y += f_start[n + t_off].y;				\
-        f_start[n].z += f_start[n + t_off].z;				\
-      }									\
-      t_off += f_stride;						\
-    }									\
+  if (eflag) {								\
+    f[i].w += fwtmp;							\
+    oevdwl += sevdwl;							\
+    oecoul += secoul;							\
   }									\
-									\
-  if (evflag) {								\
-    if (vflag == 2) {							\
-      const ATOM_T * _noalias const xo = x + minlocal;			\
-      _use_simd_pragma("vector nontemporal")   				\
-      _use_simd_pragma("novector")					\
-      for (int n = iifrom; n < iito; n++) {				\
-	ov0 += f_start[n].x * xo[n].x;					\
-	ov1 += f_start[n].y * xo[n].y;					\
-	ov2 += f_start[n].z * xo[n].z;					\
-	ov3 += f_start[n].y * xo[n].x;					\
-	ov4 += f_start[n].z * xo[n].x;					\
-	ov5 += f_start[n].z * xo[n].y;					\
-      }									\
-    }									\
+  if (newton == 0 && vflag == 1) {					\
+    ov0 += sv0;								\
+    ov1 += sv1;								\
+    ov2 += sv2;								\
+    ov3 += sv3;								\
+    ov4 += sv4;								\
+    ov5 += sv5;								\
   }									\
 }
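
The rewritten per-atom tallies drop the outer evflag test and gate the virial on newton: with newton on, the USER-INTEL styles obtain the virial from the f.r reduction above, so the per-pair partials (sv0..sv5) are folded into the thread totals only when newton is off and per-pair tallying (vflag == 1) was requested. Restated as a hypothetical scalar helper:

    // Hypothetical scalar restatement of the gating in the tallies above.
    inline void tally_virial(int newton, int vflag,
                             double *ov, const double *sv) {
      if (newton == 0 && vflag == 1)
        for (int k = 0; k < 6; k++) ov[k] += sv[k];
    }
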
 
diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h
index ac13f1edfd..aa03a6f136 100644
--- a/src/USER-INTEL/intel_simd.h
+++ b/src/USER-INTEL/intel_simd.h
@@ -1778,7 +1778,7 @@ namespace ip_simd {
   inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
 				 const SIMD_int &i, const SIMD_float &fx,
 				 const SIMD_float &fy, const SIMD_float &fz,
-				 const int EVFLAG, const int eatom,
+				 const int EFLAG, const int eatom,
 				 const SIMD_float &fwtmp) {
     SIMD_float jfrc;
     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
@@ -1793,7 +1793,7 @@ namespace ip_simd {
 				    _MM_SCALE_1);
     jfrc = jfrc + fz;
     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
-    if (EVFLAG) {
+    if (EFLAG) {
       if (eatom) {
 	jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
 					_MM_SCALE_1);
@@ -1806,7 +1806,7 @@ namespace ip_simd {
   inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
 				 const SIMD_int &i, const SIMD_double &fx,
 				 const SIMD_double &fy, const SIMD_double &fz,
-				 const int EVFLAG, const int eatom,
+				 const int EFLAG, const int eatom,
 				 const SIMD_double &fwtmp) {
     SIMD_double jfrc;
     jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
@@ -1821,7 +1821,7 @@ namespace ip_simd {
 				      _MM_SCALE_2);
     jfrc = jfrc + fz;
     _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
-    if (EVFLAG) {
+    if (EFLAG) {
       if (eatom) {
 	jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, 
 					  force + 3, _MM_SCALE_2);
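
The renamed EFLAG parameter narrows when the packed w component (per-atom energy) is gathered and scattered; the force update itself is the usual masked gather-add-scatter through an index vector. A reduced standalone form for floats, assuming idx already holds byte offsets (hence the _MM_SCALE_1 addressing used above):

    #include <immintrin.h>

    // Masked gather-add-scatter as in SIMD_iforce_update: for each active
    // lane l, force[idx[l]] += fx[l]; idx is assumed to hold byte offsets.
    void masked_force_add(float *force, __m512i idx, __m512 fx, __mmask16 m) {
      __m512 jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, idx,
                                             force, _MM_SCALE_1);
      jfrc = _mm512_add_ps(jfrc, fx);
      _mm512_mask_i32scatter_ps(force, m, idx, jfrc, _MM_SCALE_1);
    }
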
diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp
index c3335b2c26..bff3d53636 100644
--- a/src/USER-INTEL/nbin_intel.cpp
+++ b/src/USER-INTEL/nbin_intel.cpp
@@ -71,7 +71,7 @@ void NBinIntel::bin_atoms_setup(int nall)
     if (_offload_alloc) {
       const int * binhead = this->binhead;
       #pragma offload_transfer target(mic:_cop)	\
-        nocopy(binhead:alloc_if(0) free_if(1))
+	nocopy(binhead:alloc_if(0) free_if(1))
     }
     #endif
 
@@ -99,7 +99,7 @@ void NBinIntel::bin_atoms_setup(int nall)
       const int * _atombin = this->_atombin;
       const int * _binpacked = this->_binpacked;
       #pragma offload_transfer target(mic:_cop)	\
-        nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
+	nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1))
     }
     #endif
     memory->destroy(bins);
@@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
     const flt_t dx = (INTEL_BIGP - bboxhi[0]);
     const flt_t dy = (INTEL_BIGP - bboxhi[1]);
     const flt_t dz = (INTEL_BIGP - bboxhi[2]);
     if (dx * dx + dy * dy + dz * dz <
-        static_cast<flt_t>(neighbor->cutneighmaxsq))
+	static_cast<flt_t>(neighbor->cutneighmaxsq))
       error->one(FLERR,
-        "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
   }
 
   // ---------- Grow and cast/pack buffers -------------
@@ -174,14 +174,16 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
   biga.w = 1;
   buffers->get_x()[nall] = biga;
 
-  const int nthreads = comm->nthreads;
+  int nthreads;
+  if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads;
+  else nthreads = 1;
   #if defined(_OPENMP)
-  #pragma omp parallel default(none) shared(buffers)
+  #pragma omp parallel if(nthreads > INTEL_HTHREADS)
   #endif
   {
     int ifrom, ito, tid;
     IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
-                              sizeof(ATOM_T));
+			      sizeof(ATOM_T));
     buffers->thr_pack(ifrom, ito, 0);
   }
   _fix->stop_watch(TIME_PACK);
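
Packing the coordinate buffer is bandwidth-bound, so binning now forks threads only when more than INTEL_HTHREADS are available; below that the OpenMP if clause keeps the region serial and skips the fork/join overhead (the same packthreads pattern appears in IP_PRE_pack_separate_buffers above). A standalone sketch with a hypothetical pack loop:

    #include <omp.h>
    #include <cstring>

    // Conditional-parallel pack: serialize when too few threads are
    // available for the fork/join cost to pay off (hthreads stands in
    // for INTEL_HTHREADS).
    void pack_all(float *dst, const float *src, int nall,
                  int avail_threads, int hthreads) {
      const int nthreads = (avail_threads > hthreads) ? avail_threads : 1;
      #pragma omp parallel num_threads(nthreads) if(nthreads > 1)
      {
        const int tid = omp_get_thread_num();
        const int chunk = (nall + nthreads - 1) / nthreads;
        const int from = tid * chunk;
        const int to = (from + chunk > nall) ? nall : from + chunk;
        if (from < to)
          std::memcpy(dst + from, src + from, (to - from) * sizeof(float));
      }
    }
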
diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp
index 7e0d2abdcb..ae4f599176 100644
--- a/src/USER-INTEL/npair_full_bin_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_intel.cpp
@@ -70,483 +70,62 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   #endif
 
   buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
-                     _fix->nbor_pack_width());
+		     _fix->nbor_pack_width());
 
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
+			 neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
-  if (need_ic) {
-    if (offload_noghost) {
-      fbi<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
-      fbi<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
+  if (_fix->three_body_neighbor()) {
+    if (need_ic) {
+      if (offload_noghost) {
+	bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+	bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
+      }
     } else {
-      fbi<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
-      fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+      if (offload_noghost) {
+	bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+	bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
+      }
     }
   } else {
-    if (offload_noghost) {
-      fbi<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
-      fbi<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
+    if (need_ic) {
+      if (offload_noghost) {
+	bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+	bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
+      }
     } else {
-      fbi<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
-      fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+      if (offload_noghost) {
+	bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end);
+      } else {
+	bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end);
+	bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
+      }
     }
   }
   #else
-  if (need_ic)
-    fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
-  else
-    fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
-
-template <class flt_t, class acc_t, int offload_noghost, int need_ic>
-void NPairFullBinIntel::
-fbi(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
-    const int astart, const int aend, const int offload_end) {
-
-  if (aend-astart == 0) return;
-
-  const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
-  int nall_t = nall;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload_noghost && offload) nall_t = atom->nlocal;
-  #endif
-
-  const int pack_width = _fix->nbor_pack_width();
-  const int pad_width = pad;
-
-  const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
-  const int e_nall = nall_t;
-
-  const int molecular = atom->molecular;
-  int *ns = NULL;
-  tagint *s = NULL;
-  int tag_size = 0, special_size;
-  if (buffers->need_tag()) tag_size = e_nall;
-  if (molecular) {
-    s = atom->special[0];
-    ns = atom->nspecial[0];
-    special_size = aend;
-  } else {
-    s = &buffers->_special_holder;
-    ns = &buffers->_nspecial_holder;
-    special_size = 0;
-  }
-  const tagint * _noalias const special = s;
-  const int * _noalias const nspecial = ns;
-  const int maxspecial = atom->maxspecial;
-  const tagint * _noalias const tag = atom->tag;
-
-  int * _noalias const ilist = list->ilist;
-  int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int nstencil = this->nstencil;
-  const int * _noalias const stencil = this->stencil;
-  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
-  const int ntypes = atom->ntypes + 1;
-  const int nlocal = atom->nlocal;
-
-  #ifndef _LMP_INTEL_OFFLOAD
-  int * const mask = atom->mask;
-  tagint * const molecule = atom->molecule;
-  #endif
-
-  int tnum;
-  int *overflow;
-  double *timer_compute;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    timer_compute = _fix->off_watch_neighbor();
-    tnum = buffers->get_off_threads();
-    overflow = _fix->get_off_overflow_flag();
-    _fix->stop_watch(TIME_HOST_NEIGHBOR);
-    _fix->start_watch(TIME_OFFLOAD_LATENCY);
-  } else
-  #endif
-  {
-    tnum = comm->nthreads;
-    overflow = _fix->get_overflow_flag();
-  }
-  const int nthreads = tnum;
-  const int maxnbors = buffers->get_max_nbors();
-  int * _noalias const atombin = buffers->get_atombin();
-  const int * _noalias const binpacked = buffers->get_binpacked();
-
-  const int xperiodic = domain->xperiodic;
-  const int yperiodic = domain->yperiodic;
-  const int zperiodic = domain->zperiodic;
-  const flt_t xprd_half = domain->xprd_half;
-  const flt_t yprd_half = domain->yprd_half;
-  const flt_t zprd_half = domain->zprd_half;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  const int * _noalias const binhead = this->binhead;
-  const int * _noalias const bins = this->bins;
-  const int cop = _fix->coprocessor_number();
-  const int separate_buffers = _fix->separate_buffers();
-  #pragma offload target(mic:cop) if(offload) \
-    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
-    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
-    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
-    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
-    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
-    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
-    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
-    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
-    out(numneigh:length(0) alloc_if(0) free_if(0)) \
-    in(ilist:length(0) alloc_if(0) free_if(0)) \
-    in(atombin:length(aend) alloc_if(0) free_if(0)) \
-    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width)	\
-    in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
-    in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
-    out(overflow:length(5) alloc_if(0) free_if(0)) \
-    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(tag)
-  #endif
-  {
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime();
-    #endif
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    overflow[LMP_LOCAL_MIN] = astart;
-    overflow[LMP_LOCAL_MAX] = aend - 1;
-    overflow[LMP_GHOST_MIN] = e_nall;
-    overflow[LMP_GHOST_MAX] = -1;
-    #endif
-
-    int nstencilp = 0;
-    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
-    for (int k = 0; k < nstencil; k++) {
-      binstart[nstencilp] = stencil[k];
-      int end = stencil[k] + 1;
-      for (int kk = k + 1; kk < nstencil; kk++) {
-        if (stencil[kk-1]+1 == stencil[kk]) {
-          end++;
-          k++;
-        } else break;
-      }
-      binend[nstencilp] = end;
-      nstencilp++;
-    }
-
-    #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(numneigh, overflow, nstencilp, binstart, binend)
-    #endif
-    {
-      #ifdef _LMP_INTEL_OFFLOAD
-      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
-      #endif
-
-      const int num = aend - astart;
-      int tid, ifrom, ito;
-
-      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
-      ifrom += astart;
-      ito += astart;
-      int e_ito = ito;
-      if (ito == num) {
-        int imod = ito % pack_width;
-        if (imod) e_ito += pack_width - imod;
-      }
-      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
-      int which;
-      int pack_offset = maxnbors * pack_width;
-      int ct = (ifrom + tid * 2) * maxnbors;
-      int *neighptr = firstneigh + ct;
-      const int obound = pack_offset + maxnbors * 2;
-
-      int max_chunk = 0;
-      int lane = 0;
-      for (int i = ifrom; i < ito; i++) {
-        const flt_t xtmp = x[i].x;
-        const flt_t ytmp = x[i].y;
-        const flt_t ztmp = x[i].z;
-        const int itype = x[i].w;
-        const tagint itag = tag[i];
-        const int ioffset = ntypes * itype;
-
-        const int ibin = atombin[i];
-        int raw_count = pack_offset;
-
-        // loop over all atoms in surrounding bins in stencil including self
-        // skip i = j
-        if (exclude) {
-          for (int k = 0; k < nstencilp; k++) {
-            const int bstart = binhead[ibin + binstart[k]];
-            const int bend = binhead[ibin + binend[k]];
-            #ifndef _LMP_INTEL_OFFLOAD
-            #ifdef INTEL_VMASK
-            #pragma simd
-            #endif
-            #endif
-            for (int jj = bstart; jj < bend; jj++) {
-              int j = binpacked[jj];
-
-              if (i == j) j=e_nall;
-
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (offload_noghost) {
-                if (j < nlocal) {
-                  if (i < offload_end) continue;
-                } else if (offload) continue;
-              }
-              #endif
-
-              #ifndef _LMP_INTEL_OFFLOAD
-              const int jtype = x[j].w;
-              if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
-              #endif
-
-              neighptr[raw_count++] = j;
-            }
-          }
-        } else {
-          for (int k = 0; k < nstencilp; k++) {
-            const int bstart = binhead[ibin + binstart[k]];
-            const int bend = binhead[ibin + binend[k]];
-            #ifndef _LMP_INTEL_OFFLOAD
-            #ifdef INTEL_VMASK
-            #pragma simd
-            #endif
-            #endif
-            for (int jj = bstart; jj < bend; jj++) {
-              int j = binpacked[jj];
-
-              if (i == j) j=e_nall;
-
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (offload_noghost) {
-                if (j < nlocal) {
-                  if (i < offload_end) continue;
-                } else if (offload) continue;
-              }
-              #endif
-
-              neighptr[raw_count++] = j;
-            }
-          }
-        }
-
-        if (raw_count > obound) *overflow = 1;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #ifdef _LMP_INTEL_OFFLOAD
-        int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
-        #if __INTEL_COMPILER+0 > 1499
-        #pragma vector aligned
-        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
-        #endif
-        #else
-        #pragma vector aligned
-        #pragma simd
-        #endif
-        #endif
-        for (int u = pack_offset; u < raw_count; u++) {
-          int j = neighptr[u];
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutneighsq[ioffset + jtype])
-            neighptr[u] = e_nall;
-          else {
-            if (need_ic) {
-              int no_special;
-              ominimum_image_check(no_special, delx, dely, delz);
-              if (no_special)
-                neighptr[u] = -j - 1;
-            }
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j < nlocal) {
-              if (j < vlmin) vlmin = j;
-              if (j > vlmax) vlmax = j;
-            } else {
-              if (j < vgmin) vgmin = j;
-              if (j > vgmax) vgmax = j;
-            }
-            #endif
-          }
-        }
-        #ifdef _LMP_INTEL_OFFLOAD
-        lmin = MIN(lmin,vlmin);
-        gmin = MIN(gmin,vgmin);
-        lmax = MAX(lmax,vlmax);
-        gmax = MAX(gmax,vgmax);
-        #endif
-
-        int n = lane, n2 = pack_offset;
-        for (int u = pack_offset; u < raw_count; u++) {
-          const int j = neighptr[u];
-          int pj = j;
-          if (pj < e_nall) {
-            if (need_ic)
-              if (pj < 0) pj = -pj - 1;
-
-            const int jtag = tag[pj];
-            int flist = 0;
-            if (itag > jtag) {
-              if ((itag+jtag) % 2 == 0) flist = 1;
-            } else if (itag < jtag) {
-              if ((itag+jtag) % 2 == 1) flist = 1;
-            } else {
-              if (x[pj].z < ztmp) flist = 1;
-              else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
-              else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp)
-              flist = 1;
-            }
-            if (flist) {
-              neighptr[n2++] = j;
-            } else {
-              neighptr[n] = j;
-              n += pack_width;
-            }
-          }
-        }
-        int ns = (n - lane) / pack_width;
-        atombin[i] = ns;
-        for (int u = pack_offset; u < n2; u++) {
-          neighptr[n] = neighptr[u];
-          n += pack_width;
-        }
-
-        ilist[i] = i;
-        cnumneigh[i] = ct + lane;
-        ns += n2 - pack_offset;
-        numneigh[i] = ns;
-
-        if (ns > max_chunk) max_chunk = ns;
-        lane++;
-        if (lane == pack_width) {
-          ct += max_chunk * pack_width;
-          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          const int edge = (ct % alignb);
-          if (edge) ct += alignb - edge;
-          neighptr = firstneigh + ct;
-          max_chunk = 0;
-          pack_offset = maxnbors * pack_width;
-          lane = 0;
-          if (ct + obound > list_size) {
-              if (i < ito - 1) {
-              *overflow = 1;
-              ct = (ifrom + tid * 2) * maxnbors;
-            }
-          }
-        }
-      }
-
-      if (*overflow == 1)
-        for (int i = ifrom; i < ito; i++)
-          numneigh[i] = 0;
-
-      #ifdef _LMP_INTEL_OFFLOAD
-      if (separate_buffers) {
-        #if defined(_OPENMP)
-        #pragma omp critical
-        #endif
-        {
-          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
-          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
-          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
-          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
-        }
-        #pragma omp barrier
-      }
-
-      int ghost_offset = 0, nall_offset = e_nall;
-      if (separate_buffers) {
-        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
-        if (nghost < 0) nghost = 0;
-        if (offload) {
-          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-        } else {
-          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-          nall_offset = nlocal + nghost;
-        }
-      }
-      #endif
-
-      if (molecular) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-
-          const int trip = jnum * pack_width;
-          for (int jj = 0; jj < trip; jj+=pack_width) {
-            const int j = jlist[jj];
-            if (need_ic && j < 0) {
-              which = 0;
-              jlist[jj] = -j - 1;
-            } else
-              ofind_special(which, special, nspecial, i, tag[j]);
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j >= nlocal) {
-              if (j == e_nall)
-                jlist[jj] = nall_offset;
-              else if (which)
-                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-              else jlist[jj]-=ghost_offset;
-            } else
-            #endif
-            if (which) jlist[jj] = j ^ (which << SBBITS);
-          }
-        }
-      }
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          int jj = 0;
-          for (jj = 0; jj < jnum; jj++) {
-            if (jlist[jj] >= nlocal) {
-              if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-              else jlist[jj] -= ghost_offset;
-            }
-          }
-        }
-      }
-      #endif
-    } // end omp
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime() - *timer_compute;
-    #endif
-  } // end offload
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
-    _fix->start_watch(TIME_HOST_NEIGHBOR);
-    for (int n = 0; n < aend; n++) {
-      ilist[n] = n;
-      numneigh[n] = 0;
-    }
+  if (_fix->three_body_neighbor()) {
+    if (need_ic)
+      bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal);
+    else
+      bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal);
   } else {
-    for (int i = astart; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
-    if (separate_buffers) {
-      _fix->start_watch(TIME_PACK);
-      _fix->set_neighbor_host_sizes();
-      buffers->pack_sep_from_single(_fix->host_min_local(),
-                                    _fix->host_used_local(),
-                                    _fix->host_min_ghost(),
-                                    _fix->host_used_ghost());
-      _fix->stop_watch(TIME_PACK);
-    }
+    if (need_ic)
+      bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal);
+    else
+      bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal);
   }
-  #else
-  for (int i = astart; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
   #endif
 }
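
The hunk above replaces the old two-stage fbi dispatch with calls to the shared NPairIntel::bin_newton template; its five trailing integer parameters (offload_noghost, need_ic, FULL, TRI, THREE) pick the list flavor at compile time, so untaken branches cost nothing at run time. A minimal, self-contained sketch of this dispatch pattern (build_variant and three_body are illustrative names, not patch code):

// Sketch only: the compile-time dispatch pattern behind
// bin_newton<flt_t,acc_t,offload_noghost,need_ic,FULL,TRI,THREE>.
#include <cstdio>

template <int FULL, int TRI, int THREE>
void build_variant(int nlocal) {
  // FULL/TRI/THREE are template constants, so the compiler drops the
  // untaken branches entirely -- one specialized body per list flavor.
  if (FULL)
    std::printf("full list (both copies of each pair), %d atoms\n", nlocal);
  else if (TRI)
    std::printf("half list with triclinic ordering, %d atoms\n", nlocal);
  else
    std::printf("half list, orthogonal box, %d atoms\n", nlocal);
  if (THREE)
    std::printf("packing extended lists for three-body styles\n");
}

int main() {
  const bool three_body = true;  // stands in for _fix->three_body_neighbor()
  if (three_body) build_variant<0, 0, 1>(1000);
  else            build_variant<0, 0, 0>(1000);
}
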
diff --git a/src/USER-INTEL/npair_full_bin_intel.h b/src/USER-INTEL/npair_full_bin_intel.h
index f1be71abbc..83f2c3cd4c 100644
--- a/src/USER-INTEL/npair_full_bin_intel.h
+++ b/src/USER-INTEL/npair_full_bin_intel.h
@@ -15,7 +15,7 @@
 
 NPairStyle(full/bin/intel,
            NPairFullBinIntel,
-           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI |
+           NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | 
            NP_INTEL)
 #else
 
@@ -36,9 +36,6 @@ class NPairFullBinIntel : public NPairIntel {
  private:
   template <class flt_t, class acc_t>
   void fbi(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int, int>
-  void fbi(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-           const int, const int offload_end = 0);
 };
 
 }
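
The style header keeps advertising both NP_NEWTON and NP_NEWTOFF (and both NP_ORTHO and NP_TRI), so the single full/bin/intel builder stays eligible for any of those neighbor requests. A hedged sketch of the bitmask idea behind such flag matching; it assumes the simple rule "a style is eligible if it advertises every requested bit", which approximates but does not reproduce the actual LAMMPS selection logic:

#include <cstdio>

enum : unsigned {
  NP_FULL   = 1u << 0, NP_HALF    = 1u << 1, NP_BIN   = 1u << 2,
  NP_NEWTON = 1u << 3, NP_NEWTOFF = 1u << 4,
  NP_ORTHO  = 1u << 5, NP_TRI     = 1u << 6, NP_INTEL = 1u << 7
};

// what full/bin/intel advertises after this patch
constexpr unsigned FULL_BIN_INTEL =
  NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL;

bool eligible(unsigned style_mask, unsigned request) {
  return (style_mask & request) == request;
}

int main() {
  const unsigned req = NP_FULL | NP_BIN | NP_NEWTOFF | NP_TRI | NP_INTEL;
  std::printf("eligible: %d\n", eligible(FULL_BIN_INTEL, req));  // prints 1
}
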
diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp b/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp
deleted file mode 100644
index 9a40e2a07c..0000000000
--- a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp
+++ /dev/null
@@ -1,451 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: W. Michael Brown (Intel)
-------------------------------------------------------------------------- */
-
-#include "npair_half_bin_newtoff_intel.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "atom.h"
-#include "comm.h"
-#include "group.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalfBinNewtoffIntel::NPairHalfBinNewtoffIntel(LAMMPS *lmp) :
-  NPairIntel(lmp) {}
-
-/* ----------------------------------------------------------------------
-   binned neighbor list construction with partial Newton's 3rd law
-   each owned atom i checks own bin and other bins in stencil
-   pair stored once if i,j are both owned and i < j
-   pair stored by me if j is ghost (also stored by proc owning j)
-------------------------------------------------------------------------- */
-
-void NPairHalfBinNewtoffIntel::build(NeighList *list)
-{
-  if (nstencil > INTEL_MAX_STENCIL_CHECK)
-    error->all(FLERR, "Too many neighbor bins for USER-INTEL package.");
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (exclude)
-    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
-  #endif
-
-  if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
-    hbnni(list, _fix->get_mixed_buffers());
-  else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-    hbnni(list, _fix->get_double_buffers());
-  else
-    hbnni(list, _fix->get_single_buffers());
-
-  _fix->stop_watch(TIME_HOST_NEIGHBOR);
-}
-
-template <class flt_t, class acc_t>
-void NPairHalfBinNewtoffIntel::
-hbnni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
-  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
-  list->inum = nlocal;
-
-  const int off_end = _fix->offload_end_neighbor();
-  int host_start = off_end;;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (off_end) grow_stencil();
-  if (_fix->full_host_list()) host_start = 0;
-  #endif
-
-  buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
-
-  int need_ic = 0;
-  if (atom->molecular)
-    dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (need_ic) {
-    hbnni<flt_t,acc_t,1>(1, list, buffers, 0, off_end);
-    hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
-  } else {
-    hbnni<flt_t,acc_t,0>(1, list, buffers, 0, off_end);
-    hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
-  }
-  #else
-  if (need_ic)
-    hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal);
-  else
-    hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
-
-template <class flt_t, class acc_t, int need_ic>
-void NPairHalfBinNewtoffIntel::
-hbnni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
-      const int astart, const int aend) {
-
-  if (aend-astart == 0) return;
-
-  const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    if (INTEL_MIC_NBOR_PAD > 1)
-      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  } else
-  #endif
-    if (INTEL_NBOR_PAD > 1)
-      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  const int pad_width = pad;
-
-  const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
-
-  const int molecular = atom->molecular;
-  int *ns = NULL;
-  tagint *s = NULL;
-  int tag_size = 0, special_size;
-  if (buffers->need_tag()) tag_size = nall;
-  if (molecular) {
-    s = atom->special[0];
-    ns = atom->nspecial[0];
-    special_size = aend;
-  } else {
-    s = &buffers->_special_holder;
-    ns = &buffers->_nspecial_holder;
-    special_size = 0;
-  }
-  const tagint * _noalias const special = s;
-  const int * _noalias const nspecial = ns;
-  const int maxspecial = atom->maxspecial;
-  const tagint * _noalias const tag = atom->tag;
-
-  int * _noalias const ilist = list->ilist;
-  int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int nstencil = this->nstencil;
-  const int * _noalias const stencil = this->stencil;
-  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
-  const int ntypes = atom->ntypes + 1;
-  const int nlocal = atom->nlocal;
-
-  #ifndef _LMP_INTEL_OFFLOAD
-  int * const mask = atom->mask;
-  tagint * const molecule = atom->molecule;
-  #endif
-
-  int tnum;
-  int *overflow;
-  double *timer_compute;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    timer_compute = _fix->off_watch_neighbor();
-    tnum = buffers->get_off_threads();
-    overflow = _fix->get_off_overflow_flag();
-    _fix->stop_watch(TIME_HOST_NEIGHBOR);
-    _fix->start_watch(TIME_OFFLOAD_LATENCY);
-  } else
-  #endif
-  {
-    tnum = comm->nthreads;
-    overflow = _fix->get_overflow_flag();
-  }
-  const int nthreads = tnum;
-  const int maxnbors = buffers->get_max_nbors();
-  int * _noalias const atombin = buffers->get_atombin();
-  const int * _noalias const binpacked = buffers->get_binpacked();
-
-  const int xperiodic = domain->xperiodic;
-  const int yperiodic = domain->yperiodic;
-  const int zperiodic = domain->zperiodic;
-  const flt_t xprd_half = domain->xprd_half;
-  const flt_t yprd_half = domain->yprd_half;
-  const flt_t zprd_half = domain->zprd_half;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  const int * _noalias const binhead = this->binhead;
-  const int * _noalias const bins = this->bins;
-  const int cop = _fix->coprocessor_number();
-  const int separate_buffers = _fix->separate_buffers();
-  #pragma offload target(mic:cop) if(offload) \
-    in(x:length(nall+1) alloc_if(0) free_if(0)) \
-    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
-    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
-    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
-    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
-    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
-    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
-    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
-    out(numneigh:length(0) alloc_if(0) free_if(0)) \
-    in(ilist:length(0) alloc_if(0) free_if(0)) \
-    in(atombin:length(aend) alloc_if(0) free_if(0)) \
-    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall)  \
-    in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \
-    in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
-    out(overflow:length(5) alloc_if(0) free_if(0)) \
-    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(tag)
-  #endif
-  {
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime();
-    #endif
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    overflow[LMP_LOCAL_MIN] = astart;
-    overflow[LMP_LOCAL_MAX] = aend - 1;
-    overflow[LMP_GHOST_MIN] = nall;
-    overflow[LMP_GHOST_MAX] = -1;
-    #endif
-
-    int nstencilp = 0;
-    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
-    for (int k = 0; k < nstencil; k++) {
-      binstart[nstencilp] = stencil[k];
-      int end = stencil[k] + 1;
-      for (int kk = k + 1; kk < nstencil; kk++) {
-        if (stencil[kk-1]+1 == stencil[kk]) {
-          end++;
-          k++;
-        } else break;
-      }
-      binend[nstencilp] = end;
-      nstencilp++;
-    }
-
-    #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(numneigh, overflow, nstencilp, binstart, binend)
-    #endif
-    {
-      #ifdef _LMP_INTEL_OFFLOAD
-      int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
-      #endif
-
-      const int num = aend - astart;
-      int tid, ifrom, ito;
-      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
-      ifrom += astart;
-      ito += astart;
-
-      int which;
-
-      const int list_size = (ito + tid + 1) * maxnbors;
-      int ct = (ifrom + tid) * maxnbors;
-      int *neighptr = firstneigh + ct;
-
-      for (int i = ifrom; i < ito; i++) {
-        int j, k, n, n2, itype, jtype, ibin;
-        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
-
-        n = 0;
-        n2 = maxnbors;
-
-        xtmp = x[i].x;
-        ytmp = x[i].y;
-        ztmp = x[i].z;
-        itype = x[i].w;
-        const int ioffset = ntypes*itype;
-
-        // loop over all atoms in other bins in stencil including self
-        // only store pair if i < j
-        // stores own/own pairs only once
-        // stores own/ghost pairs on both procs
-
-        ibin = atombin[i];
-
-        for (k = 0; k < nstencilp; k++) {
-          const int bstart = binhead[ibin + binstart[k]];
-          const int bend = binhead[ibin + binend[k]];
-          for (int jj = bstart; jj < bend; jj++) {
-            const int j = binpacked[jj];
-             if (j <= i) continue;
-
-            jtype = x[j].w;
-            #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-            #endif
-
-            delx = xtmp - x[j].x;
-            dely = ytmp - x[j].y;
-            delz = ztmp - x[j].z;
-            rsq = delx * delx + dely * dely + delz * delz;
-            if (rsq <= cutneighsq[ioffset + jtype]) {
-              if (j < nlocal) {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n++] = -j - 1;
-                  else
-                    neighptr[n++] = j;
-                } else
-                  neighptr[n++] = j;
-                #ifdef _LMP_INTEL_OFFLOAD
-                if (j < lmin) lmin = j;
-                if (j > lmax) lmax = j;
-                #endif
-              } else {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n2++] = -j - 1;
-                  else
-                    neighptr[n2++] = j;
-                } else
-                  neighptr[n2++] = j;
-                #ifdef _LMP_INTEL_OFFLOAD
-                if (j < gmin) gmin = j;
-                if (j > gmax) gmax = j;
-                #endif
-              }
-            }
-          }
-        }
-        ilist[i] = i;
-
-        cnumneigh[i] = ct;
-        if (n > maxnbors) *overflow = 1;
-        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
-
-        const int edge = (n % pad_width);
-        if (edge) {
-          const int pad_end = n + (pad_width - edge);
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count min=1, max=15, avg=8
-          #endif
-          for ( ; n < pad_end; n++)
-            neighptr[n] = nall;
-        }
-        numneigh[i] = n;
-        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
-        ct += n;
-        neighptr += n;
-        if (ct + n + maxnbors > list_size) {
-          *overflow = 1;
-          ct = (ifrom + tid) * maxnbors;
-        }
-      }
-
-      if (*overflow == 1)
-        for (int i = ifrom; i < ito; i++)
-          numneigh[i] = 0;
-
-      #ifdef _LMP_INTEL_OFFLOAD
-      if (separate_buffers) {
-        #if defined(_OPENMP)
-        #pragma omp critical
-        #endif
-        {
-          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
-          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
-          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
-          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
-        }
-        #pragma omp barrier
-      }
-
-      int ghost_offset = 0, nall_offset = nall;
-      if (separate_buffers) {
-        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
-        if (nghost < 0) nghost = 0;
-        if (offload) {
-          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-        } else {
-          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-          nall_offset = nlocal + nghost;
-        }
-      }
-      #endif
-
-      if (molecular) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          for (int jj = 0; jj < jnum; jj++) {
-            const int j = jlist[jj];
-            if (need_ic && j < 0) {
-              which = 0;
-              jlist[jj] = -j - 1;
-            } else
-              ofind_special(which, special, nspecial, i, tag[j]);
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j >= nlocal) {
-              if (j == nall)
-                jlist[jj] = nall_offset;
-              else if (which)
-                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-              else jlist[jj]-=ghost_offset;
-            } else
-            #endif
-              if (which) jlist[jj] = j ^ (which << SBBITS);
-          }
-        }
-      }
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          int jj = 0;
-          for (jj = 0; jj < jnum; jj++)
-            if (jlist[jj] >= nlocal) break;
-          while (jj < jnum) {
-            if (jlist[jj] == nall) jlist[jj] = nall_offset;
-            else jlist[jj] -= ghost_offset;
-            jj++;
-          }
-        }
-      }
-      #endif
-    } // end omp
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime() - *timer_compute;
-    #endif
-  } // end offload
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
-    _fix->start_watch(TIME_HOST_NEIGHBOR);
-    for (int n = 0; n < aend; n++) {
-      ilist[n] = n;
-      numneigh[n] = 0;
-    }
-  } else {
-    for (int i = astart; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
-    if (separate_buffers) {
-      _fix->start_watch(TIME_PACK);
-      _fix->set_neighbor_host_sizes();
-      buffers->pack_sep_from_single(_fix->host_min_local(),
-                        	    _fix->host_used_local(),
-                        	    _fix->host_min_ghost(),
-                        	    _fix->host_used_ghost());
-      _fix->stop_watch(TIME_PACK);
-    }
-  }
-  #else
-  for (int i = astart; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
-  #endif
-}
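
With this file removed, the newtoff path is served by the generic builder, which keeps the stencil preprocessing seen above: consecutive bin offsets are coalesced into [binstart, binend) runs so the inner j-loop scans contiguous bins. A standalone sketch of that coalescing step, using a made-up stencil:

// Consecutive offsets (k, k+1, k+2, ...) merge into one [binstart, binend)
// run so the neighbor loop touches contiguous memory per run.
#include <cstdio>

int main() {
  const int stencil[] = {-4, -3, -2, 1, 2, 7};     // example offsets
  const int nstencil = sizeof(stencil) / sizeof(int);
  int binstart[16], binend[16], nstencilp = 0;

  for (int k = 0; k < nstencil; k++) {
    binstart[nstencilp] = stencil[k];
    int end = stencil[k] + 1;
    for (int kk = k + 1; kk < nstencil; kk++) {
      if (stencil[kk-1] + 1 == stencil[kk]) { end++; k++; }
      else break;
    }
    binend[nstencilp++] = end;
  }
  for (int r = 0; r < nstencilp; r++)              // prints 3 runs
    std::printf("run %d: [%d,%d)\n", r, binstart[r], binend[r]);
}
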
diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.h b/src/USER-INTEL/npair_half_bin_newtoff_intel.h
deleted file mode 100644
index 49482f8b3e..0000000000
--- a/src/USER-INTEL/npair_half_bin_newtoff_intel.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef NPAIR_CLASS
-
-NPairStyle(half/bin/newtoff/intel,
-           NPairHalfBinNewtoffIntel,
-           NP_HALF | NP_BIN | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)
-
-#else
-
-#ifndef LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H
-#define LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H
-
-#include "npair_intel.h"
-#include "fix_intel.h"
-
-namespace LAMMPS_NS {
-
-class NPairHalfBinNewtoffIntel : public NPairIntel {
- public:
-  NPairHalfBinNewtoffIntel(class LAMMPS *);
-  ~NPairHalfBinNewtoffIntel() {}
-  void build(class NeighList *);
-
- private:
-  template <class flt_t, class acc_t>
-  void hbnni(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int>
-  void hbnni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-             const int);
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-
-*/
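
The molecular post-pass in these builders encodes the special-bond status returned by ofind_special directly into the stored index via j ^ (which << SBBITS). A self-contained sketch of the encode/decode round trip; SBBITS = 30 and NEIGHMASK follow the stock LAMMPS definitions:

#include <cstdio>

enum { SBBITS = 30 };
constexpr int NEIGHMASK = (1 << SBBITS) - 1;

// fold a 2-bit code 'which' (1 = 1-2, 2 = 1-3, 3 = 1-4 neighbor)
// into the top bits of index j; pair styles strip it with the mask
int encode(int j, int which) { return j ^ (which << SBBITS); }

int main() {
  const int packed = encode(123456, 1);                 // mark a 1-2 neighbor
  const int j      = packed & NEIGHMASK;                // bare index back
  const int which  = ((unsigned)packed >> SBBITS) & 3;  // code back
  std::printf("j = %d, which = %d\n", j, which);        // 123456, 1
}
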
diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
index 6313ab944f..e7d5995cc5 100644
--- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp
@@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) :
+NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : 
   NPairIntel(lmp) {}
 
 /* ----------------------------------------------------------------------
@@ -75,536 +75,32 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
+			 neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
-      hbni<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbni<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
     }
   } else {
     if (offload_noghost) {
-      hbni<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, 
+                                        off_end);
     } else {
-      hbni<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
-      hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
     }
   }
   #else
-  if (need_ic)
-    hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+  if (need_ic) 
+    bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal);
   else
-    hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
-
-template <class flt_t, class acc_t, int offload_noghost, int need_ic>
-void NPairHalfBinNewtonIntel::
-hbni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
-     const int astart, const int aend, const int offload_end) {
-
-  if (aend-astart == 0) return;
-
-  const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
-  int nall_t = nall;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload_noghost && offload) nall_t = atom->nlocal;
-  if (offload) {
-    if (INTEL_MIC_NBOR_PAD > 1)
-      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  } else
-  #endif
-    if (INTEL_NBOR_PAD > 1)
-      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  const int pad_width = pad;
-
-  const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
-  const int e_nall = nall_t;
-
-  const int molecular = atom->molecular;
-  int *ns = NULL;
-  tagint *s = NULL;
-  int tag_size = 0, special_size;
-  if (buffers->need_tag()) tag_size = e_nall;
-  if (molecular) {
-    s = atom->special[0];
-    ns = atom->nspecial[0];
-    special_size = aend;
-  } else {
-    s = &buffers->_special_holder;
-    ns = &buffers->_nspecial_holder;
-    special_size = 0;
-  }
-  const tagint * _noalias const special = s;
-  const int * _noalias const nspecial = ns;
-  const int maxspecial = atom->maxspecial;
-  const tagint * _noalias const tag = atom->tag;
-
-  int * _noalias const ilist = list->ilist;
-  int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int nstencil = this->nstencil;
-  const int * _noalias const stencil = this->stencil;
-  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
-  const int ntypes = atom->ntypes + 1;
-  const int nlocal = atom->nlocal;
-
-  #ifndef _LMP_INTEL_OFFLOAD
-  int * const mask = atom->mask;
-  tagint * const molecule = atom->molecule;
-  #endif
-
-  int tnum;
-  int *overflow;
-  double *timer_compute;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    timer_compute = _fix->off_watch_neighbor();
-    tnum = buffers->get_off_threads();
-    overflow = _fix->get_off_overflow_flag();
-    _fix->stop_watch(TIME_HOST_NEIGHBOR);
-    _fix->start_watch(TIME_OFFLOAD_LATENCY);
-  } else
-  #endif
-  {
-    tnum = comm->nthreads;
-    overflow = _fix->get_overflow_flag();
-  }
-  const int nthreads = tnum;
-  const int maxnbors = buffers->get_max_nbors();
-  int * _noalias const atombin = buffers->get_atombin();
-  const int * _noalias const binpacked = buffers->get_binpacked();
-
-  const int xperiodic = domain->xperiodic;
-  const int yperiodic = domain->yperiodic;
-  const int zperiodic = domain->zperiodic;
-  const flt_t xprd_half = domain->xprd_half;
-  const flt_t yprd_half = domain->yprd_half;
-  const flt_t zprd_half = domain->zprd_half;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  const int * _noalias const binhead = this->binhead;
-  const int * _noalias const bins = this->bins;
-  const int cop = _fix->coprocessor_number();
-  const int separate_buffers = _fix->separate_buffers();
-  #pragma offload target(mic:cop) if(offload) \
-    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
-    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
-    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
-    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
-    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
-    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
-    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
-    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
-    out(numneigh:length(0) alloc_if(0) free_if(0)) \
-    in(ilist:length(0) alloc_if(0) free_if(0)) \
-    in(atombin:length(aend) alloc_if(0) free_if(0)) \
-    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \
-    in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
-    in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
-    out(overflow:length(5) alloc_if(0) free_if(0)) \
-    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(tag)
-  #endif
-  {
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime();
-    #endif
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    overflow[LMP_LOCAL_MIN] = astart;
-    overflow[LMP_LOCAL_MAX] = aend - 1;
-    overflow[LMP_GHOST_MIN] = e_nall;
-    overflow[LMP_GHOST_MAX] = -1;
-    #endif
-
-    int nstencilp = 0;
-    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
-    for (int k = 0; k < nstencil; k++) {
-      binstart[nstencilp] = stencil[k];
-      int end = stencil[k] + 1;
-      for (int kk = k + 1; kk < nstencil; kk++) {
-        if (stencil[kk-1]+1 == stencil[kk]) {
-          end++;
-          k++;
-        } else break;
-      }
-      binend[nstencilp] = end;
-      nstencilp++;
-    }
-
-    #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(numneigh, overflow, nstencilp, binstart, binend)
-    #endif
-    {
-      #ifdef _LMP_INTEL_OFFLOAD
-      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
-      #endif
-
-      const int num = aend - astart;
-      int tid, ifrom, ito;
-
-      #ifdef OUTER_CHUNK
-      const int swidth = ip_simd::SIMD_type<flt_t>::width();
-      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
-      ifrom += astart;
-      ito += astart;
-      int e_ito = ito;
-      if (ito == num) {
-        int imod = ito % swidth;
-        if (imod) e_ito += swidth - imod;
-      }
-      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
-      #else
-      const int swidth = 1;
-      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
-      ifrom += astart;
-      ito += astart;
-      const int list_size = (ito + tid * 2 + 2) * maxnbors;
-      #endif
-
-      int which;
-
-      int pack_offset = maxnbors * swidth;
-      int ct = (ifrom + tid * 2) * maxnbors;
-      int *neighptr = firstneigh + ct;
-      const int obound = pack_offset + maxnbors * 2;
-
-      int max_chunk = 0;
-      int lane = 0;
-      for (int i = ifrom; i < ito; i++) {
-        const flt_t xtmp = x[i].x;
-        const flt_t ytmp = x[i].y;
-        const flt_t ztmp = x[i].z;
-        const int itype = x[i].w;
-        const int ioffset = ntypes * itype;
-
-        // loop over rest of atoms in i's bin, ghosts are at end of linked list
-        // if j is owned atom, store it, since j is beyond i in linked list
-        // if j is ghost, only store if j coords are "above/to the right" of i
-
-        int raw_count = pack_offset;
-        for (int j = bins[i]; j >= 0; j = bins[j]) {
-          if (j >= nlocal) {
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (offload_noghost && offload) continue;
-            #endif
-            if (x[j].z < ztmp) continue;
-            if (x[j].z == ztmp) {
-              if (x[j].y < ytmp) continue;
-              if (x[j].y == ytmp && x[j].x < xtmp) continue;
-            }
-          }
-          #ifdef _LMP_INTEL_OFFLOAD
-          else if (offload_noghost && i < offload_end) continue;
-          #endif
-
-          #ifndef _LMP_INTEL_OFFLOAD
-          if (exclude) {
-            const int jtype = x[j].w;
-            if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
-          }
-          #endif
-
-          neighptr[raw_count++] = j;
-        }
-
-        // loop over all atoms in other bins in stencil, store every pair
-
-        const int ibin = atombin[i];
-        if (exclude) {
-          for (int k = 0; k < nstencilp; k++) {
-            const int bstart = binhead[ibin + binstart[k]];
-            const int bend = binhead[ibin + binend[k]];
-            #ifndef _LMP_INTEL_OFFLOAD
-            #ifdef INTEL_VMASK
-            #pragma simd
-            #endif
-            #endif
-            for (int jj = bstart; jj < bend; jj++) {
-              const int j = binpacked[jj];
-
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (offload_noghost) {
-                if (j < nlocal) {
-                  if (i < offload_end) continue;
-                } else if (offload) continue;
-              }
-              #endif
-
-              #ifndef _LMP_INTEL_OFFLOAD
-              const int jtype = x[j].w;
-              if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
-              #endif
-
-              neighptr[raw_count++] = j;
-            }
-          }
-        } else {
-          for (int k = 0; k < nstencilp; k++) {
-            const int bstart = binhead[ibin + binstart[k]];
-            const int bend = binhead[ibin + binend[k]];
-            #ifndef _LMP_INTEL_OFFLOAD
-            #ifdef INTEL_VMASK
-            #pragma simd
-            #endif
-            #endif
-            for (int jj = bstart; jj < bend; jj++) {
-              const int j = binpacked[jj];
-
-              #ifdef _LMP_INTEL_OFFLOAD
-              if (offload_noghost) {
-                if (j < nlocal) {
-                  if (i < offload_end) continue;
-                } else if (offload) continue;
-              }
-              #endif
-
-              neighptr[raw_count++] = j;
-            }
-          }
-        }
-
-        if (raw_count > obound) *overflow = 1;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #ifdef _LMP_INTEL_OFFLOAD
-        int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
-        #if __INTEL_COMPILER+0 > 1499
-        #pragma vector aligned
-        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
-        #endif
-        #else
-        #pragma vector aligned
-        #pragma simd
-        #endif
-        #endif
-        for (int u = pack_offset; u < raw_count; u++) {
-          int j = neighptr[u];
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutneighsq[ioffset + jtype])
-            neighptr[u] = e_nall;
-          else {
-            if (need_ic) {
-              int no_special;
-              ominimum_image_check(no_special, delx, dely, delz);
-              if (no_special)
-                neighptr[u] = -j - 1;
-            }
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j < nlocal) {
-              if (j < vlmin) vlmin = j;
-              if (j > vlmax) vlmax = j;
-            } else {
-              if (j < vgmin) vgmin = j;
-              if (j > vgmax) vgmax = j;
-            }
-            #endif
-          }
-        }
-        #ifdef _LMP_INTEL_OFFLOAD
-        lmin = MIN(lmin,vlmin);
-        gmin = MIN(gmin,vgmin);
-        lmax = MAX(lmax,vlmax);
-        gmax = MAX(gmax,vgmax);
-        #endif
-
-        int n = lane, n2 = pack_offset;
-        for (int u = pack_offset; u < raw_count; u++) {
-          const int j = neighptr[u];
-          int pj = j;
-          if (pj < e_nall) {
-            if (need_ic)
-              if (pj < 0) pj = -pj - 1;
-
-            if (pj < nlocal) {
-              neighptr[n] = j;
-              n += swidth;
-            } else
-              neighptr[n2++] = j;
-          }
-        }
-        int ns = (n - lane) / swidth;
-        for (int u = pack_offset; u < n2; u++) {
-          neighptr[n] = neighptr[u];
-          n += swidth;
-        }
-
-        ilist[i] = i;
-        cnumneigh[i] = ct + lane;
-        ns += n2 - pack_offset;
-        #ifndef OUTER_CHUNK
-        int edge = (ns % pad_width);
-        if (edge) {
-          const int pad_end = ns + (pad_width - edge);
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count min=1, max=15, avg=8
-          #endif
-          for ( ; ns < pad_end; ns++)
-            neighptr[ns] = e_nall;
-        }
-        #endif
-        numneigh[i] = ns;
-
-        #ifdef OUTER_CHUNK
-        if (ns > max_chunk) max_chunk = ns;
-        lane++;
-        if (lane == swidth) {
-          ct += max_chunk * swidth;
-          const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-          int edge = (ct % alignb);
-          if (edge) ct += alignb - edge;
-          neighptr = firstneigh + ct;
-          max_chunk = 0;
-          pack_offset = maxnbors * swidth;
-          lane = 0;
-          if (ct + obound > list_size) {
-            if (i < ito - 1) {
-              *overflow = 1;
-              ct = (ifrom + tid * 2) * maxnbors;
-            }
-          }
-        }
-        #else
-        ct += ns;
-        const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-        edge = (ct % alignb);
-        if (edge) ct += alignb - edge;
-        neighptr = firstneigh + ct;
-        if (ct + obound > list_size) {
-          if (i < ito - 1) {
-            *overflow = 1;
-            ct = (ifrom + tid * 2) * maxnbors;
-          }
-        }
-        #endif
-      }
-
-      if (*overflow == 1)
-        for (int i = ifrom; i < ito; i++)
-          numneigh[i] = 0;
-
-      #ifdef _LMP_INTEL_OFFLOAD
-      if (separate_buffers) {
-        #if defined(_OPENMP)
-        #pragma omp critical
-        #endif
-        {
-            if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
-          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
-          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
-          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
-        }
-        #pragma omp barrier
-      }
-
-      int ghost_offset = 0, nall_offset = e_nall;
-      if (separate_buffers) {
-        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
-         if (nghost < 0) nghost = 0;
-        if (offload) {
-          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-        } else {
-          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-          nall_offset = nlocal + nghost;
-        }
-      }
-      #endif
-
-      if (molecular) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          #ifndef OUTER_CHUNK
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma vector aligned
-          #pragma simd
-          #endif
-          for (int jj = 0; jj < jnum; jj++) {
-          #else
-          const int trip = jnum * swidth;
-          for (int jj = 0; jj < trip; jj+= swidth) {
-          #endif
-            const int j = jlist[jj];
-            if (need_ic && j < 0) {
-              which = 0;
-              jlist[jj] = -j - 1;
-            } else
-              ofind_special(which, special, nspecial, i, tag[j]);
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j >= nlocal) {
-              if (j == e_nall)
-                jlist[jj] = nall_offset;
-              else if (which)
-                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-              else jlist[jj]-=ghost_offset;
-            } else
-            #endif
-            if (which) jlist[jj] = j ^ (which << SBBITS);
-          }
-        }
-      }
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          int jj = 0;
-          for (jj = 0; jj < jnum; jj++)
-            if (jlist[jj] >= nlocal) break;
-          while (jj < jnum) {
-            if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-            else jlist[jj] -= ghost_offset;
-            jj++;
-          }
-        }
-      }
-      #endif
-    } // end omp
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime() - *timer_compute;
-    #endif
-  } // end offload
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
-    _fix->start_watch(TIME_HOST_NEIGHBOR);
-    for (int n = 0; n < aend; n++) {
-      ilist[n] = n;
-      numneigh[n] = 0;
-    }
-  } else {
-    for (int i = astart; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
-    if (separate_buffers) {
-      _fix->start_watch(TIME_PACK);
-      _fix->set_neighbor_host_sizes();
-      buffers->pack_sep_from_single(_fix->host_min_local(),
-                        	    _fix->host_used_local(),
-                        	    _fix->host_min_ghost(),
-                        	    _fix->host_used_ghost());
-      _fix->stop_watch(TIME_PACK);
-    }
-  }
-  #else
-  for (int i = astart; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
+    bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal);
   #endif
 }
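
The replacement builder keeps the OUTER_CHUNK idea deleted above: neighbors of swidth consecutive atoms share one chunk, interleaved with stride swidth, so each SIMD lane walks its own list with unit-stride loads. A small standalone sketch of that packing (sizes and neighbor ids are made up):

#include <cstdio>
#include <vector>

int main() {
  const int swidth = 4;                       // SIMD width (lanes)
  const int nnei[swidth] = {3, 1, 2, 3};      // neighbors per atom in chunk
  std::vector<int> chunk(swidth * 3, -1);     // -1 marks padding

  for (int lane = 0; lane < swidth; lane++)   // pack: stride = swidth
    for (int k = 0; k < nnei[lane]; k++)
      chunk[lane + k * swidth] = 100 * lane + k;   // fake neighbor id

  for (int k = 0; k < 3; k++) {               // each row is one SIMD load
    for (int lane = 0; lane < swidth; lane++)
      std::printf("%5d", chunk[lane + k * swidth]);
    std::printf("\n");
  }
}
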
diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.h b/src/USER-INTEL/npair_half_bin_newton_intel.h
index 9b5d0780a1..54a8e24135 100644
--- a/src/USER-INTEL/npair_half_bin_newton_intel.h
+++ b/src/USER-INTEL/npair_half_bin_newton_intel.h
@@ -36,9 +36,6 @@ class NPairHalfBinNewtonIntel : public NPairIntel {
  private:
   template <class flt_t, class acc_t>
   void hbni(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int, int>
-  void hbni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-            const int, const int offload_end = 0);
 };
 
 }
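
As before, each call site splits the owned-atom range between coprocessor and host: [0, off_end) is built with offload = 1 and [host_start, nlocal) on the host, where host_start equals off_end unless the host needs a full copy of the list. A sketch of that split, with plain functions standing in for the template instantiations:

#include <cstdio>

void build_range(const char *where, int astart, int aend) {
  if (aend - astart == 0) return;   // same early-out as bin_newton
  std::printf("%s builds atoms [%d, %d)\n", where, astart, aend);
}

int main() {
  const int nlocal = 1000;
  const int off_end = 600;       // stand-in for _fix->offload_end_neighbor()
  const bool full_host_list = false;
  const int host_start = full_host_list ? 0 : off_end;

  build_range("device", 0, off_end);
  build_range("host", host_start, nlocal);
}
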
diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
index 5f191e0797..3c36458f06 100644
--- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
+++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp
@@ -26,7 +26,7 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) :
+NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : 
   NPairIntel(lmp) {}
 
 /* ----------------------------------------------------------------------
@@ -75,439 +75,32 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
   int need_ic = 0;
   if (atom->molecular)
     dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax,
-                         neighbor->cutneighmax);
+			 neighbor->cutneighmax);
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (need_ic) {
     if (offload_noghost) {
-      hbnti<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbnti<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
     }
   } else {
     if (offload_noghost) {
-      hbnti<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal,
+                                        off_end);
     } else {
-      hbnti<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end);
-      hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end);
+      bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
     }
   }
   #else
   if (need_ic)
-    hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal);
+    bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);
   else
-    hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal);
-  #endif
-}
-
-template <class flt_t, class acc_t, int offload_noghost, int need_ic>
-void NPairHalfBinNewtonTriIntel::
-hbnti(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers,
-      const int astart, const int aend, const int offload_end) {
-  if (aend-astart == 0) return;
-
-  const int nall = atom->nlocal + atom->nghost;
-  int pad = 1;
-  int nall_t = nall;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload_noghost && offload) nall_t = atom->nlocal;
-  if (offload) {
-    if (INTEL_MIC_NBOR_PAD > 1)
-      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  } else
-  #endif
-    if (INTEL_NBOR_PAD > 1)
-      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
-  const int pad_width = pad;
-
-  const ATOM_T * _noalias const x = buffers->get_x();
-  int * _noalias const firstneigh = buffers->firstneigh(list);
-  const int e_nall = nall_t;
-
-  const int molecular = atom->molecular;
-  int *ns = NULL;
-  tagint *s = NULL;
-  int tag_size = 0, special_size;
-  if (buffers->need_tag()) tag_size = e_nall;
-  if (molecular) {
-    s = atom->special[0];
-    ns = atom->nspecial[0];
-    special_size = aend;
-  } else {
-    s = &buffers->_special_holder;
-    ns = &buffers->_nspecial_holder;
-    special_size = 0;
-  }
-  const tagint * _noalias const special = s;
-  const int * _noalias const nspecial = ns;
-  const int maxspecial = atom->maxspecial;
-  const tagint * _noalias const tag = atom->tag;
-
-  int * _noalias const ilist = list->ilist;
-  int * _noalias numneigh = list->numneigh;
-  int * _noalias const cnumneigh = buffers->cnumneigh(list);
-  const int nstencil = this->nstencil;
-  const int * _noalias const stencil = this->stencil;
-  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
-  const int ntypes = atom->ntypes + 1;
-  const int nlocal = atom->nlocal;
-
-  #ifndef _LMP_INTEL_OFFLOAD
-  int * const mask = atom->mask;
-  tagint * const molecule = atom->molecule;
-  #endif
-
-  int tnum;
-  int *overflow;
-  double *timer_compute;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    timer_compute = _fix->off_watch_neighbor();
-    tnum = buffers->get_off_threads();
-    overflow = _fix->get_off_overflow_flag();
-    _fix->stop_watch(TIME_HOST_NEIGHBOR);
-    _fix->start_watch(TIME_OFFLOAD_LATENCY);
-  } else
-  #endif
-  {
-    tnum = comm->nthreads;
-    overflow = _fix->get_overflow_flag();
-  }
-  const int nthreads = tnum;
-  const int maxnbors = buffers->get_max_nbors();
-  int * _noalias const atombin = buffers->get_atombin();
-  const int * _noalias const binpacked = buffers->get_binpacked();
-
-  const int xperiodic = domain->xperiodic;
-  const int yperiodic = domain->yperiodic;
-  const int zperiodic = domain->zperiodic;
-  const flt_t xprd_half = domain->xprd_half;
-  const flt_t yprd_half = domain->yprd_half;
-  const flt_t zprd_half = domain->zprd_half;
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  const int * _noalias const binhead = this->binhead;
-  const int * _noalias const bins = this->bins;
-  const int cop = _fix->coprocessor_number();
-  const int separate_buffers = _fix->separate_buffers();
-  #pragma offload target(mic:cop) if(offload) \
-    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
-    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
-    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
-    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
-    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
-    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
-    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
-    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
-    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
-    out(numneigh:length(0) alloc_if(0) free_if(0)) \
-    in(ilist:length(0) alloc_if(0) free_if(0)) \
-    in(atombin:length(aend) alloc_if(0) free_if(0)) \
-    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \
-    in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \
-    in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
-    out(overflow:length(5) alloc_if(0) free_if(0)) \
-    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
-    signal(tag)
-  #endif
-  {
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime();
-    #endif
-
-    #ifdef _LMP_INTEL_OFFLOAD
-    overflow[LMP_LOCAL_MIN] = astart;
-    overflow[LMP_LOCAL_MAX] = aend - 1;
-    overflow[LMP_GHOST_MIN] = e_nall;
-    overflow[LMP_GHOST_MAX] = -1;
-    #endif
-
-    int nstencilp = 0;
-    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
-    for (int k = 0; k < nstencil; k++) {
-      binstart[nstencilp] = stencil[k];
-      int end = stencil[k] + 1;
-      for (int kk = k + 1; kk < nstencil; kk++) {
-        if (stencil[kk-1]+1 == stencil[kk]) {
-          end++;
-          k++;
-        } else break;
-      }
-      binend[nstencilp] = end;
-      nstencilp++;
-    }
-
-    #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(numneigh, overflow, nstencilp, binstart, binend)
-    #endif
-    {
-      #ifdef _LMP_INTEL_OFFLOAD
-      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
-      #endif
-
-      const int num = aend - astart;
-      int tid, ifrom, ito;
-      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
-      ifrom += astart;
-      ito += astart;
-
-      int which;
-
-      const int list_size = (ito + tid * 2 + 2) * maxnbors;
-      int ct = (ifrom + tid * 2) * maxnbors;
-      int *neighptr = firstneigh + ct;
-      const int obound = maxnbors * 3;
-
-      for (int i = ifrom; i < ito; i++) {
-        const flt_t xtmp = x[i].x;
-        const flt_t ytmp = x[i].y;
-        const flt_t ztmp = x[i].z;
-        const int itype = x[i].w;
-        const int ioffset = ntypes * itype;
-
-        // loop over all atoms in bins in stencil
-        // pairs for atoms j "below" i are excluded
-        // below = lower z or (equal z and lower y) or (equal zy and lower x)
-        //         (equal zyx and j <= i)
-        // latter excludes self-self interaction but allows superposed atoms
-
-        const int ibin = atombin[i];
-
-        int raw_count = maxnbors;
-        for (int k = 0; k < nstencilp; k++) {
-          const int bstart = binhead[ibin + binstart[k]];
-          const int bend = binhead[ibin + binend[k]];
-          for (int jj = bstart; jj < bend; jj++) {
-            const int j = binpacked[jj];
-
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (offload_noghost) {
-              if (j < nlocal) {
-                if (i < offload_end) continue;
-              } else if (offload) continue;
-            }
-            #endif
-
-            if (x[j].z < ztmp) continue;
-            if (x[j].z == ztmp) {
-              if (x[j].y < ytmp) continue;
-              if (x[j].y == ytmp) {
-                if (x[j].x < xtmp) continue;
-                if (x[j].x == xtmp && j <= i) continue;
-              }
-            }
-
-            #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude) {
-              const int jtype = x[j].w;
-              if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
-            }
-            #endif
-
-            neighptr[raw_count++] = j;
-          }
-        }
-        if (raw_count > obound)
-          *overflow = 1;
-
-        #if defined(LMP_SIMD_COMPILER)
-        #ifdef _LMP_INTEL_OFFLOAD
-        int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
-        #if __INTEL_COMPILER+0 > 1499
-        #pragma vector aligned
-        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
-        #endif
-        #else
-        #pragma vector aligned
-        #pragma simd
-        #endif
-        #endif
-        for (int u = maxnbors; u < raw_count; u++) {
-          int j = neighptr[u];
-          const flt_t delx = xtmp - x[j].x;
-          const flt_t dely = ytmp - x[j].y;
-          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
-          const flt_t rsq = delx * delx + dely * dely + delz * delz;
-          if (rsq > cutneighsq[ioffset + jtype])
-            neighptr[u] = e_nall;
-          else {
-            if (need_ic) {
-              int no_special;
-              ominimum_image_check(no_special, delx, dely, delz);
-              if (no_special)
-                neighptr[u] = -j - 1;
-            }
-
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j < nlocal) {
-              if (j < vlmin) vlmin = j;
-              if (j > vlmax) vlmax = j;
-            } else {
-              if (j < vgmin) vgmin = j;
-              if (j > vgmax) vgmax = j;
-            }
-            #endif
-          }
-        }
-
-        int n = 0, n2 = maxnbors;
-        for (int u = maxnbors; u < raw_count; u++) {
-          const int j = neighptr[u];
-          int pj = j;
-          if (pj < e_nall) {
-            if (need_ic)
-              if (pj < 0) pj = -pj - 1;
-
-            if (pj < nlocal)
-              neighptr[n++] = j;
-            else
-              neighptr[n2++] = j;
-          }
-        }
-        int ns = n;
-        for (int u = maxnbors; u < n2; u++)
-          neighptr[n++] = neighptr[u];
-
-        ilist[i] = i;
-        cnumneigh[i] = ct;
-        ns += n2 - maxnbors;
-
-        int edge = (ns % pad_width);
-        if (edge) {
-          const int pad_end = ns + (pad_width - edge);
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma loop_count min=1, max=15, avg=8
-          #endif
-          for ( ; ns < pad_end; ns++)
-            neighptr[ns] = e_nall;
-        }
-        numneigh[i] = ns;
-
-        ct += ns;
-        const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
-        edge = (ct % alignb);
-        if (edge) ct += alignb - edge;
-        neighptr = firstneigh + ct;
-        if (ct + obound > list_size) {
-          if (i < ito - 1) {
-            *overflow = 1;
-            ct = (ifrom + tid * 2) * maxnbors;
-          }
-        }
-      }
-
-      if (*overflow == 1)
-        for (int i = ifrom; i < ito; i++)
-          numneigh[i] = 0;
-
-      #ifdef _LMP_INTEL_OFFLOAD
-      if (separate_buffers) {
-        #if defined(_OPENMP)
-        #pragma omp critical
-        #endif
-        {
-          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
-          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
-          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
-          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
-        }
-        #pragma omp barrier
-      }
-
-      int ghost_offset = 0, nall_offset = e_nall;
-      if (separate_buffers) {
-        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
-        if (nghost < 0) nghost = 0;
-        if (offload) {
-          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
-          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
-        } else {
-          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
-          nall_offset = nlocal + nghost;
-        }
-      }
-      #endif
-
-      if (molecular) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          #if defined(LMP_SIMD_COMPILER)
-          #pragma vector aligned
-          #pragma simd
-          #endif
-          for (int jj = 0; jj < jnum; jj++) {
-            const int j = jlist[jj];
-            if (need_ic && j < 0) {
-              which = 0;
-              jlist[jj] = -j - 1;
-            } else
-              ofind_special(which, special, nspecial, i, tag[j]);
-            #ifdef _LMP_INTEL_OFFLOAD
-            if (j >= nlocal) {
-              if (j == e_nall)
-                jlist[jj] = nall_offset;
-              else if (which)
-                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
-              else jlist[jj]-=ghost_offset;
-            } else
-            #endif
-              if (which) jlist[jj] = j ^ (which << SBBITS);
-          }
-        }
-      }
-      #ifdef _LMP_INTEL_OFFLOAD
-      else if (separate_buffers) {
-        for (int i = ifrom; i < ito; ++i) {
-          int * _noalias jlist = firstneigh + cnumneigh[i];
-          const int jnum = numneigh[i];
-          int jj = 0;
-          for (jj = 0; jj < jnum; jj++)
-            if (jlist[jj] >= nlocal) break;
-          while (jj < jnum) {
-            if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
-            else jlist[jj] -= ghost_offset;
-            jj++;
-          }
-        }
-      }
-      #endif
-    } // end omp
-    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
-    *timer_compute = MIC_Wtime() - *timer_compute;
-    #endif
-  } // end offload
-
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (offload) {
-    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
-    _fix->start_watch(TIME_HOST_NEIGHBOR);
-    for (int n = 0; n < aend; n++) {
-      ilist[n] = n;
-      numneigh[n] = 0;
-    }
-  } else {
-    for (int i = astart; i < aend; i++)
-      list->firstneigh[i] = firstneigh + cnumneigh[i];
-    if (separate_buffers) {
-      _fix->start_watch(TIME_PACK);
-      _fix->set_neighbor_host_sizes();
-      buffers->pack_sep_from_single(_fix->host_min_local(),
-                        	    _fix->host_used_local(),
-                        	    _fix->host_min_ghost(),
-                        	    _fix->host_used_ghost());
-      _fix->stop_watch(TIME_PACK);
-    }
-  }
-  #else
-  for (int i = astart; i < aend; i++)
-    list->firstneigh[i] = firstneigh + cnumneigh[i];
+    bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal);
   #endif
 }
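
The TRI = 1 instantiation of bin_newton preserves the ordering test deleted above: pair (i, j) is skipped when j lies "below" i in z, then y, then x, with the index tiebreaker j <= i so exactly one copy of a superposed pair survives. The predicate in isolation:

#include <cstdio>

struct Pos { double x, y, z; };

// true when j is "below" i and the pair should be skipped
bool skip_pair(const Pos &pi, const Pos &pj, int i, int j) {
  if (pj.z < pi.z) return true;
  if (pj.z == pi.z) {
    if (pj.y < pi.y) return true;
    if (pj.y == pi.y) {
      if (pj.x < pi.x) return true;
      if (pj.x == pi.x && j <= i) return true;   // tiebreaker for overlaps
    }
  }
  return false;
}

int main() {
  Pos a{0, 0, 0}, b{0, 0, 0};                    // superposed atoms
  std::printf("store (0,1): %d\n", !skip_pair(a, b, 0, 1));  // 1: kept once
  std::printf("store (1,0): %d\n", !skip_pair(b, a, 1, 0));  // 0: skipped
}
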
diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h
index d144c2fc52..7a7f4c8030 100644
--- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h
+++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h
@@ -36,9 +36,6 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel {
  private:
   template <class flt_t, class acc_t>
   void hbnti(NeighList *, IntelBuffers<flt_t,acc_t> *);
-  template <class flt_t, class acc_t, int, int>
-  void hbnti(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int,
-             const int, const int offload_end = 0);
 };
 
 }
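
The builder that follows pads each atom's neighbor count to a multiple of pad_width (INTEL_NBOR_PAD scaled by the precision in use) with the sentinel index e_nall, whose distance test always fails, so vectorized force loops never need a scalar remainder. A sketch with illustrative values, using sizeof(double) as a stand-in for flt_t in double precision:

#include <cstdio>

int main() {
  const int INTEL_NBOR_PAD = 16;                       // pad count in floats
  const int pad_width = INTEL_NBOR_PAD * sizeof(float) / sizeof(double);
  const int e_nall = 5000;                             // sentinel index

  int neighptr[32];
  int n = 13;                                          // real neighbors
  for (int k = 0; k < n; k++) neighptr[k] = k;

  const int edge = n % pad_width;
  if (edge)                                            // round up and fill
    for (int pad_end = n + (pad_width - edge); n < pad_end; n++)
      neighptr[n] = e_nall;

  std::printf("padded count = %d (pad_width = %d)\n", n, pad_width);  // 16, 8
}
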
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index c92ed88774..0412398796 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -48,6 +48,678 @@ NPairIntel::~NPairIntel() {
 
 /* ---------------------------------------------------------------------- */
 
+template <class flt_t, class acc_t, int offload_noghost, int need_ic,
+          int FULL, int TRI, int THREE>
+void NPairIntel::bin_newton(const int offload, NeighList *list,
+                            IntelBuffers<flt_t,acc_t> *buffers,
+                            const int astart, const int aend,
+                            const int offload_end) {
+
+  if (aend-astart == 0) return;
+
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;
+  int nall_t = nall;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_noghost && offload) nall_t = atom->nlocal;
+  if (THREE == 0 && offload) {
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else 
+  #endif
+    if (THREE == 0 && INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  const int pad_width = pad;
+  const int pack_width = _fix->nbor_pack_width();
+
+  const ATOM_T * _noalias const x = buffers->get_x();
+  int * _noalias const firstneigh = buffers->firstneigh(list);
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL;
+  tagint *s = NULL;
+  int tag_size = 0, special_size;
+  if (buffers->need_tag()) tag_size = e_nall;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    special_size = 0;
+  }
+  const tagint * _noalias const special = s;
+  const int * _noalias const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const tagint * _noalias const tag = atom->tag;
+
+  int * _noalias const ilist = list->ilist;
+  int * _noalias numneigh = list->numneigh;
+  int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = this->nstencil;
+  const int * _noalias const stencil = this->stencil;
+  const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  tagint * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    timer_compute = _fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = _fix->get_off_overflow_flag();
+    _fix->stop_watch(TIME_HOST_NEIGHBOR);
+    _fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else 
+  #endif
+  {
+    tnum = comm->nthreads;
+    overflow = _fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+  int * _noalias const atombin = buffers->get_atombin();
+  const int * _noalias const binpacked = buffers->get_binpacked();
+
+  const int xperiodic = domain->xperiodic;
+  const int yperiodic = domain->yperiodic;
+  const int zperiodic = domain->zperiodic;
+  const flt_t xprd_half = domain->xprd_half;
+  const flt_t yprd_half = domain->yprd_half;
+  const flt_t zprd_half = domain->zprd_half;
+
+  flt_t * _noalias const ncachex = buffers->get_ncachex();
+  flt_t * _noalias const ncachey = buffers->get_ncachey();
+  flt_t * _noalias const ncachez = buffers->get_ncachez();
+  int * _noalias const ncachej = buffers->get_ncachej();
+  int * _noalias const ncachejtype = buffers->get_ncachejtype();
+  const int ncache_stride = buffers->ncache_stride();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * _noalias const binhead = this->binhead;
+  const int * _noalias const bins = this->bins;
+  const int cop = _fix->coprocessor_number();
+  const int separate_buffers = _fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(atombin:length(aend) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \
+    in(ncachejtype:length(0) alloc_if(0) free_if(0)) \
+    in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \
+    in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
+    in(pack_width) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(tag)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    int nstencilp = 0;
+    int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL];
+    for (int k = 0; k < nstencil; k++) {
+      binstart[nstencilp] = stencil[k];
+      int end = stencil[k] + 1;
+      for (int kk = k + 1; kk < nstencil; kk++) {
+        if (stencil[kk-1]+1 == stencil[kk]) {
+	  end++;
+	  k++;
+        } else break;
+      }
+      binend[nstencilp] = end;
+      nstencilp++;
+    }
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(numneigh, overflow, nstencilp, binstart, binend)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
+      #endif
+
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+
+      if (THREE) {
+	IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
+      } else {
+	IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      }
+      ifrom += astart;
+      ito += astart;
+      int e_ito = ito;
+      if (THREE && ito == num) {
+	int imod = ito % pack_width;
+	if (imod) e_ito += pack_width - imod;
+      }
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
+
+      int which;
+
+      int pack_offset = maxnbors;
+      if (THREE) pack_offset *= pack_width;
+      int ct = (ifrom + tid * 2) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;
+
+      const int toffs = tid * ncache_stride;
+      flt_t * _noalias const tx = ncachex + toffs;
+      flt_t * _noalias const ty = ncachey + toffs;
+      flt_t * _noalias const tz = ncachez + toffs;
+      int * _noalias const tj = ncachej + toffs;
+      int * _noalias const tjtype = ncachejtype + toffs;
+
+      flt_t * _noalias itx;
+      flt_t * _noalias ity;
+      flt_t * _noalias itz;
+      int * _noalias itj;
+      int * _noalias itjtype;
+
+      // loop over all atoms in other bins in stencil, store every pair
+      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
+      if (THREE) {
+	lane = 0;
+	max_chunk = 0;
+      }
+      for (int i = ifrom; i < ito; i++) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+	tagint itag;
+	if (THREE) itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        const int ibin = atombin[i];
+	if (ibin != oldbin) {
+	  oldbin = ibin;
+	  ncount = 0;
+	  for (int k = 0; k < nstencilp; k++) {
+	    const int bstart = binhead[ibin + binstart[k]];
+	    const int bend = binhead[ibin + binend[k]];
+            #if defined(LMP_SIMD_COMPILER)
+	    #pragma vector aligned
+	    #pragma simd
+	    #endif
+	    for (int jj = bstart; jj < bend; jj++)
+	      tj[ncount++] = binpacked[jj];
+	  }
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+	  #pragma simd
+	  #endif
+	  for (int u = 0; u < ncount; u++) {
+	    const int j = tj[u];
+	    tx[u] = x[j].x;
+	    ty[u] = x[j].y;
+	    tz[u] = x[j].z;
+	    tjtype[u] = x[j].w;
+	  }
+
+	  if (FULL == 0 || TRI == 1) {
+	    icount = 0;
+	    istart = ncount;
+	    const int alignb = INTEL_DATA_ALIGN / sizeof(int);
+	    int nedge = istart % alignb;
+	    if (nedge) istart += (alignb - nedge);
+	    itx = tx + istart;
+	    ity = ty + istart;
+	    itz = tz + istart;
+	    itj = tj + istart;
+	    itjtype = tjtype + istart;
+
+            const int bstart = binhead[ibin];
+	    const int bend = binhead[ibin + 1];
+            #if defined(LMP_SIMD_COMPILER)
+	    #pragma vector aligned
+	    #pragma simd
+	    #endif
+	    for (int jj = bstart; jj < bend; jj++) {
+	      const int j = binpacked[jj];
+	      itj[icount] = j;
+	      itx[icount] = x[j].x;
+	      ity[icount] = x[j].y;
+	      itz[icount] = x[j].z;
+	      itjtype[icount] = x[j].w;
+	      icount++;
+	    }
+	    if (icount + istart > obound) *overflow = 1;
+	  } else
+	    if (ncount > obound) *overflow = 1;
+	}
+
+	// ---------------------- Loop over i bin
+
+        int n = 0;
+	if (FULL == 0 || TRI == 1) {
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+	  #pragma ivdep
+	  #endif
+	  for (int u = 0; u < icount; u++) {
+	    int addme = 1;
+	    int j = itj[u];
+
+	    // Cutoff Check
+	    const flt_t delx = xtmp - itx[u];
+	    const flt_t dely = ytmp - ity[u];
+	    const flt_t delz = ztmp - itz[u];
+	    const int jtype = itjtype[u];
+	    const flt_t rsq = delx * delx + dely * dely + delz * delz;
+	    if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
+	  
+	    // i bin (half) check and offload ghost check
+	    if (j < nlocal) {
+       	      const int ijmod = (i + j) % 2;
+	      if (i > j) {
+	        if (ijmod == 0) addme = 0;
+	      } else if (i < j) {
+	        if (ijmod == 1) addme = 0;
+	      } else 
+ 		addme = 0;
+              #ifdef _LMP_INTEL_OFFLOAD
+	      if (offload_noghost && i < offload_end) addme = 0;
+	      #endif
+	    } else {
+              #ifdef _LMP_INTEL_OFFLOAD
+	      if (offload_noghost && offload) addme = 0;
+	      #endif
+	      if (itz[u] < ztmp) addme = 0;
+	      if (itz[u] == ztmp) {
+                if (ity[u] < ytmp) addme = 0;
+                if (ity[u] == ytmp && itx[u] < xtmp) addme = 0;
+              }
+            } 
+
+	    if (need_ic) {
+	      int no_special;
+	      ominimum_image_check(no_special, delx, dely, delz);
+	      if (no_special)
+		j = -j - 1;
+	    }
+
+	    if (addme)
+	      neighptr[n++] = j;
+	  }
+	} // if FULL==0
+
+	// ---------------------- Loop over other bins
+
+	int n2, *neighptr2;
+	if (THREE) {
+	  n = pack_offset;
+	  n2 = pack_offset + maxnbors;
+	  neighptr2 = neighptr;
+	}
+	#if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
+	#endif
+	for (int u = 0; u < ncount; u++) {
+	  int addme = 1;
+          int j = tj[u];
+
+	  if (FULL)
+	    if (i == j) addme = 0;
+
+	  // Cutoff Check
+          const flt_t delx = xtmp - tx[u];
+          const flt_t dely = ytmp - ty[u];
+          const flt_t delz = ztmp - tz[u];
+	  const int jtype = tjtype[u];
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
+	  
+	  // Triclinic
+	  if (TRI) {
+	    if (tz[u] < ztmp) addme = 0;
+	    if (tz[u] == ztmp) {
+	      if (ty[u] < ytmp) addme = 0;
+	      if (ty[u] == ytmp) {
+	        if (tx[u] < xtmp) addme = 0;
+                if (tx[u] == xtmp && j <= i) addme = 0;
+              }
+	    }
+	  }
+
+	  // offload ghost check
+          #ifdef _LMP_INTEL_OFFLOAD
+	  if (offload_noghost) {
+	    if (j < nlocal) {
+	      if (i < offload_end) addme = 0;
+            } else if (offload) addme = 0;
+	  }
+	  #endif
+
+	  int pj;
+	  if (THREE) pj = j;
+	  if (need_ic) {
+	    int no_special;
+	    ominimum_image_check(no_special, delx, dely, delz);
+	    if (no_special)
+	      j = -j - 1;
+	  }
+
+	  if (THREE) {
+	    const int jtag = tag[pj];
+	    int flist = 0;
+	    if (itag > jtag) {
+	      if ((itag+jtag) % 2 == 0) flist = 1;
+	    } else if (itag < jtag) {
+	      if ((itag+jtag) % 2 == 1) flist = 1;
+	    } else {
+	      if (tz[u] < ztmp) flist = 1;
+	      else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1;
+	      else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) 
+	        flist = 1;
+	    }
+	    if (addme) {
+	      if (flist)
+		neighptr2[n2++] = j;
+	      else
+		neighptr[n++] = j;
+	    }
+	  } else {
+	    if (addme)
+	      neighptr[n++] = j;
+	  }
+	} // for u
+
+        #ifndef _LMP_INTEL_OFFLOAD
+	if (exclude) {
+	  int alln = n;
+	  if (THREE) n = pack_offset;
+	  else n = 0;
+	  for (int u = pack_offset; u < alln; u++) {
+	    const int j = neighptr[u];
+	    int pj = j;
+	    if (need_ic)
+	      if (pj < 0) pj = -j - 1;
+	    const int jtype = x[pj].w;
+	    if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+	    neighptr[n++] = j;
+          }
+	  if (THREE) {
+	    alln = n2;
+	    n2 = pack_offset + maxnbors;
+	    for (int u = pack_offset + maxnbors; u < alln; u++) {
+	      const int j = neighptr[u];
+	      int pj = j;
+	      if (need_ic)
+		if (pj < 0) pj = -j - 1;
+	      const int jtype = x[pj].w;
+	      if (exclusion(i,pj,itype,jtype,mask,molecule)) continue;
+	      neighptr[n2++] = j;
+	    }
+	  }
+        }
+	#endif
+	int ns;
+	if (THREE) {
+	  int alln = n;
+	  ns = n - pack_offset;
+	  atombin[i] = ns;
+	  n = lane;
+	  for (int u = pack_offset; u < alln; u++) {
+	    neighptr[n] = neighptr[u];
+	    n += pack_width;
+	  }
+	  ns += n2 - pack_offset - maxnbors;
+	  for (int u = pack_offset + maxnbors; u < n2; u++) {
+	    neighptr[n] = neighptr[u];
+	    n += pack_width;
+	  }
+	  if (ns > maxnbors) *overflow = 1;
+	} else
+	  if (n > maxnbors) *overflow = 1;
+
+        ilist[i] = i;
+        cnumneigh[i] = ct;
+	if (THREE) {
+	  cnumneigh[i] += lane;
+	  numneigh[i] = ns;
+	} else {
+	  int edge = (n % pad_width);
+	  if (edge) {
+	    const int pad_end = n + (pad_width - edge);
+            #if defined(LMP_SIMD_COMPILER)
+	    #pragma vector aligned
+            #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
+	            avg=INTEL_COMPILE_WIDTH/2
+            #endif
+            for ( ; n < pad_end; n++)
+              neighptr[n] = e_nall;
+          }
+	  numneigh[i] = n;
+	}
+
+	if (THREE) {
+  	  if (ns > max_chunk) max_chunk = ns;
+	  lane++;
+	  if (lane == pack_width) {
+	    ct += max_chunk * pack_width;
+	    const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	    const int edge = (ct % alignb);
+	    if (edge) ct += alignb - edge;
+	    neighptr = firstneigh + ct;
+	    max_chunk = 0;
+	    pack_offset = maxnbors * pack_width;
+	    lane = 0;
+	    if (ct + obound > list_size) {
+	      if (i < ito - 1) {
+		*overflow = 1;
+		ct = (ifrom + tid * 2) * maxnbors;
+	      }
+	    }
+	  }
+	} else {
+	  ct += n;
+	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	  const int edge = (ct % alignb);
+	  if (edge) ct += alignb - edge;
+	  neighptr = firstneigh + ct;
+	  if (ct + obound > list_size) {
+	    if (i < ito - 1) {
+	      *overflow = 1;
+	      ct = (ifrom + tid * 2) * maxnbors;
+	    }
+	  }
+	}
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  #if __INTEL_COMPILER+0 > 1499
+	  #pragma vector aligned
+          #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	  #endif
+	  for (int jj = 0; jj < jnum; jj++) {
+ 	    int j = jlist[jj];
+	    if (need_ic && j < 0) j = -j - 1;
+            if (j < nlocal) {
+              if (j < vlmin) vlmin = j;
+              if (j > vlmax) vlmax = j;
+            } else {
+              if (j < vgmin) vgmin = j;
+              if (j > vgmax) vgmax = j;
+            }
+	  }
+	}
+	lmin = MIN(lmin,vlmin);
+	gmin = MIN(gmin,vgmin);
+	lmax = MAX(lmax,vlmax);
+	gmax = MAX(gmax,vgmax);
+
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+  	  if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+	  if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+	  if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+	  if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+	#pragma omp barrier
+	
+	int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+ 	if (nghost < 0) nghost = 0;
+	if (offload) {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+	  nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+	  nall_offset = nlocal + nghost;
+	}
+      } // if separate_buffers
+      #endif
+
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+
+	  if (THREE) {
+	    const int trip = jnum * pack_width;
+            for (int jj = 0; jj < trip; jj+=pack_width) {
+              const int j = jlist[jj];
+	      if (need_ic && j < 0) {
+	        which = 0;
+	        jlist[jj] = -j - 1;
+              } else
+                ofind_special(which, special, nspecial, i, tag[j]);
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j >= nlocal) {
+	        if (j == e_nall)
+		  jlist[jj] = nall_offset;
+	        else if (which) 
+		  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	        else jlist[jj]-=ghost_offset;
+              } else
+	      #endif
+              if (which) jlist[jj] = j ^ (which << SBBITS);
+	    }
+	  } else {
+            #if defined(LMP_SIMD_COMPILER)
+	    #pragma vector aligned
+            #pragma simd
+	    #endif 
+            for (int jj = 0; jj < jnum; jj++) {
+              const int j = jlist[jj];
+	      if (need_ic && j < 0) {
+	        which = 0;
+	        jlist[jj] = -j - 1;
+              } else
+                ofind_special(which, special, nspecial, i, tag[j]);
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j >= nlocal) {
+	        if (j == e_nall)
+		  jlist[jj] = nall_offset;
+	        else if (which) 
+		  jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	        else jlist[jj]-=ghost_offset;
+              } else
+	      #endif
+              if (which) jlist[jj] = j ^ (which << SBBITS);
+            }
+	  }
+	} // for i
+      } // if molecular
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * _noalias jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  int jj = 0;
+	  #pragma vector aligned
+	  #pragma simd
+	  for (jj = 0; jj < jnum; jj++) {
+	    if (jlist[jj] >= nlocal) {
+ 	      if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	      else jlist[jj] -= ghost_offset;
+	    }
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    _fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    _fix->start_watch(TIME_HOST_NEIGHBOR);
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+  } else {
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    if (separate_buffers) {
+      _fix->start_watch(TIME_PACK);
+      _fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(_fix->host_min_local(),
+				    _fix->host_used_local(),
+				    _fix->host_min_ghost(),
+				    _fix->host_used_ghost());
+      _fix->stop_watch(TIME_PACK);
+    }
+  }
+  #else
+  #pragma vector aligned
+  #pragma simd
+  for (int i = astart; i < aend; i++)
+    list->firstneigh[i] = firstneigh + cnumneigh[i];
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
 #ifdef _LMP_INTEL_OFFLOAD
 void NPairIntel::grow_stencil()
 {
@@ -62,6 +734,204 @@ void NPairIntel::grow_stencil()
     const int maxstencil = ns->get_maxstencil();
     #pragma offload_transfer target(mic:_cop)	\
       in(stencil:length(maxstencil) alloc_if(1) free_if(0))
   }
 }
 #endif
+
+/* ---------------------------------------------------------------------- */
+
+// ---- Half, no IC
+
+template void NPairIntel::bin_newton<float, float, 0, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Half, IC
+
+template void NPairIntel::bin_newton<float, float, 0, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Tri, no IC
+
+template void NPairIntel::bin_newton<float, float, 0, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Tri, IC
+
+template void NPairIntel::bin_newton<float, float, 0, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Full, no IC
+
+template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Full, IC
+
+template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- 3-body, no IC
+
+template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- 3-body, IC
+
+template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+#ifdef _LMP_INTEL_OFFLOAD
+
+// ---- Half, no IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 0, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Half, IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 1, 0, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Tri, no IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 0, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Tri, IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 1, 0, 1, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Full, no IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- Full, IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 0>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- 3-body, no IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+// ---- 3-body, IC, no ghost
+
+template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,float> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<float,double> *, const int, const int,
+   const int);
+template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 1>
+  (const int, NeighList *, IntelBuffers<double,double> *, const int, const int,
+   const int);
+
+#endif
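
Because the bin_newton body lives in this .cpp file rather than in the header, every flag/precision combination that other translation units invoke must be instantiated explicitly, which is what the block above does for the half, tri, full, and 3-body variants. A standalone sketch of the pattern, with hypothetical names:

#include <cstdio>

template <class flt_t, int FLAG>
void kernel(flt_t x) { std::printf("%g %d\n", (double)x, FLAG); }

// Without these explicit instantiations, callers defined in other .cpp
// files would fail at link time with "undefined reference".
template void kernel<float, 0>(float);
template void kernel<double, 1>(double);

int main() { kernel<float, 0>(1.5f); kernel<double, 1>(2.5); return 0; }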
diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h
index 06d5d79cac..51574a252c 100644
--- a/src/USER-INTEL/npair_intel.h
+++ b/src/USER-INTEL/npair_intel.h
@@ -25,10 +25,6 @@
 #include "intel_simd.h"
 #endif
 
-#ifdef OUTER_CHUNK
-#include "intel_simd.h"
-#endif
-
 #ifdef _LMP_INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif
@@ -87,6 +83,10 @@ class NPairIntel : public NPair {
  protected:
   FixIntel *_fix;
 
+  template <class flt_t, class acc_t, int, int, int, int, int>
+  void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, 
+		  const int, const int, const int offload_end = 0);
+
   #ifdef _LMP_INTEL_OFFLOAD
   int _cop;
   int *_off_map_stencil;
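
Declaring bin_newton as a protected template member of the base class lets every derived neighbor-build style share the one kernel, with each subclass reduced to a thin wrapper that fixes the flags. A standalone sketch under that reading, with illustrative class names:

#include <cstdio>

struct NPairBase {
 protected:
  template <int TRI>
  void build(int n) { std::printf("tri=%d, atoms=%d\n", TRI, n); }
};

struct HalfBinNewtonTriDemo : NPairBase {
  // plays the role of hbnti(): a thin wrapper that fixes the flags
  void run(int n) { build<1>(n); }
};

int main() { HalfBinNewtonTriDemo().run(4); return 0; }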
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
index 4f34a484cb..cdea9e76c4 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@@ -85,53 +85,47 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag,
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				nthreads, sizeof(ATOM_T));
+				packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
   
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 				IntelBuffers<flt_t,acc_t> *buffers,
 				const ForceConst<flt_t> &fc,
@@ -165,7 +159,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -208,27 +202,26 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 			      f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none)        \
-      shared(f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
@@ -246,10 +239,9 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-        }
+	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
@@ -319,71 +311,72 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
           if (rsq < c_cuti[jtype].cutsq) {
           #endif
             const flt_t fpair = (forcecoul + forcebuck) * r2inv;
-            fxtmp += delx * fpair;
-            fytmp += dely * fpair;
-            fztmp += delz * fpair;
-            if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx * fpair;
-              f[j].y -= dely * fpair;
-              f[j].z -= delz * fpair;
-            }
+	    const flt_t fpx = fpair * delx;
+	    fxtmp += fpx;
+	    if (NEWTON_PAIR) f[j].x -= fpx;
+	    const flt_t fpy = fpair * dely;
+	    fytmp += fpy;
+	    if (NEWTON_PAIR) f[j].y -= fpy;
+	    const flt_t fpz = fpair * delz;
+	    fztmp += fpz;
+	    if (NEWTON_PAIR) f[j].z -= fpz;
+
             
-            if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i < nlocal)
-                ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j < nlocal)
-                ev_pre += (flt_t)0.5;
-              
-              if (EFLAG) {
-                sevdwl += ev_pre * evdwl;
-                secoul += ev_pre * ecoul;
-                if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                  if (NEWTON_PAIR || j < nlocal) 
-                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                }
+	    if (EFLAG) {
+	      sevdwl += evdwl;
+	      secoul += ecoul;
+	      if (eatom) {
+		fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+		if (NEWTON_PAIR) 
+		  f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
               }
-              IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
-            }
+	    }
+	    if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
-
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-
-        IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+        if (NEWTON_PAIR) {
+	  f[i].x += fxtmp;
+	  f[i].y += fytmp;
+	  f[i].z += fztmp;
+	} else {
+	  f[i].x = fxtmp;
+	  f[i].y = fytmp;
+	  f[i].z = fztmp;
+	}
+        IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride, 
-			       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = oecoul;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -395,7 +388,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -406,6 +399,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
 void PairBuckCoulCutIntel::init_style()
 {
   PairBuckCoulCut::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
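
Two related changes meet in this file: init_style now requests a full neighbor list when newton_pair is off, and eval() drops the EVFLAG template parameter, tallying according to the runtime vflag and halving the global energy/virial sums on the full-list path, since each i-j pair is then visited from both atoms. A standalone arithmetic sketch of the halving, using a hypothetical pair energy:

#include <cstdio>

int main() {
  const double e_pair = 2.0;           // hypothetical i-j pair energy
  const double e_half = e_pair;        // half list: pair tallied once
  const double e_full = (e_pair + e_pair) * 0.5;  // full list: tallied from
                                                  // i and from j, then halved
  std::printf("%g %g\n", e_half, e_full);         // both print 2
  return 0;
}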
diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.h b/src/USER-INTEL/pair_buck_coul_cut_intel.h
index 6590cd9c16..42a55ac21f 100644
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.h
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.h
@@ -49,7 +49,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
                const ForceConst<flt_t> &fc);
 
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
index 9319f531e1..a9aee1e53e 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@@ -85,53 +85,47 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag,
 
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+    
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				nthreads, sizeof(ATOM_T));
+				packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
   
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 				 IntelBuffers<flt_t,acc_t> *buffers,
 				 const ForceConst<flt_t> &fc,
@@ -170,9 +164,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -208,8 +210,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
-    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)	\
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -224,27 +228,34 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 			      f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none)        \
-      shared(f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
-
-      for (int i = iifrom; i < iito; ++i) {
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
         const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
@@ -262,85 +273,98 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
-	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-	}
+	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	if (NEWTON_PAIR == 0)
+	  if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
+	int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
-          flt_t forcecoul, forcebuck, evdwl, ecoul;
-          forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
-
-          const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
-
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+	  const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
+	  
+          if (rsq < c_forcei[jtype].cutsq) {
+	    trsq[ej]=rsq;
+	    tdelx[ej]=delx;
+	    tdely[ej]=dely;
+	    tdelz[ej]=delz;
+	    tjtype[ej]=jtype;
+	    tj[ej]=jlist[jj];
+	    ej++;
+	  }
+	}
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+		                 sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) {
+          flt_t forcecoul, forcebuck, evdwl, ecoul;
+          forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0;
+
+	  const int j = tj[jj] & NEIGHMASK;
+          const int sbindex = tj[jj] >> SBBITS & 3;
+	  const int jtype = tjtype[jj];
+	  const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
           const flt_t r = (flt_t)1.0 / sqrt(r2inv);
 
-          #ifdef INTEL_VMASK
-          if (rsq < c_forcei[jtype].cutsq) {
+          #ifdef INTEL_ALLOW_TABLE
+          if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
-            #ifdef INTEL_ALLOW_TABLE
-            if (!ncoultablebits || rsq <= tabinnersq) {
-            #endif
-              const flt_t A1 =  0.254829592;
-              const flt_t A2 = -0.284496736;
-              const flt_t A3 =  1.421413741;
-              const flt_t A4 = -1.453152027;
-              const flt_t A5 =  1.061405429;
-              const flt_t EWALD_F = 1.12837917;
-              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
-
-              const flt_t grij = g_ewald * r;
-              const flt_t expm2 = exp(-grij * grij);
-              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
-              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
-              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
-              if (EFLAG) ecoul = prefactor * erfc;
-
-	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+            const flt_t A1 =  0.254829592;
+	    const flt_t A2 = -0.284496736;
+	    const flt_t A3 =  1.421413741;
+	    const flt_t A4 = -1.453152027;
+	    const flt_t A5 =  1.061405429;
+	    const flt_t EWALD_F = 1.12837917;
+	    const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+	    
+	    const flt_t grij = g_ewald * r;
+	    const flt_t expm2 = exp(-grij * grij);
+	    const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+	    const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+	    const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+	    forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+	    if (EFLAG) ecoul = prefactor * erfc;
+
+	    const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+	      prefactor;
+	    forcecoul -= adjust;
+	    if (EFLAG) ecoul -= adjust;
+	    
+          #ifdef INTEL_ALLOW_TABLE
+          } else {
+	    float rsq_lookup = rsq;
+	    const int itable = (__intel_castf32_u32(rsq_lookup) &
+	      ncoulmask) >> ncoulshiftbits;
+	    const flt_t fraction = (rsq_lookup - table[itable].r) *
+	      table[itable].dr;
+	    
+	    const flt_t tablet = table[itable].f +
+	      fraction * table[itable].df;
+	    forcecoul = qtmp * q[j] * tablet;
+	    if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+	      fraction * detable[itable]);
+	    if (sbindex) {
+	      const flt_t table2 = ctable[itable] +
+		fraction * dctable[itable];
+	      const flt_t prefactor = qtmp * q[j] * table2;
+	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
 		prefactor;
 	      forcecoul -= adjust;
 	      if (EFLAG) ecoul -= adjust;
-
-            #ifdef INTEL_ALLOW_TABLE
-            } else {
-              float rsq_lookup = rsq;
-              const int itable = (__intel_castf32_u32(rsq_lookup) &
-                                  ncoulmask) >> ncoulshiftbits;
-              const flt_t fraction = (rsq_lookup - table[itable].r) *
-                table[itable].dr;
-
-              const flt_t tablet = table[itable].f +
-                fraction * table[itable].df;
-              forcecoul = qtmp * q[j] * tablet;
-              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
-                                                fraction * detable[itable]);
-              if (sbindex) {
-                const flt_t table2 = ctable[itable] +
-                  fraction * dctable[itable];
-                const flt_t prefactor = qtmp * q[j] * table2;
-                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
-                  prefactor;
-                forcecoul -= adjust;
-                if (EFLAG) ecoul -= adjust;
-              }
             }
-            #endif
-            #ifdef INTEL_VMASK
           }
-	  #endif
+          #endif
 
 	  #ifdef INTEL_VMASK
           if (rsq < c_forcei[jtype].cut_ljsq) {
@@ -361,80 +385,74 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
           #ifdef INTEL_VMASK
           }
           #else
-          if (rsq > c_forcei[jtype].cutsq)
-            { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
           if (rsq > c_forcei[jtype].cut_ljsq)
             { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; }
           #endif
 
-          #ifdef INTEL_VMASK
-          if (rsq < c_forcei[jtype].cutsq) {
-          #endif
-            const flt_t fpair = (forcecoul + forcebuck) * r2inv;
-            fxtmp += delx * fpair;
-            fytmp += dely * fpair;
-            fztmp += delz * fpair;
-            if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx * fpair;
-              f[j].y -= dely * fpair;
-              f[j].z -= delz * fpair;
+	  const flt_t fpair = (forcecoul + forcebuck) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+	  if (EFLAG) {
+            sevdwl += evdwl;
+	    secoul += ecoul;
+	    if (eatom) {
+	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+	      if (NEWTON_PAIR) 
+		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
-
-            if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i < nlocal)
-                ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j < nlocal)
-                ev_pre += (flt_t)0.5;
-
-              if (EFLAG) {
-                sevdwl += ev_pre * evdwl;
-                secoul += ev_pre * ecoul;
-                if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                  if (NEWTON_PAIR || j < nlocal) 
-                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                }
-              }
-              IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
-            }
-          #ifdef INTEL_VMASK
-          }
-          #endif
+	  }
+	  if (NEWTON_PAIR == 0)
+	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+				  fpx, fpy, fpz);
         } // for jj
-
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+	if (NEWTON_PAIR) {
+	  f[i].x += fxtmp;
+	  f[i].y += fytmp;
+	  f[i].z += fztmp;
+	} else {
+	  f[i].x = fxtmp;
+	  f[i].y = fytmp;
+	  f[i].z = fztmp;
+	}	  
+	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride, 
-			       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = oecoul;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -446,7 +464,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -457,6 +475,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
 void PairBuckCoulLongIntel::init_style()
 {
   PairBuckCoulLong::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
@@ -484,6 +506,13 @@ template <class flt_t, class acc_t>
 void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
   int tp1 = atom->ntypes + 1;
   int ntable = 1;
   if (ncoultablebits)
@@ -518,6 +547,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
+      if (cutsq[i][j] < cut_ljsq[i][j])
+        error->all(FLERR,
+          "Intel variant of buck/coul/long expects Buckingham cutoff <= coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].buck1 = buck1[i][j];
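
The kernel rewrite above follows one pattern in all converted styles: with
newton off, the style walks a full neighbor list, so each i-j pair is
visited from both sides. The inner loop then writes only the i-side
accumulators, and the doubled energy/virial totals are halved once at the
end. A minimal sketch of that pattern, with placeholder names (F4 and
tally_x are illustrative, not the actual LAMMPS symbols):

    // Placeholder 4-component force entry, standing in for FORCE_T.
    struct F4 { double x, y, z, w; };

    template <int NEWTON_PAIR>
    inline void tally_x(double &fxtmp, F4 *f, const int j,
                        const double fpair, const double delx,
                        double &sevdwl, const double evdwl) {
      const double fpx = fpair * delx;
      fxtmp += fpx;                    // always accumulate on atom i
      if (NEWTON_PAIR) f[j].x -= fpx;  // half list only: also update atom j
      sevdwl += evdwl;                 // a full list counts each pair twice...
    }
    // ...so the newton-off path halves the doubled totals once, e.g.
    //   if (NEWTON_PAIR == 0) oevdwl *= 0.5;
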
diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.h b/src/USER-INTEL/pair_buck_coul_long_intel.h
index 57e4517404..ec2cdba177 100644
--- a/src/USER-INTEL/pair_buck_coul_long_intel.h
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.h
@@ -40,7 +40,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
 
  private:
   FixIntel *fix;
-  int _cop, _lrt;
+  int _cop, _lrt, _ccache_stride;
 
   template <class flt_t> class ForceConst;
 
@@ -48,7 +48,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong {
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
                const ForceConst<flt_t> &fc);
 
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
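
The header change repeats across every converted style: the EVFLAG
template parameter is dropped, energy stays a compile-time switch (EFLAG),
and the virial is now tested at run time through the vflag argument. A
side-by-side sketch of the effect (signatures abbreviated):

    // old: template <int EVFLAG, int EFLAG, int NEWTON_PAIR, ...>
    //      void eval(const int offload, const int vflag, ...);
    // new: template <int EFLAG, int NEWTON_PAIR, ...>
    //      void eval(const int offload, const int vflag, ...);
    // compute() then needs only the four variants
    // eval<1,1>, eval<1,0>, eval<0,1>, eval<0,0>.
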
diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp
index 4815d1e025..bbfc7225dd 100644
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@@ -78,57 +78,51 @@ void PairBuckIntel::compute(int eflag, int vflag,
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				nthreads, sizeof(ATOM_T));
+				packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
   
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairBuckIntel::eval(const int offload, const int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc,
-				     const int astart, const int aend)
+			 IntelBuffers<flt_t,acc_t> *buffers,
+			 const ForceConst<flt_t> &fc,
+			 const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -152,7 +146,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -192,27 +186,26 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 			      f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl =  (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none)        \
-      shared(f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
@@ -228,10 +221,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl =  (acc_t)0;
+	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+	if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-        }
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
@@ -284,69 +276,70 @@ void PairBuckIntel::eval(const int offload, const int vflag,
                 evdwl *= factor_lj;
             }
             const flt_t fpair =  forcebuck * r2inv;
-            fxtmp += delx * fpair;
-            fytmp += dely * fpair;
-            fztmp += delz * fpair;
-            if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx * fpair;
-              f[j].y -= dely * fpair;
-              f[j].z -= delz * fpair;
-            }
-            
-            if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i < nlocal)
-                ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j < nlocal)
-                ev_pre += (flt_t)0.5;
-              
-              if (EFLAG) {
-                sevdwl += ev_pre * evdwl;
-                if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-                    fwtmp += (flt_t)0.5 * evdwl;
-                  if (NEWTON_PAIR || j < nlocal) 
-                    f[j].w += (flt_t)0.5 * evdwl;
-                }
-              }
-              IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
-            }
+	    const flt_t fpx = fpair * delx;
+	    fxtmp += fpx;
+	    if (NEWTON_PAIR) f[j].x -= fpx;
+	    const flt_t fpy = fpair * dely;
+	    fytmp += fpy;
+	    if (NEWTON_PAIR) f[j].y -= fpy;
+	    const flt_t fpz = fpair * delz;
+	    fztmp += fpz;
+	    if (NEWTON_PAIR) f[j].z -= fpz;
+
+	    if (EFLAG) {
+	      sevdwl += evdwl;
+	      if (eatom) {
+		fwtmp += (flt_t)0.5 * evdwl;
+		if (NEWTON_PAIR)
+		  f[j].w += (flt_t)0.5 * evdwl;
+	      }
+	    }
+	    if (NEWTON_PAIR == 0)
+	      IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           }
           #endif
         } // for jj
-
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+	if (NEWTON_PAIR) {
+	  f[i].x += fxtmp;
+	  f[i].y += fytmp;
+	  f[i].z += fztmp;
+	} else {
+	  f[i].x = fxtmp;
+	  f[i].y = fytmp;
+	  f[i].z = fztmp;
+	}
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride, 
-			       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = (acc_t)0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -358,7 +351,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -367,6 +360,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
 void PairBuckIntel::init_style()
 {
   PairBuck::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
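
The init_style() addition is likewise shared by all converted styles: when
newton_pair is off, the most recent neighbor request is flipped from half
to full, which is what lets the kernels skip ghost-atom force updates. A
minimal sketch of the pattern, with a hypothetical style name:

    void SomePairIntel::init_style() {  // hypothetical style
      SomePair::init_style();           // parent files the default request
      NeighRequest *req = neighbor->requests[neighbor->nrequest-1];
      if (force->newton_pair == 0) {
        req->half = 0;                  // drop the default half list
        req->full = 1;                  // request a full list instead
      }
      req->intel = 1;                   // use the Intel-packed data layout
    }
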
diff --git a/src/USER-INTEL/pair_buck_intel.h b/src/USER-INTEL/pair_buck_intel.h
index 4f039c3f97..e699a1611e 100644
--- a/src/USER-INTEL/pair_buck_intel.h
+++ b/src/USER-INTEL/pair_buck_intel.h
@@ -48,7 +48,7 @@ private:
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers,
                const ForceConst<flt_t> &fc);
 
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp
index f8c972ab8b..541f9745cb 100644
--- a/src/USER-INTEL/pair_eam_intel.cpp
+++ b/src/USER-INTEL/pair_eam_intel.cpp
@@ -90,78 +90,58 @@ void PairEAMIntel::compute(int eflag, int vflag,
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-                                nthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
   if (_onetype) {
-    if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-        if (force->newton_pair) {
-          eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-        } else {
-          eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-        }
+    if (eflag) {
+      if (force->newton_pair) {
+	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        if (force->newton_pair) {
-          eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-        } else {
-          eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-        }
+	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-        eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
-        eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
+	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
-        eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
+	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
-    if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-        if (force->newton_pair) {
-          eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-        } else {
-          eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-        }
+    if (eflag) {
+      if (force->newton_pair) {
+	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        if (force->newton_pair) {
-          eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-        } else {
-          eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-        }
+	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-        eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
-        eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
+	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-        eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
-        eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
+	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
@@ -169,8 +149,7 @@ void PairEAMIntel::compute(int eflag, int vflag,
 
 /* ---------------------------------------------------------------------- */
 
-template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
-	  class acc_t>
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairEAMIntel::eval(const int offload, const int vflag,
 			IntelBuffers<flt_t,acc_t> *buffers,
 			const ForceConst<flt_t> &fc,
@@ -186,7 +165,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
     nmax = atom->nmax;
     int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN;
     if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t);
-    memory->create(rho,nmax*comm->nthreads,"pair:rho");
+    if (NEWTON_PAIR)
+      memory->create(rho,nmax*comm->nthreads,"pair:rho");
+    else
+      memory->create(rho,nmax,"pair:rho");
     memory->create(fp,nmax,"pair:fp");
     // Use single precision allocation for single/mixed mode
     // Keep double version for single and swap_eam
@@ -222,9 +204,17 @@ void PairEAMIntel::eval(const int offload, const int vflag,
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
@@ -252,16 +242,12 @@ void PairEAMIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(fp_f, f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
@@ -270,12 +256,25 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      double * _noalias const trho = rho + tid*nmax;
-      if (NEWTON_PAIR)
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) foff = tid * nmax;
+      else foff = 0;
+      double * _noalias const trho = rho + foff;
+      if (NEWTON_PAIR) {
+	memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 	memset(trho, 0, nall * sizeof(double));
-      else
-	memset(trho, 0, nlocal * sizeof(double));
+      }
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
 
       flt_t oscale;
       int rhor_joff, frho_ioff;
@@ -300,53 +299,67 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 	const flt_t ztmp = x[i].z;
 
 	acc_t rhoi = (acc_t)0.0;
-	#if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-        #pragma simd reduction(+:rhoi)
+	int ej = 0;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
 	#endif
 	for (int jj = 0; jj < jnum; jj++) {
-          int j, jtype;
-	  j = jlist[jj] & NEIGHMASK;
-
+	  const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
 	  const flt_t rsq = delx*delx + dely*dely + delz*delz;
 
 	  if (rsq < fcutforcesq) {
-	    if (!ONETYPE) jtype = x[j].w;
-	    flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
-	    int m = static_cast<int> (p);
-	    m = MIN(m,nr-1);
-	    p -= m;
-	    p = MIN(p,(flt_t)1.0);
-	    if (!ONETYPE)
-	      rhor_joff = rhor_ioff + jtype * jstride;
-	    const int joff = rhor_joff + m;
-	    flt_t ra;
-	    ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
-		  rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
-	    rhoi += ra;
-	    if (NEWTON_PAIR || j < nlocal) {
-	      if (!ONETYPE) {
-		const int ioff = jtype * istride + itype * jstride + m;
-		ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
-                      rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
-	      }
-	      trho[j] += ra;
-            }
+	    trsq[ej]=rsq;
+	    if (!ONETYPE) tjtype[ej]=x[j].w;
+	    tj[ej]=jlist[jj];
+	    ej++;
           }
+        }
+
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+        #pragma simd reduction(+:rhoi)
+	#endif
+        for (int jj = 0; jj < ej; jj++) {
+	  int jtype;
+	  const int j = tj[jj] & NEIGHMASK;
+	  if (!ONETYPE) jtype = tjtype[jj];
+	  const flt_t rsq = trsq[jj];
+	  flt_t p = sqrt(rsq)*frdr + (flt_t)1.0;
+	  int m = static_cast<int> (p);
+	  m = MIN(m,nr-1);
+	  p -= m;
+	  p = MIN(p,(flt_t)1.0);
+	  if (!ONETYPE)
+	    rhor_joff = rhor_ioff + jtype * jstride;
+	  const int joff = rhor_joff + m;
+	  flt_t ra;
+	  ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p +
+		rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d;
+	  rhoi += ra;
+	  if (NEWTON_PAIR) {
+	    if (!ONETYPE) {
+	      const int ioff = jtype * istride + itype * jstride + m;
+	      ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p +
+		    rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d;
+	    }
+	    trho[j] += ra;
+	  }
         } // for jj
-	trho[i] += rhoi;
+	if (NEWTON_PAIR)
+	  trho[i] += rhoi;
+	else
+	  trho[i] = rhoi;
       } // for i
 
       #if defined(_OPENMP)
-      if (nthreads > 1) {
+      if (NEWTON_PAIR && nthreads > 1) {
         #pragma omp barrier
         if (tid == 0) {  
-          int rcount;
-	  if (NEWTON_PAIR) rcount = nall;
-	  else rcount = nlocal;
+          const int rcount = nall;
 	  if (nthreads == 2) {
             double *trho2 = rho + nmax;
 	    #pragma vector aligned
@@ -431,10 +444,9 @@ void PairEAMIntel::eval(const int offload, const int vflag,
       #pragma omp barrier
       #endif
 
-      if (tid == 0) {
+      if (tid == 0)
         comm->forward_comm_pair(this);
-	memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
-      } else
+      if (NEWTON_PAIR)
 	memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       #if defined(_OPENMP)
@@ -462,124 +474,142 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 	const flt_t ytmp = x[i].y;
 	const flt_t ztmp = x[i].z;
 	fxtmp = fytmp = fztmp = (acc_t)0;
-	if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-        }
-
-	#if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl,	\
-	                         sv0, sv1, sv2, sv3, sv4, sv5)
+	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+	int ej = 0;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma vector aligned
+        #pragma ivdep
 	#endif
 	for (int jj = 0; jj < jnum; jj++) {
-          int j, jtype;
-	  j = jlist[jj] & NEIGHMASK;
-
+	  const int j = jlist[jj] & NEIGHMASK;
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
 	  const flt_t rsq = delx*delx + dely*dely + delz*delz;
 
-
 	  if (rsq < fcutforcesq) {
-	    if (!ONETYPE) jtype = x[j].w;
-	    const flt_t r = sqrt(rsq);
-	    flt_t p = r*frdr + (flt_t)1.0;
-	    int m = static_cast<int> (p);
-	    m = MIN(m,nr-1);
-	    p -= m;
-	    p = MIN(p,(flt_t)1.0);
-	    if (!ONETYPE)
-	      rhor_joff = rhor_ioff + jtype * jstride;
-	    const int joff = rhor_joff + m;
-	    const flt_t rhojp = (rhor_spline_f[joff].a*p + 
-                                 rhor_spline_f[joff].b)*p + 
-	                        rhor_spline_f[joff].c;
-	    flt_t rhoip;
-	    if (!ONETYPE) {
-	      const int ioff = jtype * istride + itype * jstride + m;
-	      rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + 
-		      rhor_spline_f[ioff].c;
-	    } else
-	      rhoip = rhojp;
-	    const flt_t z2p = (z2r_spline_t[joff].a*p + 
-                               z2r_spline_t[joff].b)*p + 
-                              z2r_spline_t[joff].c;
-	    const flt_t z2 = ((z2r_spline_t[joff].d*p + 
-			       z2r_spline_t[joff].e)*p + 
-			      z2r_spline_t[joff].f)*p + 
-	                     z2r_spline_t[joff].g;
-
-	    const flt_t recip = (flt_t)1.0/r;
-	    const flt_t phi = z2*recip;
-	    const flt_t phip = z2p*recip - phi*recip;
-	    const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
-	    if (!ONETYPE)
-	      oscale = scale_fi[jtype];
-	    const flt_t fpair = -oscale*psip*recip;
-
-	    fxtmp += delx*fpair;
-	    fytmp += dely*fpair;
-	    fztmp += delz*fpair;
-	    if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx*fpair;
-	      f[j].y -= dely*fpair;
-	      f[j].z -= delz*fpair;
-            }
+	    trsq[ej]=rsq;
+	    tdelx[ej]=delx;
+	    tdely[ej]=dely;
+	    tdelz[ej]=delz;
+	    if (!ONETYPE) tjtype[ej]=x[j].w;
+	    tj[ej]=jlist[jj];
+	    ej++;
+	  }
+	}
 
-	    if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i<nlocal)
-	        ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j<nlocal)
-	        ev_pre += (flt_t)0.5;
-
-	      if (EFLAG) {
-                const flt_t evdwl = oscale*phi;
-                sevdwl += ev_pre * evdwl;
-		if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-		    fwtmp += (flt_t)0.5 * evdwl;
-		  if (NEWTON_PAIR || j < nlocal)
-		    f[j].w += (flt_t)0.5 * evdwl;
-                }
-              }
-              IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
-	                           delx, dely, delz);
-            }
-          } // if rsq
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+		                 sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) {
+	  int jtype;
+	  const int j = tj[jj] & NEIGHMASK;
+	  if (!ONETYPE) jtype = tjtype[jj];
+	  const flt_t rsq = trsq[jj];
+	  const flt_t r = sqrt(rsq);
+	  flt_t p = r*frdr + (flt_t)1.0;
+	  int m = static_cast<int> (p);
+	  m = MIN(m,nr-1);
+	  p -= m;
+	  p = MIN(p,(flt_t)1.0);
+	  if (!ONETYPE)
+	    rhor_joff = rhor_ioff + jtype * jstride;
+	  const int joff = rhor_joff + m;
+	  const flt_t rhojp = (rhor_spline_f[joff].a*p + 
+			       rhor_spline_f[joff].b)*p + 
+	    rhor_spline_f[joff].c;
+	  flt_t rhoip;
+	  if (!ONETYPE) {
+	    const int ioff = jtype * istride + itype * jstride + m;
+	    rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + 
+	      rhor_spline_f[ioff].c;
+	  } else
+	    rhoip = rhojp;
+	  const flt_t z2p = (z2r_spline_t[joff].a*p + 
+			     z2r_spline_t[joff].b)*p + 
+	    z2r_spline_t[joff].c;
+	  const flt_t z2 = ((z2r_spline_t[joff].d*p + 
+			     z2r_spline_t[joff].e)*p + 
+			    z2r_spline_t[joff].f)*p + 
+	    z2r_spline_t[joff].g;
+
+	  const flt_t recip = (flt_t)1.0/r;
+	  const flt_t phi = z2*recip;
+	  const flt_t phip = z2p*recip - phi*recip;
+	  const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip;
+	  if (!ONETYPE)
+	    oscale = scale_fi[jtype];
+	  const flt_t fpair = -oscale*psip*recip;
+
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+	  if (EFLAG) {
+	    const flt_t evdwl = oscale*phi;
+	    sevdwl += evdwl;
+	    if (eatom) {
+	      fwtmp += (flt_t)0.5 * evdwl;
+	      if (NEWTON_PAIR)
+		f[j].w += (flt_t)0.5 * evdwl;
+	    }
+	  }
+	  if (NEWTON_PAIR == 0)
+	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+				  fpx, fpy, fpz);
         } // for jj
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-
-        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+	if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+	  sevdwl *= (acc_t)0.5;
+        }
+
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for i
 
-      if (vflag == 2) {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride,
-			       x, offload);
-      }
-
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } /// omp
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = (acc_t)0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -591,7 +621,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -604,6 +634,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
 void PairEAMIntel::init_style()
 {
   PairEAM::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
@@ -633,6 +667,13 @@ template <class flt_t, class acc_t>
 void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
 				    IntelBuffers<flt_t,acc_t> *buffers)
 {
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,nr,nrho,memory,_cop);
   buffers->set_ntypes(tp1);
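
The EAM kernel now splits each neighbor loop into two passes over the
per-thread coordinate cache sized in pack_force_const(): a scalar pass
that prunes by cutoff and packs the survivors contiguously, then a
#pragma simd pass over the packed data. A condensed sketch, where
distsq() and rho_spline() are hypothetical stand-ins for the inlined
distance and spline arithmetic:

    int ej = 0;
    for (int jj = 0; jj < jnum; jj++) {  // pass 1: prune and pack
      const int j = jlist[jj] & NEIGHMASK;
      const double rsq = distsq(i, j);   // hypothetical helper
      if (rsq < fcutforcesq) {
        trsq[ej] = rsq;                  // unit-stride cache entries
        tj[ej] = jlist[jj];
        ej++;
      }
    }
    #pragma simd reduction(+:rhoi)
    for (int jj = 0; jj < ej; jj++)      // pass 2: vectorizable math
      rhoi += rho_spline(trsq[jj]);      // hypothetical helper
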
diff --git a/src/USER-INTEL/pair_eam_intel.h b/src/USER-INTEL/pair_eam_intel.h
index f7fb71ad2c..c7bb3b7bd0 100644
--- a/src/USER-INTEL/pair_eam_intel.h
+++ b/src/USER-INTEL/pair_eam_intel.h
@@ -41,7 +41,7 @@ class PairEAMIntel : public PairEAM {
  protected:
 
   FixIntel *fix;
-  int _cop, _onetype;
+  int _cop, _onetype, _ccache_stride;
   float *fp_float;
 
   template <class flt_t>
@@ -53,7 +53,7 @@ class PairEAMIntel : public PairEAM {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, 
 	    class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
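
Both the buck/coul/long and eam conversions add the new _ccache_stride
member and size the cache during pack_force_const(); restated with
comments (a sketch of the setup, not a complete method):

    int off_ccache = 0;
    #ifdef _LMP_INTEL_OFFLOAD
    if (_cop >= 0) off_ccache = 1;  // mirror the cache on the coprocessor
    #endif
    buffers->grow_ccache(off_ccache, comm->nthreads, 1);
    _ccache_stride = buffers->ccache_stride();  // offset per thread slot
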
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index c1e3d1b37f..af96fcbb79 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -88,12 +88,16 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
     const int * const ellipsoid = atom->ellipsoid;
     QUAT_T * _noalias const quat = buffers->get_quat();
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
-      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads,
 				sizeof(ATOM_T));
       if (ago != 0) buffers->thr_pack(ifrom,ito,ago);
 
@@ -114,39 +118,29 @@ void PairGayBerneIntel::compute(int eflag, int vflag,
     fix->stop_watch(TIME_PACK);
   }
 
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairGayBerneIntel::eval(const int offload, const int vflag,
                              IntelBuffers<flt_t,acc_t> *buffers,
                              const ForceConst<flt_t> &fc,
@@ -167,8 +161,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   if (fix->separate_buffers()) {
     fix->start_watch(TIME_PACK);
     if (offload) {
-      #pragma omp parallel default(none) \
-	shared(buffers,nlocal,nall,bonus,ellipsoid)
+      #pragma omp parallel
       {
         int ifrom, ito, tid;
 	int nthreads = comm->nthreads;
@@ -258,7 +251,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -334,6 +327,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       x[nall].x = (flt_t)INTEL_BIGP;
       x[nall].y = (flt_t)INTEL_BIGP;
       x[nall].z = (flt_t)INTEL_BIGP;
+      x[nall].w = 1;
       quat[nall].w = (flt_t)1.0;
       quat[nall].i = (flt_t)0.0;
       quat[nall].j = (flt_t)0.0;
@@ -342,25 +336,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     #endif
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = (acc_t)0.0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0.0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    if (NEWTON_PAIR == 0) f_start[1].w = 0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride);
-      memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2;
+      else foff = -minlocal * 2;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
 
       flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors;
       flt_t * _noalias const delx_form = delx_formi + tid * max_nbors;
@@ -370,7 +364,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       int * _noalias const jlist_form = jlist_formi + tid * max_nbors;
 
       int ierror = 0;
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
         // const int i = ilist[ii];
         const int itype = x[i].w;
         const int ptr_off = itype * ntypes;
@@ -401,14 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
         fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
 
-        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
+	if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
+	if (NEWTON_PAIR == 0)
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
-        }
 
         bool multiple_forms = false;
         int packed_j = 0;
-        for (int jj = 0; jj < jnum; jj++) {
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+	#pragma ivdep
+	#endif
+	for (int jj = 0; jj < jnum; jj++) {
           int jm = jlist[jj];
           int j = jm & NEIGHMASK;
           const int jtype = x[j].w;
@@ -573,7 +570,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ME_cross3(tempv, tempv2, dUr);
           flt_t dUr2_0, dUr2_1, dUr2_2;
 
-          if (NEWTON_PAIR || j < nlocal) {
+          if (NEWTON_PAIR) {
             ME_vecmat(kappa, g2, tempv2);
             ME_cross3(tempv, tempv2, dUr2);
           }
@@ -588,7 +585,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           dchi_2 *= temp1;
           flt_t dchi2_0, dchi2_1, dchi2_2;
 
-          if (NEWTON_PAIR || j < nlocal) {
+          if (NEWTON_PAIR) {
             ME_vecmat(iota, b2, tempv);
             ME_cross3(tempv, iota, dchi2);
             dchi2_0 *= temp1;
@@ -630,7 +627,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           // compute d_eta for particle 2
 
           flt_t deta2_0, deta2_1, deta2_2;
-          if (NEWTON_PAIR || j < nlocal) {
+          if (NEWTON_PAIR) {
             deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
             ME_compute_eta_torque(g12, a2, shape2, temp);
 
@@ -672,7 +669,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) *
 	    (flt_t)-1.0;
 
-          if (NEWTON_PAIR || j < nlocal) {
+          if (NEWTON_PAIR) {
             rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) *
 	      (flt_t)-1.0;
             rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) *
@@ -714,7 +711,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 	    t2tmp += ttor_1;
 	    t3tmp += ttor_2;
 
-	    if (NEWTON_PAIR || j < nlocal) {
+	    if (NEWTON_PAIR) {
 	      rtor_0 *= factor_lj;
 	      rtor_1 *= factor_lj;
 	      rtor_2 *= factor_lj;
@@ -728,34 +725,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 	      f[jp].z += rtor_2;
 	    }
 
-	    if (EVFLAG) {
-	      flt_t ev_pre = (flt_t)0.0;
-	      if (NEWTON_PAIR || i < nlocal)
-		ev_pre += (flt_t)0.5;
-	      if (NEWTON_PAIR || j < nlocal)
-		ev_pre += (flt_t)0.5;
-
-	      if (EFLAG) {
-		evdwl = factor_lj * one_eng;
-		sevdwl += ev_pre * evdwl;
-		if (eatom) {
-		  if (NEWTON_PAIR || i < nlocal)
-		    fwtmp += (flt_t)0.5 * evdwl;
-		  if (NEWTON_PAIR || j < nlocal)
-		    f[j*2].w += (flt_t)0.5 * evdwl;
-		}
+	    if (EFLAG) {
+	      evdwl = factor_lj * one_eng;
+	      sevdwl += evdwl;
+	      if (eatom) {
+		fwtmp += (flt_t)0.5 * evdwl;
+		if (NEWTON_PAIR)
+		  f[j*2].w += (flt_t)0.5 * evdwl;
 	      }
+	    }
 
+	    if (NEWTON_PAIR == 0) {
 	      if (vflag == 1) {
-		ev_pre *= (flt_t)-1.0;
-		sv0 += ev_pre * delx_form[jj] * fforce_0;
-		sv1 += ev_pre * dely_form[jj] * fforce_1;
-		sv2 += ev_pre * delz_form[jj] * fforce_2;
-		sv3 += ev_pre * delx_form[jj] * fforce_1;
-		sv4 += ev_pre * delx_form[jj] * fforce_2;
-		sv5 += ev_pre * dely_form[jj] * fforce_2;
+		sv0 += delx_form[jj] * fforce_0;
+		sv1 += dely_form[jj] * fforce_1;
+		sv2 += delz_form[jj] * fforce_2;
+		sv3 += delx_form[jj] * fforce_1;
+		sv4 += delx_form[jj] * fforce_2;
+		sv5 += dely_form[jj] * fforce_2;
 	      }
-	    } // EVFLAG
+            } // NEWTON_PAIR == 0
 	  #ifdef INTEL_VMASK
 	  }
 	  #endif
@@ -767,19 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           ierror = 2;
 
         int ip = i * 2;
-        f[ip].x += fxtmp;
-        f[ip].y += fytmp;
-        f[ip].z += fztmp;
-        ip++;
-        f[ip].x += t1tmp;
-        f[ip].y += t2tmp;
-        f[ip].z += t3tmp;
-
-        if (EVFLAG) {
-          if (EFLAG) {
-            if (eatom) f[i * 2].w += fwtmp;
-            oevdwl += sevdwl;
-          }
+	if (NEWTON_PAIR) {
+	  f[ip].x += fxtmp;
+	  f[ip].y += fytmp;
+	  f[ip].z += fztmp;
+	  ip++;
+	  f[ip].x += t1tmp;
+	  f[ip].y += t2tmp;
+	  f[ip].z += t3tmp;
+	} else {
+	  f[ip].x = fxtmp;
+	  f[ip].y = fytmp;
+	  f[ip].z = fztmp;
+	  ip++;
+	  f[ip].x = t1tmp;
+	  f[ip].y = t2tmp;
+	  f[ip].z = t3tmp;
+	}
+
+	if (EFLAG) {
+	  oevdwl += sevdwl;
+	  if (eatom) f[i * 2].w += fwtmp;
+	}
+	if (NEWTON_PAIR == 0) {
           if (vflag == 1) {
             ov0 += sv0;
             ov1 += sv1;
@@ -791,56 +790,31 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         }
       } // for i
       int o_range;
-      if (NEWTON_PAIR)
+      if (NEWTON_PAIR) {
         o_range = nall;
-      else
-        o_range = nlocal;
-      if (offload == 0) o_range -= minlocal;
-      IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
-			     sizeof(FORCE_T));
-      const int two_iito = iito * 2;
-
-      acc_t *facc = &(f_start[0].x);
-      const int sto = two_iito * 4;
-      const int fst4 = f_stride * 4;
-      #if defined(_OPENMP)
-      #pragma omp barrier
-      #endif
-      int t_off = f_stride;
-      if (EFLAG && eatom) {
-        for (int t = 1; t < nthreads; t++) {
-          #if defined(LMP_SIMD_COMPILER)
-	  #pragma vector nontemporal
-	  #pragma novector
-          #endif
-          for (int n = iifrom * 2; n < two_iito; n++) {
-            f_start[n].x += f_start[n + t_off].x;
-            f_start[n].y += f_start[n + t_off].y;
-            f_start[n].z += f_start[n + t_off].z;
-            f_start[n].w += f_start[n + t_off].w;
-          }
-          t_off += f_stride;
-        }
-      } else {
-        for (int t = 1; t < nthreads; t++) {
+	if (offload == 0) o_range -= minlocal;
+	IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,
+                               sizeof(FORCE_T));
+	const int sto = iito * 8;
+	const int fst4 = f_stride * 4;
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+	acc_t *f_scalar = &f_start[0].x;
+        acc_t *f_scalar2 = f_scalar + fst4;
+	for (int t = 1; t < nthreads; t++) {
           #if defined(LMP_SIMD_COMPILER)
-	  #pragma vector nontemporal
-	  #pragma novector
+	  #pragma vector aligned
+	  #pragma simd
           #endif
-          for (int n = iifrom * 2; n < two_iito; n++) {
-            f_start[n].x += f_start[n + t_off].x;
-            f_start[n].y += f_start[n + t_off].y;
-            f_start[n].z += f_start[n + t_off].z;
-          }
-          t_off += f_stride;
+	  for (int n = iifrom * 8; n < sto; n++)
+	    f_scalar[n] += f_scalar2[n];
+	  f_scalar2 += fst4;
         }
-      }
 
-      if (EVFLAG) {
         if (vflag==2) {
-          const ATOM_T * _noalias const xo = x + minlocal;
+	  const ATOM_T * _noalias const xo = x + minlocal;
           #if defined(LMP_SIMD_COMPILER)
-	  #pragma vector nontemporal
 	  #pragma novector
           #endif
           for (int n = iifrom; n < iito; n++) {
@@ -852,26 +826,33 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
             ov4 += f_start[nt2].z * xo[n].x;
             ov5 += f_start[nt2].z * xo[n].y;
           }
-        }
+	}
       }
 
       if (ierror)
         f_start[1].w = ierror;
     } // omp
 
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = (acc_t)0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)-0.5;
+	ov1 *= (acc_t)-0.5;
+	ov2 *= (acc_t)-0.5;
+	ov3 *= (acc_t)-0.5;
+	ov4 *= (acc_t)-0.5;
+	ov5 *= (acc_t)-0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
 
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
@@ -884,7 +865,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
   else
     fix->add_result_array(f_start, 0, offload, 0, 0, 2);
@@ -895,6 +876,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 void PairGayBerneIntel::init_style()
 {
   PairGayBerne::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
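
The rewritten thread reduction in the Gay-Berne kernel relies on its force
layout: each atom owns two 4-component entries, so the per-thread copies
can be summed as one flat stream of 8 scalars per atom. The layout
assumed, as a sketch:

    //   f[2*i + 0] = { fx, fy, fz, e_atom }  // force + per-atom energy
    //   f[2*i + 1] = { tx, ty, tz, spare }   // torque; f_start[1].w is
    //                                        // reused for the ierror flag
    // Atom i spans scalars [8*i, 8*i + 8), each thread copy starts
    // f_stride * 4 scalars later, and the loop bound is sto = iito * 8.
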
diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h
index aaed31d567..07dfba14d1 100644
--- a/src/USER-INTEL/pair_gayberne_intel.h
+++ b/src/USER-INTEL/pair_gayberne_intel.h
@@ -43,7 +43,7 @@ class PairGayBerneIntel : public PairGayBerne {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index ce6e40141f..7548b6eea3 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -82,54 +82,48 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
 
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
-			      nthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
   // -------------------- Regular version
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 				     IntelBuffers<flt_t,acc_t> *buffers,
 				     const ForceConst<flt_t> &fc,
@@ -182,7 +176,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -236,25 +230,24 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 			      f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
       flt_t cutboth = cut_coulsq;
 
       const int toffs = tid * ccache_stride;
@@ -265,7 +258,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
       int * _noalias const tj = ccachei + toffs;
       int * _noalias const tjtype = ccachej + toffs;
 
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
 	//        const int i = ilist[ii];
         const int itype = x[i].w;
 
@@ -284,10 +277,9 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	if (NEWTON_PAIR == 0)
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-	}
 
 	int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
@@ -421,77 +413,76 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 	  #ifdef INTEL_VMASK
 	  }
 	  #else
-	  if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
 	  if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
 	  #endif
 
-	  const flt_t delx = tdelx[jj];
-	  const flt_t dely = tdely[jj];
-	  const flt_t delz = tdelz[jj];
 	  const flt_t fpair = (forcecoul + forcelj) * r2inv;
-	  fxtmp += delx * fpair;
-	  fytmp += dely * fpair;
-	  fztmp += delz * fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    f[j].x -= delx * fpair;
-	    f[j].y -= dely * fpair;
-	    f[j].z -= delz * fpair;
-	  }
-
-	  if (EVFLAG) {
-	    flt_t ev_pre = (flt_t)0;
-	    if (NEWTON_PAIR || i < nlocal)
-	      ev_pre += (flt_t)0.5;
-	    if (NEWTON_PAIR || j < nlocal)
-	      ev_pre += (flt_t)0.5;
-	    
-	    if (EFLAG) {
-	      sevdwl += ev_pre * evdwl;
-	      secoul += ev_pre * ecoul;
-	      if (eatom) {
-		if (NEWTON_PAIR || i < nlocal)
-		  fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-		if (NEWTON_PAIR || j < nlocal)
-		  f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-	      }
+	  const flt_t fpx = fpair * tdelx[jj];
+	  fxtmp += fpx;
+	  if (NEWTON_PAIR) f[j].x -= fpx;
+	  const flt_t fpy = fpair * tdely[jj];
+	  fytmp += fpy;
+	  if (NEWTON_PAIR) f[j].y -= fpy;
+	  const flt_t fpz = fpair * tdelz[jj];
+	  fztmp += fpz;
+	  if (NEWTON_PAIR) f[j].z -= fpz;
+
+	  if (EFLAG) {
+	    sevdwl += evdwl;
+	    secoul += ecoul;
+	    if (eatom) {
+	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+	      if (NEWTON_PAIR)
+		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
 	    }
-
-	    IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
-				 delx, dely, delz);
 	  }
+	  if (NEWTON_PAIR == 0)
+	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+				  fpx, fpy, fpz);
         } // for jj
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-
-	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-	  		       nlocal, minlocal, nthreads, f_start, f_stride,
-	                       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = oecoul;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) {
+	oevdwl *= (acc_t)0.5;
+	oecoul *= (acc_t)0.5;
       }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
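
The 0.5 factors exist because the Newton-off path walks a full neighbor list, so every i-j pair contributes twice, once from each atom; halving the reduced totals restores the half-list convention. A tiny self-contained check of that bookkeeping:

#include <cassert>

int main() {
  const double e_pair = 0.7;       // energy of a single i-j pair
  double oevdwl = e_pair + e_pair; // full list: tallied from i and from j
  const bool newton_pair = false;
  if (!newton_pair) oevdwl *= 0.5; // the correction applied above
  assert(oevdwl == e_pair);        // exact: doubling and halving are exact
  return 0;
}
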
@@ -503,7 +494,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -514,6 +505,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 void PairLJCharmmCoulLongIntel::init_style()
 {
   PairLJCharmmCoulLong::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
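
Switching the request from half to full when newton_pair is off is what supplies the double-counted list the kernel now expects: in a full list each atom sees all of its neighbors, while a half list stores each pair only once. A toy construction of both list shapes for three mutually interacting atoms:

#include <cstdio>
#include <vector>

int main() {
  const int n = 3;  // three atoms, all within the cutoff of each other
  std::vector<std::vector<int> > half(n), full(n);
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++) {
      if (j == i) continue;
      full[i].push_back(j);             // full list: both directions
      if (j > i) half[i].push_back(j);  // half list: each pair once
    }
  for (int i = 0; i < n; i++)
    printf("atom %d: half has %d, full has %d neighbors\n",
           i, (int)half[i].size(), (int)full[i].size());
  return 0;
}
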
@@ -541,11 +536,6 @@ template <class flt_t, class acc_t>
 void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
-  int tp1 = atom->ntypes + 1;
-  int ntable = 1;
-  if (ncoultablebits)
-    for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
-
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
@@ -553,6 +543,11 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
   buffers->grow_ccache(off_ccache, comm->nthreads, 1);
   _ccache_stride = buffers->ccache_stride();
 
+  int tp1 = atom->ntypes + 1;
+  int ntable = 1;
+  if (ncoultablebits)
+    for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
+
   fc.set_ntypes(tp1, ntable, memory, _cop);
   buffers->set_ntypes(tp1);
   flt_t **cutneighsq = buffers->get_cutneighsq();
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
index 6a207d8400..cafc412a91 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -48,7 +48,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index f26ff724c8..8a0bed2c01 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -83,57 +83,50 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
 
   if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-				nthreads, sizeof(ATOM_T));
+				packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
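
The pack loop is now parallelized only when more than INTEL_HTHREADS threads are available; for small thread counts the fork/join cost of the region outweighs a memory-bound copy. A minimal sketch of the same conditional-parallel idiom using OpenMP's if clause (the INTEL_HTHREADS value here is invented):

#include <cstdio>
#include <omp.h>

int main() {
  const int hthreads = 4;  // stand-in for INTEL_HTHREADS
  int packthreads;
  if (omp_get_max_threads() > hthreads) packthreads = omp_get_max_threads();
  else packthreads = 1;

  // The if clause makes the region run serially when parallel packing
  // would not pay off.
  #pragma omp parallel if(packthreads > 1)
  printf("packing on thread %d of %d\n",
         omp_get_thread_num(), omp_get_num_threads());
  return 0;
}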
 
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    if (force->newton_pair) {
+      eval<1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      if (force->newton_pair) {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-      } else {
-	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-      }
+      eval<1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   } else {
     if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+      eval<0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,1>(0, ovflag, buffers, fc, host_start, inum);
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      eval<0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<0,0>(0, ovflag, buffers, fc, host_start, inum);
     }
   }
 }
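
Since EVFLAG is gone, ovflag is computed unconditionally and handed to every eval() call; it folds the virial mode into a single integer that the kernel tests at runtime. A small sketch of the encoding, with names invented for illustration:

// Sketch of the virial-mode encoding used by the dispatch above:
//   0 - no virial, 1 - accumulate per pair, 2 - fdotr shortcut.
enum VirialMode { V_NONE = 0, V_PAIR = 1, V_FDOTR = 2 };

inline int encode_vflag(bool vflag_fdotr, bool vflag) {
  if (vflag_fdotr) return V_FDOTR;  // virial from sum of r_i . f_i
  if (vflag) return V_PAIR;         // tally each pair's contribution
  return V_NONE;
}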
 
 /* ---------------------------------------------------------------------- */
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
-				     IntelBuffers<flt_t,acc_t> *buffers,
-				     const ForceConst<flt_t> &fc,
-				     const int astart, const int aend)
+				  IntelBuffers<flt_t,acc_t> *buffers,
+				  const ForceConst<flt_t> &fc,
+				  const int astart, const int aend)
 {
   const int inum = aend - astart;
   if (inum == 0) return;
@@ -167,9 +160,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   const int ntypes = atom->ntypes + 1;
   const int eatom = this->eflag_atom;
 
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  const int ccache_stride = _ccache_stride;
+
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -204,8 +205,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     in(x:length(x_size) alloc_if(0) free_if(0)) \
     in(q:length(q_size) alloc_if(0) free_if(0)) \
     in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
     in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
-    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload)	\
     out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
     out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
     out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@@ -220,27 +223,34 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 			      f_stride, x, q);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
-
-      for (int i = iifrom; i < iito; ++i) {
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+
+      for (int i = iifrom; i < iito; i += iip) {
         const int itype = x[i].w;
 
         const int ptr_off = itype * ntypes;
@@ -258,86 +268,98 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         const flt_t ztmp = x[i].z;
         const flt_t qtmp = q[i];
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	if (NEWTON_PAIR == 0)
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-	}
 
+	int ej = 0;
         #if defined(LMP_SIMD_COMPILER)
-	#pragma vector aligned
-	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
-	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma vector aligned
+        #pragma ivdep
         #endif
         for (int jj = 0; jj < jnum; jj++) {
-          flt_t forcecoul, forcelj, evdwl, ecoul;
-          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
-
-          const int sbindex = jlist[jj] >> SBBITS & 3;
           const int j = jlist[jj] & NEIGHMASK;
-
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+	  const int jtype = x[j].w;
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
+	  if (rsq < c_forcei[jtype].cutsq) {
+	    trsq[ej]=rsq;
+	    tdelx[ej]=delx;
+	    tdely[ej]=dely;
+	    tdelz[ej]=delz;
+	    tjtype[ej]=jtype;
+	    tj[ej]=jlist[jj];
+	    ej++;
+	  }
+	}
+
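The loop that just closed is the gather half of a two-pass structure: neighbors inside the cutoff are packed contiguously into the aligned ccache arrays, and the loop that follows consumes that dense stream, which is what lets the #pragma simd reduction vectorize without per-lane cutoff masks. A freestanding sketch of the filter-then-compute pattern, with invented names:

#include <cstddef>
#include <vector>

// Pass 1 filters and packs; pass 2 runs the (vectorizable) kernel over a
// dense array with no cutoff test left inside it.
double two_pass_sum(const std::vector<double> &rsq_all, double cutsq) {
  std::vector<double> trsq;
  trsq.reserve(rsq_all.size());
  for (std::size_t jj = 0; jj < rsq_all.size(); jj++)  // gather pass
    if (rsq_all[jj] < cutsq) trsq.push_back(rsq_all[jj]);

  double esum = 0.0;
  for (std::size_t jj = 0; jj < trsq.size(); jj++)     // compute pass
    esum += 1.0 / trsq[jj];   // stand-in for the pair kernel
  return esum;
}
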
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+		                 sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < ej; jj++) {
+          flt_t forcecoul, forcelj, evdwl, ecoul;
+          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
+
+	  const int j = tj[jj] & NEIGHMASK;
+          const int sbindex = tj[jj] >> SBBITS & 3;
+	  const int jtype = tjtype[jj];
+	  const flt_t rsq = trsq[jj];
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
-	  #ifdef INTEL_VMASK
-	  if (rsq < c_forcei[jtype].cutsq) {
+          #ifdef INTEL_ALLOW_TABLE
+	  if (!ncoultablebits || rsq <= tabinnersq) {
           #endif
-            #ifdef INTEL_ALLOW_TABLE
-            if (!ncoultablebits || rsq <= tabinnersq) {
-            #endif
-              const flt_t A1 =  0.254829592;
-              const flt_t A2 = -0.284496736;
-              const flt_t A3 =  1.421413741;
-              const flt_t A4 = -1.453152027;
-              const flt_t A5 =  1.061405429;
-              const flt_t EWALD_F = 1.12837917;
-              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
-
-              const flt_t r = (flt_t)1.0 / sqrt(r2inv);
-              const flt_t grij = g_ewald * r;
-              const flt_t expm2 = exp(-grij * grij);
-              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
-              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
-              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
-              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
-              if (EFLAG) ecoul = prefactor * erfc;
-
-	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+	    const flt_t A1 =  0.254829592;
+	    const flt_t A2 = -0.284496736;
+	    const flt_t A3 =  1.421413741;
+	    const flt_t A4 = -1.453152027;
+	    const flt_t A5 =  1.061405429;
+	    const flt_t EWALD_F = 1.12837917;
+	    const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+	    const flt_t r = (flt_t)1.0 / sqrt(r2inv);
+	    const flt_t grij = g_ewald * r;
+	    const flt_t expm2 = exp(-grij * grij);
+	    const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+	    const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+	    const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+	    forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+	    if (EFLAG) ecoul = prefactor * erfc;
+	    
+	    const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+	      prefactor;
+	    forcecoul -= adjust;
+	    if (EFLAG) ecoul -= adjust;
+
+          #ifdef INTEL_ALLOW_TABLE
+          } else {
+	    float rsq_lookup = rsq;
+	    const int itable = (__intel_castf32_u32(rsq_lookup) &
+				ncoulmask) >> ncoulshiftbits;
+	    const flt_t fraction = (rsq_lookup - table[itable].r) *
+	      table[itable].dr;
+
+	    const flt_t tablet = table[itable].f +
+	      fraction * table[itable].df;
+	    forcecoul = qtmp * q[j] * tablet;
+	    if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+					      fraction * detable[itable]);
+	    if (sbindex) {
+	      const flt_t table2 = ctable[itable] +
+		fraction * dctable[itable];
+	      const flt_t prefactor = qtmp * q[j] * table2;
+	      const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
 		prefactor;
 	      forcecoul -= adjust;
 	      if (EFLAG) ecoul -= adjust;
-
-            #ifdef INTEL_ALLOW_TABLE
-            } else {
-              float rsq_lookup = rsq;
-              const int itable = (__intel_castf32_u32(rsq_lookup) &
-                  ncoulmask) >> ncoulshiftbits;
-              const flt_t fraction = (rsq_lookup - table[itable].r) *
-                  table[itable].dr;
-
-              const flt_t tablet = table[itable].f +
-                  fraction * table[itable].df;
-              forcecoul = qtmp * q[j] * tablet;
-              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
-                  fraction * detable[itable]);
-              if (sbindex) {
-                const flt_t table2 = ctable[itable] +
-                    fraction * dctable[itable];
-                const flt_t prefactor = qtmp * q[j] * table2;
-                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
-                    prefactor;
-                forcecoul -= adjust;
-                if (EFLAG) ecoul -= adjust;
-              }
-            }
-            #endif
-	  #ifdef INTEL_VMASK
+	    }
 	  }
-	  #endif
+          #endif
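
The non-table branch above is the Abramowitz & Stegun 7.1.26 polynomial for the complementary error function, erfc(x) ~ t(A1 + t(A2 + t(A3 + t(A4 + t*A5)))) * exp(-x^2) with t = 1/(1 + 0.3275911*x) and absolute error below 1.5e-7; the INV_EWALD_P form of t computes the same quantity. A standalone comparison against the libm erfc:

#include <cmath>
#include <cstdio>

// Abramowitz & Stegun 7.1.26, the same constants as in the kernel above.
double erfc_approx(double x) {
  const double A1 =  0.254829592, A2 = -0.284496736, A3 = 1.421413741,
               A4 = -1.453152027, A5 =  1.061405429;
  const double t = 1.0 / (1.0 + 0.3275911 * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * exp(-x * x);
}

int main() {
  for (double x = 0.0; x <= 3.0; x += 0.5)
    printf("x=%.1f  approx=%.9f  libm=%.9f\n", x, erfc_approx(x), erfc(x));
  return 0;
}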
 
 	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cut_ljsq) {
@@ -357,80 +379,79 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	  #ifdef INTEL_VMASK
 	  }
 	  #else
-	  if (rsq > c_forcei[jtype].cutsq)
-	    { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
 	  if (rsq > c_forcei[jtype].cut_ljsq)
 	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
 	  #endif
 
-	  #ifdef INTEL_VMASK
-	  if (rsq < c_forcei[jtype].cutsq) {
-	  #endif
-            const flt_t fpair = (forcecoul + forcelj) * r2inv;
-            fxtmp += delx * fpair;
-            fytmp += dely * fpair;
-            fztmp += delz * fpair;
-            if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx * fpair;
-              f[j].y -= dely * fpair;
-              f[j].z -= delz * fpair;
+	  const flt_t fpair = (forcecoul + forcelj) * r2inv;
+          const flt_t fpx = fpair * tdelx[jj];
+          fxtmp += fpx;
+          if (NEWTON_PAIR) f[j].x -= fpx;
+          const flt_t fpy = fpair * tdely[jj];
+          fytmp += fpy;
+          if (NEWTON_PAIR) f[j].y -= fpy;
+          const flt_t fpz = fpair * tdelz[jj];
+          fztmp += fpz;
+          if (NEWTON_PAIR) f[j].z -= fpz;
+
+	  if (EFLAG) {
+	    sevdwl += evdwl;
+	    secoul += ecoul;
+	    if (eatom) {
+	      fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+	      if (NEWTON_PAIR)
+		f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
             }
-
-            if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i < nlocal)
-                ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j < nlocal)
-                ev_pre += (flt_t)0.5;
-
-              if (EFLAG) {
-                sevdwl += ev_pre * evdwl;
-                secoul += ev_pre * ecoul;
-                if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                  if (NEWTON_PAIR || j < nlocal)
-                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
-                }
-              }
- 	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
-            }
-          #ifdef INTEL_VMASK
-	  }
-	  #endif
+          }
+	  if (NEWTON_PAIR == 0)
+	    IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj],
+				  fpx, fpy, fpz);
         } // for jj
 
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
-	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+	if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+	  f[i].y += fytmp;
+	  f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+	  f[i].y = fytmp;
+	  f[i].z = fztmp;
+        }
+
+	IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride,
-	                       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = oecoul;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) {
+	oevdwl *= (acc_t)0.5;
+	oecoul *= (acc_t)0.5;
       }
+      ev_global[0] = oevdwl;
+      ev_global[1] = oecoul;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -442,7 +463,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -453,6 +474,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 void PairLJCutCoulLongIntel::init_style()
 {
   PairLJCutCoulLong::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
@@ -480,6 +505,13 @@ template <class flt_t, class acc_t>
 void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
                                           IntelBuffers<flt_t,acc_t> *buffers)
 {
+  int off_ccache = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop >= 0) off_ccache = 1;
+  #endif
+  buffers->grow_ccache(off_ccache, comm->nthreads, 1);
+  _ccache_stride = buffers->ccache_stride();
+
   int tp1 = atom->ntypes + 1;
   int ntable = 1;
   if (ncoultablebits)
@@ -514,6 +546,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
 
   for (int i = 0; i < tp1; i++) {
     for (int j = 0; j < tp1; j++) {
+      if (cutsq[i][j] < cut_ljsq[i][j])
+	error->all(FLERR,
+	 "Intel variant of lj/cut/coul/long expects lj cutoff <= coulombic");
       fc.c_force[i][j].cutsq = cutsq[i][j];
       fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j];
       fc.c_force[i][j].lj1 = lj1[i][j];
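
The new check exists because the rewritten kernel gathers neighbors with the single outer cutoff (cutsq, here the coulombic one) and only masks the LJ term against cut_ljsq afterwards; an LJ cutoff larger than the coulombic one would silently drop LJ pairs in the gather pass. A small self-contained illustration of the constraint:

#include <cassert>

int main() {
  const double cutsq = 10.0 * 10.0;    // outer (coulombic) cutoff, squared
  const double cut_ljsq = 8.0 * 8.0;   // LJ cutoff, squared
  assert(cut_ljsq <= cutsq);           // what pack_force_const enforces

  const double rsq = 9.0 * 9.0;        // a pair between the two cutoffs
  bool gathered = rsq < cutsq;                  // survives the gather pass
  bool lj_active = gathered && rsq < cut_ljsq;  // masked out in the kernel
  assert(gathered && !lj_active);
  return 0;
}
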
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
index dad73d18bd..2b7d87c040 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@@ -42,13 +42,13 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
 
  private:
   FixIntel *fix;
-  int _cop, _lrt;
+  int _cop, _lrt, _ccache_stride;
 
   template <class flt_t> class ForceConst;
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index dd08dc023c..8620646343 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -75,85 +75,64 @@ void PairLJCutIntel::compute(int eflag, int vflag,
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-                                nthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
 
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
   if (_onetype) {
-    if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-	if (force->newton_pair) {
-	  eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-	} else {
-	  eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-	}
+    if (eflag) {
+      if (force->newton_pair) {
+	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	if (force->newton_pair) {
-	  eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-	} else {
-	  eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-	}
+	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
-	eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
+	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
-	eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
+	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   } else {
-    if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-	if (force->newton_pair) {
-	  eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-	} else {
-	  eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
-	}
+    if (eflag) {
+      if (force->newton_pair) {
+	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	if (force->newton_pair) {
-	  eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-	} else {
-	  eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-	  eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
-	}
+	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     } else {
       if (force->newton_pair) {
-	eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
-	eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
+	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
       } else {
-	eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
-	eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
+	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
       }
     }
   }
 }
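
PairLJCutIntel keeps a third template flag, ONETYPE, on top of EFLAG and NEWTON_PAIR: when all atoms share one type, the per-neighbor coefficient gathers disappear because the cutsq/lj1..lj4/offset loads hoist out of the inner loop. A reduced sketch of that specialization, with invented names:

// Sketch: with ONETYPE == 1 the coefficient load happens once per i-atom;
// otherwise it is a per-neighbor gather indexed by jtype.
template <int ONETYPE>
double lj_energy(const double *rsq, const int *jtype, const double *lj3,
                 const double *lj4, int n) {
  double e = 0.0;
  double c3 = lj3[0], c4 = lj4[0];  // hoisted when only one type exists
  for (int jj = 0; jj < n; jj++) {
    if (!ONETYPE) { c3 = lj3[jtype[jj]]; c4 = lj4[jtype[jj]]; }
    const double r2inv = 1.0 / rsq[jj];
    const double r6inv = r2inv * r2inv * r2inv;
    e += r6inv * (c3 * r6inv - c4);  // 12-6 Lennard-Jones energy
  }
  return e;
}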
 
-template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
-	  class acc_t>
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
 void PairLJCutIntel::eval(const int offload, const int vflag,
                           IntelBuffers<flt_t,acc_t> *buffers,
                           const ForceConst<flt_t> &fc,
@@ -181,7 +160,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -200,25 +179,24 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 			      f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
-      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
-      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       flt_t cutsq, lj1, lj2, lj3, lj4, offset;
       if (ONETYPE) {
@@ -229,7 +207,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 	lj4 = lj34[3].lj4;
 	offset = ljc12o[3].offset;
       }
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
         int itype, ptr_off;
         const FC_PACKED1_T * _noalias ljc12oi;
         const FC_PACKED2_T * _noalias lj34i;
@@ -250,10 +228,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
         fxtmp = fytmp = fztmp = (acc_t)0;
-        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-        }
+	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+	if (NEWTON_PAIR == 0)
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
         #if defined(LMP_SIMD_COMPILER)
         #pragma vector aligned
@@ -301,83 +278,84 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 	    else
 	      fpair = forcelj * r2inv;
 
-            fxtmp += delx * fpair;
-            fytmp += dely * fpair;
-            fztmp += delz * fpair;
-            if (NEWTON_PAIR || j < nlocal) {
-              f[j].x -= delx * fpair;
-              f[j].y -= dely * fpair;
-              f[j].z -= delz * fpair;
-            }
-
-            if (EVFLAG) {
-              flt_t ev_pre = (flt_t)0;
-              if (NEWTON_PAIR || i<nlocal)
-                ev_pre += (flt_t)0.5;
-              if (NEWTON_PAIR || j<nlocal)
-                ev_pre += (flt_t)0.5;
-
-              if (EFLAG) {
-		if (!ONETYPE) {
-		  lj3 = lj34i[jtype].lj3;
-		  lj4 = lj34i[jtype].lj4;
-		  offset = ljc12oi[jtype].offset;
-		}
-                evdwl = r6inv * (lj3 * r6inv - lj4);
-                #ifdef INTEL_VMASK
-		evdwl -= offset;
-		#else
-		if (rsq < cutsq) evdwl -= offset;
-		#endif
-                if (!ONETYPE) evdwl *= factor_lj;
-                sevdwl += ev_pre*evdwl;
-                if (eatom) {
-                  if (NEWTON_PAIR || i < nlocal)
-                    fwtmp += 0.5 * evdwl;
-                  if (NEWTON_PAIR || j < nlocal)
-                    f[j].w += 0.5 * evdwl;
-                }
+	    const flt_t fpx = fpair * delx;
+	    fxtmp += fpx;
+	    if (NEWTON_PAIR) f[j].x -= fpx;
+	    const flt_t fpy = fpair * dely;
+	    fytmp += fpy;
+	    if (NEWTON_PAIR) f[j].y -= fpy;
+	    const flt_t fpz = fpair * delz;
+	    fztmp += fpz;
+	    if (NEWTON_PAIR) f[j].z -= fpz;
+
+            if (EFLAG) {
+	      if (!ONETYPE) {
+		lj3 = lj34i[jtype].lj3;
+		lj4 = lj34i[jtype].lj4;
+		offset = ljc12oi[jtype].offset;
+	      }
+	      evdwl = r6inv * (lj3 * r6inv - lj4);
+              #ifdef INTEL_VMASK
+	      evdwl -= offset;
+              #else
+	      if (rsq < cutsq) evdwl -= offset;
+              #endif
+	      if (!ONETYPE) evdwl *= factor_lj;
+	      sevdwl += evdwl;
+	      if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+		  f[j].w += (flt_t)0.5 * evdwl;
               }
+	    }
 
-	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
-				   delx, dely, delz);
-            }
+	    if (NEWTON_PAIR == 0)
+	      IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
           #ifdef INTEL_VMASK
           } // if rsq
           #endif
         } // for jj
-        f[i].x += fxtmp;
-        f[i].y += fytmp;
-        f[i].z += fztmp;
+	if (NEWTON_PAIR) {
+	  f[i].x += fxtmp;
+	  f[i].y += fytmp;
+	  f[i].z += fztmp;
+	} else {
+	  f[i].x = fxtmp;
+	  f[i].y = fytmp;
+	  f[i].z = fztmp;
+	}
 
-	IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+	IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-	                       nlocal, minlocal, nthreads, f_start, f_stride,
-	                       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end omp
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-	ev_global[1] = (acc_t)0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+	ov0 *= (acc_t)0.5;
+	ov1 *= (acc_t)0.5;
+	ov2 *= (acc_t)0.5;
+	ov3 *= (acc_t)0.5;
+	ov4 *= (acc_t)0.5;
+	ov5 *= (acc_t)0.5;
       }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -389,7 +367,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -400,6 +378,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 void PairLJCutIntel::init_style()
 {
   PairLJCut::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
   neighbor->requests[neighbor->nrequest-1]->intel = 1;
 
   int ifix = modify->find_fix("package_intel");
diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h
index a9c77324f3..b577a04658 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@@ -45,8 +45,7 @@ class PairLJCutIntel : public PairLJCut {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
-	    class acc_t>
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
new file mode 100644
index 0000000000..99c7045098
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_lj_long_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+
+
+using namespace LAMMPS_NS;
+
+#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
+#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) :
+  PairLJLongCoulLong(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+
+PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel()
+{
+}
diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.h b/src/USER-INTEL/pair_lj_long_coul_long_intel.h
new file mode 100644
index 0000000000..b8e4e68928
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.h
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H
+
+#include "pair_lj_long_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+  class PairLJLongCoulLongIntel : public PairLJLongCoulLong {
+  public:
+    PairLJLongCoulLongIntel(class LAMMPS *);
+    virtual ~PairLJLongCoulLongIntel();
+
+  };
+}
+#endif
+#endif
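
The header uses the standard LAMMPS registration idiom: compiled with PAIR_CLASS defined, only the PairStyle(lj/long/coul/long/intel,...) line is visible, and the build machinery collects those lines into a factory mapping style names to constructors; compiled normally, the class declaration is seen instead. A stripped-down sketch of what such a factory amounts to (simplified, with invented names):

#include <map>
#include <string>

struct Pair { virtual ~Pair() {} };
struct PairExampleIntel : Pair {};

// Conceptually, each PairStyle(name,Class) line contributes one entry in a
// name -> creator map consulted when an input script names a pair style.
typedef Pair *(*PairCreator)();
static Pair *create_example() { return new PairExampleIntel(); }

int main() {
  std::map<std::string, PairCreator> pair_map;
  pair_map["lj/example/intel"] = create_example;  // invented style name
  Pair *p = pair_map["lj/example/intel"]();       // lookup + construct
  delete p;
  return 0;
}
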
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 09e00fd867..835f78664a 100644
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -109,85 +109,59 @@ void PairSWIntel::compute(int eflag, int vflag,
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
 
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
-                                nthreads, sizeof(ATOM_T));
+                                packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom, ito, ago);
     }
 
     fix->stop_watch(TIME_PACK);
   }
 
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
   if (_onetype) {
     if (_spq) {
-      if (evflag || vflag_fdotr) {
-	int ovflag = 0;
-	if (vflag_fdotr) ovflag = 2;
-	else if (vflag) ovflag = 1;
-	if (eflag) {
-	  eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	} else {
-	  eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	}
+      if (eflag) {
+	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
-      if (evflag || vflag_fdotr) {
-	int ovflag = 0;
-	if (vflag_fdotr) ovflag = 2;
-	else if (vflag) ovflag = 1;
-	if (eflag) {
-	  eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	} else {
-	  eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	}
+      if (eflag) {
+	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   } else {
     if (_spq) {
-      if (evflag || vflag_fdotr) {
-	int ovflag = 0;
-	if (vflag_fdotr) ovflag = 2;
-	else if (vflag) ovflag = 1;
-	if (eflag) {
-	  eval<1,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	} else {
-	  eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	}
+      if (eflag) {
+	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
-      if (evflag || vflag_fdotr) {
-	int ovflag = 0;
-	if (vflag_fdotr) ovflag = 2;
-	else if (vflag) ovflag = 1;
-	if (eflag) {
-	  eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	} else {
-	  eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	  eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
-	}
+      if (eflag) {
+	eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       } else {
-	eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+	eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
       }
     }
   }
@@ -196,7 +170,7 @@ void PairSWIntel::compute(int eflag, int vflag,
 /* ---------------------------------------------------------------------- */
 #ifndef LMP_USE_AVXCD
 
-template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
+template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
@@ -235,7 +209,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
@@ -276,19 +250,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
       iifrom += astart;
       iito += astart;
 
@@ -328,7 +298,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
         }
       }
 
-      for (int i = iifrom; i < iito; ++i) {
+      for (int i = iifrom; i < iito; i += iip) {
         int itype, itype_offset;
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
@@ -344,14 +314,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	const int jnumhalf = numneighhalf[i];
 
         acc_t fxtmp, fytmp, fztmp, fwtmp;
-        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+        acc_t sevdwl;
         fxtmp = fytmp = fztmp = (acc_t)0.0;
-        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
-        }
+	if (EFLAG) fwtmp = sevdwl = (acc_t)0;
 
 	int ejnum = 0, ejnumhalf = 0;
+	#if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+	#pragma ivdep
+	#endif
         for (int jj = 0; jj < jnum; jj++) {
           int j = jlist[jj];
           j &= NEIGHMASK;
@@ -390,8 +359,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	
         #if defined(LMP_SIMD_COMPILER)
 	#pragma vector aligned
-		#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-					 sv0, sv1, sv2, sv3, sv4, sv5)
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl)
 	#endif
         for (int jj = 0; jj < ejnum_pad; jj++) {
           acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
@@ -399,9 +367,6 @@ void PairSWIntel::eval(const int offload, const int vflag,
           if (EFLAG) fjtmp = (acc_t)0.0;
 	  int ijtype;
 
-          const flt_t delx = tdelx[jj];
-          const flt_t dely = tdely[jj];
-          const flt_t delz = tdelz[jj];
 	  if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
           const flt_t rsq1 = trsq[jj];
 
@@ -440,29 +405,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	  const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * 
 			       rainvsq) * expsrainv * rinvsq1;
 
-	  fxtmp -= delx * fpair;
-	  fytmp -= dely * fpair;
-	  fztmp -= delz * fpair;
-	  fjxtmp += delx * fpair;
-	  fjytmp += dely * fpair;
-	  fjztmp += delz * fpair;
+	  const flt_t delx = tdelx[jj];
+	  const flt_t dely = tdely[jj];
+	  const flt_t delz = tdelz[jj];
+	  const flt_t fpx = fpair * delx;
+	  fxtmp -= fpx;
+	  fjxtmp += fpx;
+	  const flt_t fpy = fpair * dely;
+	  fytmp -= fpy;
+	  fjytmp += fpy;
+	  const flt_t fpz = fpair * delz;
+	  fztmp -= fpz;
+	  fjztmp += fpz;
 
-	  if (EVFLAG) {
-	    if (EFLAG) {
-	      flt_t evdwl;
-	      if (!ONETYPE) {
-		c5 = p2e[ijtype].c5;
-		c6 = p2e[ijtype].c6;
-	      }
-	      evdwl = (c5 * rp - c6 * rq) * expsrainv;
-	      sevdwl += evdwl;
-	      if (eatom) {
-		fwtmp += (acc_t)0.5 * evdwl;
-		fjtmp += (acc_t)0.5 * evdwl;
-	      }
-	    }
-	    IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair,
-				 -delx, -dely, -delz);
+	  if (EFLAG) {
+	    flt_t evdwl;
+	    if (!ONETYPE) {
+	      c5 = p2e[ijtype].c5;
+	      c6 = p2e[ijtype].c6;
+            }
+	    evdwl = (c5 * rp - c6 * rq) * expsrainv;
+	    sevdwl += evdwl;
+	    if (eatom) {
+	      fwtmp += (flt_t)0.5 * evdwl;
+	      fjtmp += (flt_t)0.5 * evdwl;
+            }
 	  }
 
 	  /*---------------------------------------------*/
@@ -533,17 +500,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	    fjytmp += fjy;
 	    fjztmp += fjz;
 
-	    if (EVFLAG) {
-	      if (EFLAG) {
-	        const flt_t evdwl = facrad * (flt_t)0.5;
-		sevdwl += evdwl;
-		if (eatom) {
-		  fwtmp += (acc_t)0.33333333 * evdwl;
-		  fjtmp += (acc_t)0.33333333 * facrad;
-		}
+	    if (EFLAG) {
+	      const flt_t evdwl = facrad * (flt_t)0.5;
+	      sevdwl += evdwl;
+	      if (eatom) {
+		fwtmp += (acc_t)0.33333333 * evdwl;
+		fjtmp += (acc_t)0.33333333 * facrad;
 	      }
-	      IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz,
-				     delx, dely, delz);
 	    }
 	  } // for kk
 	  const int j = tj[jj];
@@ -557,34 +520,31 @@ void PairSWIntel::eval(const int offload, const int vflag,
         f[i].x += fxtmp;
         f[i].y += fytmp;
         f[i].z += fztmp;
-        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+
+	if (EFLAG) {
+	  f[i].w += fwtmp;
+	  oevdwl += sevdwl;
+	}
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall,
-			       nlocal, minlocal, nthreads, f_start, f_stride,
-			       x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride,
+			      x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = (acc_t)0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
@@ -595,7 +555,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -614,7 +574,7 @@ authors for more details.
 
 ------------------------------------------------------------------------- */
 
-template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
+template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart, 
@@ -659,7 +619,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag,
                        buffers, offload, fix, separate_flag,
                        x_size, q_size, ev_size, f_stride);
 
@@ -701,19 +661,17 @@ void PairSWIntel::eval(const int offload, const int vflag,
                               f_stride, x, 0);
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal) \
-      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
-      int iifrom, iito, tid;
-      IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth);
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads,
+			       swidth);
+
       iifrom += astart;
       iito += astart;
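
IP_PRE_omp_stride_id_vec hands each thread a start and a stride instead of a contiguous block, so the i += iip loop interleaves threads through the atom list at vector-width granularity; that balances load when neighbor counts vary systematically down the list. A sketch of strided partitioning, under the assumption that the stride is nthreads * vector-width (the macro's exact arithmetic may differ):

#include <cstdio>

int main() {
  const int inum = 32, nthreads = 4, swidth = 2;  // swidth ~ SIMD lanes
  for (int tid = 0; tid < nthreads; tid++) {
    const int iifrom = tid * swidth;       // this thread's first chunk
    const int iip = nthreads * swidth;     // stride over all threads
    printf("thread %d:", tid);
    for (int i = iifrom; i < inum; i += iip)
      printf(" [%d,%d)", i, i + swidth);
    printf("\n");
  }
  return 0;
}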
 
@@ -760,7 +718,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 					144,160,176,192,208,224,240);
       ilist = ilist + iifrom;
       acc_t * const dforce = &(f[0].x);
-      for (int i = iifrom; i < iito; i += swidth) {
+      for (int i = iifrom; i < iito; i += iip) {
 	SIMD_mask imask = ilist < iito;
 	SIMD_flt_t xtmp, ytmp, ztmp;
 	SIMD_int itype, itype_offset;
@@ -793,20 +751,10 @@ void PairSWIntel::eval(const int offload, const int vflag,
           if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
 	}
 
-        SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
-        if (EVFLAG) {
-          if (EFLAG) {
-            fwtmp = SIMD_set((acc_t)0);
-	    sevdwl = SIMD_set((acc_t)0);
-          }
-          if (vflag==1) {
-            sv0 = SIMD_set((acc_t)0);
-	    sv1 = SIMD_set((acc_t)0);
-	    sv2 = SIMD_set((acc_t)0);
-	    sv3 = SIMD_set((acc_t)0);
-	    sv4 = SIMD_set((acc_t)0);
-	    sv5 = SIMD_set((acc_t)0);
-          }
+        SIMD_acc_t sevdwl;
+	if (EFLAG) {
+          fwtmp = SIMD_set((acc_t)0);
+	  sevdwl = SIMD_set((acc_t)0);
         }
 
 	SIMD_int ejnum = SIMD_set(0);
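
With the EVFLAG branch gone, the per-lane energy accumulators (fwtmp, sevdwl) exist only under the compile-time EFLAG path and are updated under the active-lane mask, then horizontally summed once per vector loop. A minimal sketch of masked accumulation with a hypothetical 8-lane wrapper (not the intel_simd.h types):

    struct simd8 { double v[8]; };           // stand-in for SIMD_acc_t

    // Only lanes whose mask bit is set contribute to the running sum.
    void acc_mask_add(simd8 &acc, const bool mask[8], const simd8 &val) {
      for (int l = 0; l < 8; l++)
        if (mask[l]) acc.v[l] += val.v[l];   // inactive lanes untouched
    }

    double reduce_add(const simd8 &acc) {    // horizontal sum at loop end
      double s = 0.0;
      for (int l = 0; l < 8; l++) s += acc.v[l];
      return s;
    }
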
@@ -930,19 +878,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 			     fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
 			     fztmp2, fjxtmp2, fjytmp2, fjztmp2); 
           
-	    if (EVFLAG) {
-	      if (EFLAG) {
-	        if (!ONETYPE) {
-	          c5 = SIMD_gather(&(p2e[0].c5), ijtype);
-	          c6 = SIMD_gather(&(p2e[0].c6), ijtype);
-                }            
-	        SIMD_flt_t evdwl;
-		evdwl = (c5 * rp - c6 * rq) * expsrainv;
-		SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
-                                 fwtmp2, fjtmp2);
-	      }
-	      SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely, 
-                                 delz, sv0, sv1, sv2, sv3, sv4, sv5);
+	    if (EFLAG) {
+	      if (!ONETYPE) {
+		c5 = SIMD_gather(&(p2e[0].c5), ijtype);
+		c6 = SIMD_gather(&(p2e[0].c6), ijtype);
+	      }            
+	      SIMD_flt_t evdwl;
+	      evdwl = (c5 * rp - c6 * rq) * expsrainv;
+	      SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
+			       fwtmp2, fjtmp2);
 	    }
           }
 
@@ -1012,21 +956,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 			    fztmp2, fjxtmp2, fjytmp2, fjztmp2, 
 			    tf + kcoffset * 3, swidth); 
 
-	    if (EVFLAG) {
-	      if (EFLAG) {
-		SIMD_int k;
-		if (eatom) {
-		  k = SIMD_load(tj + kcoffset);
-		  k = k << 4;
-		}
-		SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
-			       fwtmp2, fjtmp2, k, dforce);
+	    if (EFLAG) {
+	      SIMD_int k;
+	      if (eatom) {
+		k = SIMD_load(tj + kcoffset);
+		k = k << 4;
 	      }
-	      SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
-				   delx, dely, delz, delr2x, delr2y, delr2z,
-				   sv0, sv1, sv2, sv3, sv4, sv5);
+	      SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
+			     fwtmp2, fjtmp2, k, dforce);
 	    }
-	    
 	  } // for kk
 	  if (is_same<flt_t,acc_t>::value == 1)
 	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
@@ -1087,52 +1025,34 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	} // for jj second loop
 
 	SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
-			   EVFLAG, eatom, fwtmp);
+			   EFLAG, eatom, fwtmp);
 	if (is_same<flt_t,acc_t>::value == 0) {
 	  imask = imask >> 8;
 	  SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, 
-			     fztmp2, EVFLAG, eatom, fwtmp2);
-	}
-	if (EVFLAG) {
-	  if (EFLAG) oevdwl += SIMD_sum(sevdwl);
-	  if (vflag == 1) {
-	    ov0 += SIMD_sum(sv0);
-	    ov1 += SIMD_sum(sv1);
-	    ov2 += SIMD_sum(sv2);
-	    ov3 += SIMD_sum(sv3);
-	    ov4 += SIMD_sum(sv4);
-	    ov5 += SIMD_sum(sv5);
-	  }
+			     fztmp2, EFLAG, eatom, fwtmp2);
 	}
-	ilist = ilist + swidth;
+	if (EFLAG) oevdwl += SIMD_sum(sevdwl);
+	ilist = ilist + iip;
       } // for ii
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall, nlocal, 
-			       minlocal, nthreads, f_start, f_stride, x, 
-			       offload);
-      }
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, 
+			      x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5);
     } // end omp
   
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = (acc_t)0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
     #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
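
The old IP_PRE_fdotr_acc_force mixed the force reduction with flag logic; the replacement splits the work into a reduction inside the parallel region (IP_PRE_fdotr_reduce_omp) and a serial epilogue (IP_PRE_fdotr_reduce). The virial itself is the usual f-dot-r sum; a minimal sketch of that accumulation, assuming the macros reduce over all nall atoms:

    // Illustrative f.r virial accumulation; the IP_PRE_* macros also
    // handle per-thread force-array striding and offload bookkeeping.
    void fdotr_virial(int nall, const double (*f)[3], const double (*x)[3],
                      double v[6]) {
      for (int i = 0; i < nall; i++) {
        v[0] += f[i][0] * x[i][0];   // xx
        v[1] += f[i][1] * x[i][1];   // yy
        v[2] += f[i][2] * x[i][2];   // zz
        v[3] += f[i][1] * x[i][0];   // xy
        v[4] += f[i][2] * x[i][0];   // xz
        v[5] += f[i][2] * x[i][1];   // yz
      }
    }
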
@@ -1143,7 +1063,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -1212,6 +1132,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
   #ifdef LMP_USE_AVXCD
   fix->nbor_pack_width(SIMD_type<flt_t>::width());
   #endif
+  fix->three_body_neighbor(1);
 
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h
index 8723803a35..b55022328f 100644
--- a/src/USER-INTEL/pair_sw_intel.h
+++ b/src/USER-INTEL/pair_sw_intel.h
@@ -46,7 +46,7 @@ class PairSWIntel : public PairSW {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
+  template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
 	    const int astart, const int aend, const int pad_width);
diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp
index 88354ec4d0..f59a6b7c96 100644
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@@ -119,32 +119,30 @@ void PairTersoffIntel::compute(int eflag, int vflag,
 
   if (ago != 0 && fix->separate_buffers() == 0) {
     fix->start_watch(TIME_PACK);
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #pragma omp parallel if(packthreads > 1)
     #endif
     {
       int ifrom, ito, tid;
       IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
-				nthreads, sizeof(ATOM_T));
+				packthreads, sizeof(ATOM_T));
       buffers->thr_pack(ifrom,ito,ago);
     }
     fix->stop_watch(TIME_PACK);
   }
   
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
-    } else {
-	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
-    }
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (eflag) {
+    eval<1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1>(0, ovflag, buffers, fc, host_start, inum);
   } else {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    eval<0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0>(0, ovflag, buffers, fc, host_start, inum);
   }
 }
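
The pack stage now spawns a parallel region only when the host has more than INTEL_HTHREADS threads; otherwise the OpenMP if clause keeps the region single-threaded and skips fork/join overhead for what is a memory-bound copy. A self-contained sketch of the pattern:

    #include <omp.h>

    // Parallelize a pack loop only past a thread-count threshold.
    // HTHREADS stands in for INTEL_HTHREADS (the hyperthreading cutoff).
    void pack_atoms(int n, int nthreads, int HTHREADS) {
      int packthreads = (nthreads > HTHREADS) ? nthreads : 1;
      #pragma omp parallel if(packthreads > 1)
      {
        // each thread packs an aligned slice of [0, n) here
      }
    }
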
 
@@ -202,7 +200,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
   );
 
   // perform the actual computation
-  template<bool EVFLAG, bool EFLAG>
+  template<bool EFLAG>
   static void kernel(
       int iito, int iifrom, int eatom, int vflag, 
       const int * _noalias const numneigh,
@@ -213,11 +211,11 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
       const c_inner_t * _noalias const c_inner, 
       const c_outer_t * _noalias const c_outer, 
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-      acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
+      acc_t *evdwl
   );
 
   // perform one step of calculation, pass in i-j pairs of atoms (is, js)
-  template<int EVFLAG, int EFLAG>
+  template<int EFLAG>
   static void kernel_step(
       int eatom, int vflag, 
       const int * _noalias const numneigh,
@@ -228,13 +226,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
       const c_inner_t * _noalias const c_inner, 
       const c_outer_t * _noalias const c_outer, 
       typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-      avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5,
-      int compress_idx, iarr is, iarr js, bvec vmask_repulsive
+      avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive
   );
 
   // perform one step of calculation; unlike the previous method,
   //  i is fixed and a number of js are passed in
-  template<int EVFLAG, int EFLAG>
+  template<int EFLAG>
   static void kernel_step_const_i(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
@@ -243,8 +240,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
     const c_inner_t * _noalias const c_inner, 
     const c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5,
-    int compress_idx, int i, iarr js, bvec vmask_repulsive
+    avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
   );
 };
 
@@ -257,7 +253,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
 // Dispatch to correct kernel instantiation and perform all the work necessary
 //  for offloading. In this routine we enter the Phi.
 // This method is nearly identical to what happens in the other /intel styles
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int EFLAG, class flt_t, class acc_t>
 void PairTersoffIntel::eval(const int offload, const int vflag,
 				     IntelBuffers<flt_t,acc_t> *buffers,
 				     const ForceConst<flt_t> &fc,
@@ -292,7 +288,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
 
   // Determine how much data to transfer
   int x_size, q_size, f_stride, ev_size, separate_flag;
-  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+  IP_PRE_get_transfern(ago, 1, EFLAG, vflag,
 		       buffers, offload, fix, separate_flag,
 		       x_size, q_size, ev_size, f_stride);
 
@@ -330,20 +326,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
     #endif
     #endif
 
-    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
-			       f_stride, x, 0);
+    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, 
+			      f_stride, x, 0);
 
     acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
-    if (EVFLAG) {
-      oevdwl = oecoul = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
-    }
+    if (EFLAG) oevdwl = oecoul = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
 
     // loop over neighbors of my atoms
     #if defined(_OPENMP)
-    #pragma omp parallel default(none) \
-      shared(f_start,f_stride,nlocal,nall,minlocal)	\
-      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
     #endif
     {
       int iifrom, iito, tid;
@@ -355,10 +347,10 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
       {
-        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
-        sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.;
+        acc_t sevdwl;
+        sevdwl = 0.;
         #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \
-          firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5
+          firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl
         // Pick the variable i algorithm under specific conditions
         // do use scalar algorithm with very short vectors
         int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL;
@@ -366,50 +358,34 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
           lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops;
         bool use_scalar = VL < 4;
         if (use_scalar) {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS);
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EFLAG>(ARGS);
         } else if (pack_i) {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS);
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EFLAG>(ARGS);
         } else {
-          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS);
-        }
-	if (EVFLAG) {
-          if (EFLAG) oevdwl += sevdwl;
-          if (vflag == 1) {
-            ov0 += sv0;
-            ov1 += sv1;
-            ov2 += sv2;
-            ov3 += sv3;
-            ov4 += sv4;
-            ov5 += sv5;
-          }
+          IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS);
         }
+	if (EFLAG) oevdwl += sevdwl;
       }
 
-      #ifndef _LMP_INTEL_OFFLOAD
-      if (vflag == 2)
-      #endif
-      {
-        #if defined(_OPENMP)
-        #pragma omp barrier
-        #endif
-        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-	  		       nlocal, minlocal, nthreads, f_start, f_stride, 
-                               x, offload);
-      }
+      IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start,
+			      f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+			      ov4, ov5);
     } // end of omp parallel region
-    if (EVFLAG) {
-      if (EFLAG) {
-        ev_global[0] = oevdwl;
-        ev_global[1] = 0.0;
-      }
-      if (vflag) {
-        ev_global[2] = ov0;
-        ev_global[3] = ov1;
-        ev_global[4] = ov2;
-        ev_global[5] = ov3;
-        ev_global[6] = ov4;
-        ev_global[7] = ov5;
-      }
+
+    IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
+			ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      ev_global[0] = oevdwl;
+      ev_global[1] = 0.0;
+    }
+    if (vflag) {
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
     }
 
     #ifdef _LMP_INTEL_OFFLOAD
@@ -424,7 +400,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
   else
     fix->stop_watch(TIME_HOST_PAIR);
 
-  if (EVFLAG)
+  if (EFLAG || vflag)
     fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
   else
     fix->add_result_array(f_start, 0, offload);
@@ -457,6 +433,7 @@ void PairTersoffIntel::init_style()
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
   
   fix->pair_init_check();
+  fix->three_body_neighbor(1);
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
@@ -663,7 +640,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
 static const int N_CACHE = 8;
 
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<int EVFLAG, int EFLAG>
+template<int EFLAG>
 void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
@@ -673,12 +650,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, 
-    avec *vsv0, 
-    avec *vsv1, 
-    avec *vsv2, 
-    avec* vsv3, 
-    avec *vsv4, 
-    avec *vsv5,
     int compress_idx, 
     iarr is,
     iarr js,
@@ -829,21 +800,11 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
   vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair;
   vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair;
  
-  if (EVFLAG) {
-    if (EFLAG) {
-      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
-      if (eatom) {
-        v::store(fw, (v_0_5 * vevdwl));
-      }
+  if (EFLAG) {
+    *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
+    if (eatom) {
+      v::store(fw, (v_0_5 * vevdwl));
     }
-    if (vflag == 1) {				
-      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
-      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
-      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
-      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
-      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
-      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
-    }						
   }
   {
     while (cache_idx-- > 0) {
@@ -933,7 +894,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
-    if (EVFLAG && EFLAG && eatom) {
+    if (EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
@@ -945,7 +906,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
-    if (EVFLAG && EFLAG && eatom) {
+    if (EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
@@ -954,7 +915,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
 // Specialized kernel step for fixed i: we don't have to use the
 //  convoluted iteration scheme above, as the loop variables are uniform.
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<int EVFLAG, int EFLAG>
+template<int EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     int eatom, int vflag, 
     const int * _noalias const numneigh, const int * _noalias const cnumneigh, 
@@ -964,12 +925,6 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
     avec *vsevdwl, 
-    avec *vsv0, 
-    avec *vsv1, 
-    avec *vsv2, 
-    avec* vsv3, 
-    avec *vsv4, 
-    avec *vsv5,
     int compress_idx, 
     int i,
     iarr js,
@@ -1097,22 +1052,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
   vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair);
   vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair);
  
-  if (EVFLAG) {
-    if (EFLAG) {
-      *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
-      if (eatom) {
-        vfwtmp = v_0_5 * vevdwl;
-        v::store(fw, vfwtmp);
-      }
+  if (EFLAG) {
+    *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl);
+    if (eatom) {
+      vfwtmp = v_0_5 * vevdwl;
+      v::store(fw, vfwtmp);
     }
-    if (vflag == 1) {				
-      *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair);
-      *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair);
-      *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair);
-      *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair);
-      *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair);
-      *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair);
-    }						
   }
   while (cache_idx-- > 0) {
     fvec vfkx = vprefactor * vfkx_cache[cache_idx];
@@ -1169,20 +1114,20 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
     f[t_].x += fx[t];
     f[t_].y += fy[t];
     f[t_].z += fz[t];
-    if (EVFLAG && EFLAG && eatom) {
+    if (EFLAG && eatom) {
       f[t_].w += fw[t];
     }
   }
   f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero()));
   f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero()));
   f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero()));
-  if (EVFLAG && EFLAG && eatom) {
+  if (EFLAG && eatom) {
     f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero()));
   }
 }
 
 template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
-template<bool EVFLAG, bool EFLAG>
+template<bool EFLAG>
 void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     int iito, int iifrom, int eatom, int vflag, 
     const int * _noalias const numneigh, 
@@ -1193,14 +1138,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
     const c_inner_t * _noalias const c_inner, 
     const c_outer_t * _noalias const c_outer, 
     typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
-    acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5
+    acc_t *evdwl
 ) {
   int compress_idx = 0;
   int ii, jj;
   iarr is, js;
   avec vsevdwl = v::acc_zero();
-  avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero();
-  avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero();
   ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
   ivec vj, v_NEIGHMASK(NEIGHMASK);
   bvec vmask_repulsive(0);
@@ -1237,11 +1180,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
       if (pack_i) {
         if (compress_idx == v::VL) {
           vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-          kernel_step<EVFLAG,EFLAG>(
+          kernel_step<EFLAG>(
               eatom, vflag, 
               numneigh, cnumneigh, firstneigh, ntypes,
               x, c_inner, c_outer, f,
-              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, 
+              &vsevdwl, compress_idx, 
               is, js, vmask_repulsive
           );
           compress_idx = 0;
@@ -1250,11 +1193,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
       } else {
         if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) {
           vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-          kernel_step_const_i<EVFLAG,EFLAG>(
+          kernel_step_const_i<EFLAG>(
               eatom, vflag, 
               numneigh, cnumneigh, firstneigh, ntypes,
               x, c_inner, c_outer, f,
-              &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, 
+              &vsevdwl, compress_idx, 
               i, js, vmask_repulsive
           );
           compress_idx = 0;
@@ -1265,26 +1208,16 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
   }
   if (compress_idx > 0) {
         vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
-        IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>(
+        IntelKernelTersoff::kernel_step<EFLAG>(
             eatom, vflag, 
             numneigh, cnumneigh, firstneigh, ntypes,
             x, c_inner, c_outer, f,
-            &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, 
+            &vsevdwl, compress_idx, 
             is, js, vmask_repulsive
         );
   }
-  if (EVFLAG) {
-    if (EFLAG) {
-      *evdwl += v::acc_reduce_add(vsevdwl);
-    }
-    if (vflag == 1) {
-      *ov0 += v::acc_reduce_add(vsv0);
-      *ov1 += v::acc_reduce_add(vsv1);
-      *ov2 += v::acc_reduce_add(vsv2);
-      *ov3 += v::acc_reduce_add(vsv3);
-      *ov4 += v::acc_reduce_add(vsv4);
-      *ov5 += v::acc_reduce_add(vsv5);
-    }
+  if (EFLAG) {
+    *evdwl += v::acc_reduce_add(vsevdwl);
   }
 }
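
The kernel buffers candidate i-j pairs into is/js until compress_idx reaches the vector length, processes a full vector at once, and flushes the partial remainder with one trailing (masked) call. A minimal sketch of that compress-and-flush pattern:

    // VL is the vector length; process_step stands in for kernel_step.
    template <int VL>
    void compress_pairs(int inum, const int *numneigh,
                        const int *const *firstneigh,
                        void (*process_step)(const int *, const int *, int)) {
      int is[VL], js[VL], compress_idx = 0;
      for (int i = 0; i < inum; i++) {
        for (int jj = 0; jj < numneigh[i]; jj++) {
          is[compress_idx] = i;
          js[compress_idx] = firstneigh[i][jj];
          if (++compress_idx == VL) {
            process_step(is, js, compress_idx);   // full vector of lanes
            compress_idx = 0;
          }
        }
      }
      if (compress_idx > 0)
        process_step(is, js, compress_idx);       // masked remainder
    }
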
 
diff --git a/src/USER-INTEL/pair_tersoff_intel.h b/src/USER-INTEL/pair_tersoff_intel.h
index c9604f2797..c725487ae7 100644
--- a/src/USER-INTEL/pair_tersoff_intel.h
+++ b/src/USER-INTEL/pair_tersoff_intel.h
@@ -79,7 +79,7 @@ class PairTersoffIntel : public PairTersoff {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int EFLAG, class flt_t, class acc_t>
   void eval(const int offload, const int vflag,
 	    IntelBuffers<flt_t,acc_t> * buffers,
 	    const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp
new file mode 100644
index 0000000000..110649f8ee
--- /dev/null
+++ b/src/USER-INTEL/pppm_disp_intel.cpp
@@ -0,0 +1,3034 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pppm_disp_intel.h"
+#include "atom.h"
+#include "error.h"
+#include "fft3d_wrap.h"
+#include "gridcomm.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER   7
+#define OFFSET 16384
+#define SMALL 0.00001
+#define LARGE 10000.0
+#define EPS_HOC 1.0e-7
+
+enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER};
+enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE};
+enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM,
+     FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G,
+     FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A,
+     FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, 
+     FORWARD_AD_PERATOM_NONE};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : 
+  PPPMDisp(lmp, narg, arg)
+{
+  suffix_flag |= Suffix::INTEL;
+
+  order = 7;
+  order_6 = 7;  // sets default stencil sizes to 7
+
+  perthread_density = NULL;
+  particle_ekx = particle_eky = particle_ekz = NULL;
+  particle_ekx0 = particle_eky0 = particle_ekz0 = NULL;
+  particle_ekx1 = particle_eky1 = particle_ekz1 = NULL;
+  particle_ekx2 = particle_eky2 = particle_ekz2 = NULL;
+  particle_ekx3 = particle_eky3 = particle_ekz3 = NULL;
+  particle_ekx4 = particle_eky4 = particle_ekz4 = NULL;
+  particle_ekx5 = particle_eky5 = particle_ekz5 = NULL;
+  particle_ekx6 = particle_eky6 = particle_ekz6 = NULL;
+  
+  rho_lookup = drho_lookup = NULL;
+  rho6_lookup = drho6_lookup = NULL;
+  rho_points = 0;
+
+  _use_table = _use_packing = _use_lrt = 0;
+}
+
+PPPMDispIntel::~PPPMDispIntel()
+{
+  memory->destroy(perthread_density);
+  memory->destroy(particle_ekx);
+  memory->destroy(particle_eky);
+  memory->destroy(particle_ekz);
+
+  memory->destroy(rho_lookup);
+  memory->destroy(drho_lookup);
+  memory->destroy(rho6_lookup);
+  memory->destroy(drho6_lookup);  
+}
+
+
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+
+void PPPMDispIntel::init()
+{
+
+  PPPMDisp::init();
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->kspace_init_check();
+
+  _use_lrt = fix->lrt();
+  if (_use_lrt)
+    error->all(FLERR,
+               "LRT mode is currently not supported for pppm/disp/intel");
+
+  
+  // For vectorization, we need some padding at the end.
+  // The first thread accumulates directly into the global density array.
+  if ((comm->nthreads > 1) && !_use_lrt) {
+    memory->destroy(perthread_density);
+    memory->create(perthread_density, comm->nthreads-1, 
+		   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
+                   "pppmdispintel:perthread_density");
+  }
+
+  _use_table = fix->pppm_table();
+  if (_use_table) {
+    rho_points = 5000;
+    memory->destroy(rho_lookup);
+    memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
+		   "pppmdispintel:rho_lookup");
+    memory->destroy(rho6_lookup);
+    memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
+		   "pppmdispintel:rho6_lookup");
+
+    if (differentiation_flag == 1) {
+      memory->destroy(drho_lookup);
+      memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
+		     "pppmdispintel:drho_lookup");
+      memory->destroy(drho6_lookup);
+      memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, 
+		     "pppmdispintel:drho6_lookup");
+    }
+    precompute_rho();
+  }
+  if (order > INTEL_P3M_MAXORDER)
+    error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
+}
+
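
Rather than re-evaluating the stencil polynomials per particle, init() tabulates them at rho_points = 5000 offsets per grid cell; the hot loop then turns the fractional offset into a table index with one multiply-add. A sketch of how such a table can be built (sampling convention and names illustrative, not the LAMMPS internals):

    // Precompute stencil weights so the particle loop replaces a Horner
    // evaluation with a single table-row load.
    void build_rho_table(int rho_points, int order, int nlower, int nupper,
                         double **rho_coeff, double **rho_lookup) {
      for (int p = 0; p < rho_points; p++) {
        double dx = -0.5 + (double)p / rho_points;  // offset within a cell
        for (int k = nlower; k <= nupper; k++) {
          double r = 0.0;
          for (int l = order - 1; l >= 0; l--)      // Horner's rule, once
            r = rho_coeff[l][k] + r * dx;
          rho_lookup[p][k - nlower] = r;
        }
      }
    }
    // runtime lookup:  idx = (int)(dx*half_rho_scale + half_rho_scale_plus)
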
+/* ----------------------------------------------------------------------
+   compute the PPPMDispIntel long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDispIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    PPPMDisp::compute(eflag, vflag);
+    return;
+  }
+  #endif
+  int i;
+  // convert atoms from box to lamda coords
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+	 eflag_atom = vflag_atom = 0;
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    if (function[0]) {
+      cg_peratom->ghost_notify();
+      cg_peratom->setup();
+    }
+    if (function[1] + function[2] + function[3]) {
+      cg_peratom_6->ghost_notify();
+      cg_peratom_6->setup();
+    }
+    peratom_allocate_flag = 1;
+  }
+  if (triclinic == 0) boxlo = domain->boxlo;
+  else {
+    boxlo = domain->boxlo_lamda;
+    domain->x2lamda(atom->nlocal);
+  }
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+
+    if (function[0]) memory->destroy(part2grid);
+    if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6);
+    if (differentiation_flag == 1) {
+      memory->destroy(particle_ekx);
+      memory->destroy(particle_eky);
+      memory->destroy(particle_ekz);
+      if (function[2] == 1){
+	memory->destroy(particle_ekx0);
+	memory->destroy(particle_eky0);
+	memory->destroy(particle_ekz0);
+	memory->destroy(particle_ekx1);
+	memory->destroy(particle_eky1);
+	memory->destroy(particle_ekz1);
+	memory->destroy(particle_ekx2);
+	memory->destroy(particle_eky2);
+	memory->destroy(particle_ekz2);
+	memory->destroy(particle_ekx3);
+	memory->destroy(particle_eky3);
+	memory->destroy(particle_ekz3);
+	memory->destroy(particle_ekx4);
+	memory->destroy(particle_eky4);
+	memory->destroy(particle_ekz4);
+	memory->destroy(particle_ekx5);
+	memory->destroy(particle_eky5);
+	memory->destroy(particle_ekz5);
+	memory->destroy(particle_ekx6);
+	memory->destroy(particle_eky6);
+	memory->destroy(particle_ekz6);	
+      }
+      
+    }    
+    nmax = atom->nmax;
+    if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid");
+    if (function[1] + function[2] + function[3])
+      memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6");
+    if (differentiation_flag == 1) {
+      memory->create(particle_ekx, nmax, "pppmdispintel:pekx");
+      memory->create(particle_eky, nmax, "pppmdispintel:peky");
+      memory->create(particle_ekz, nmax, "pppmdispintel:pekz");
+      if (function[2] == 1){
+	memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0");
+	memory->create(particle_eky0, nmax, "pppmdispintel:peky0");
+	memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0");
+	memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1");
+	memory->create(particle_eky1, nmax, "pppmdispintel:peky1");
+	memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1");
+	memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2");
+	memory->create(particle_eky2, nmax, "pppmdispintel:peky2");
+	memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2");
+	memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3");
+	memory->create(particle_eky3, nmax, "pppmdispintel:peky3");
+	memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3");
+	memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4");
+	memory->create(particle_eky4, nmax, "pppmdispintel:peky4");
+	memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4");
+	memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5");
+	memory->create(particle_eky5, nmax, "pppmdispintel:peky5");
+	memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5");
+	memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6");
+	memory->create(particle_eky6, nmax, "pppmdispintel:peky6");
+	memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6");	
+      }
+    }    
+  }
+  energy = 0.0;
+  energy_1 = 0.0;
+  energy_6 = 0.0;
+  if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0;
+
+  // find grid points for all my particles
+  // distribute particles' charges/dispersion coefficients on the grid
+  // communication between processors and remapping to FFT decomposition
+  // solution of Poisson's equation in k-space and back-transformation
+  // communication between processors
+  // calculation of forces
+  
+  if (function[0]) {
+
+    // perform calculations for Coulomb interactions only
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid, 
+				 nupper, nlower, nxlo_out, nylo_out, nzlo_out, 
+				 nxhi_out, nyhi_out, nzhi_out, 
+				 fix->get_mixed_buffers());
+      make_rho_c<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid, 
+				  nupper, nlower, nxlo_out, nylo_out, 
+				  nzlo_out, nxhi_out, nyhi_out, nzhi_out, 
+				  fix->get_double_buffers());
+      make_rho_c<double,double>(fix->get_double_buffers());
+    } else {
+      particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid, 
+				nupper, nlower, nxlo_out, nylo_out, nzlo_out, 
+				nxhi_out, nyhi_out, nzhi_out, 
+				fix->get_single_buffers());
+      make_rho_c<float,float>(fix->get_single_buffers());
+    }
+  
+    cg->reverse_comm(this,REVERSE_RHO);
+
+    brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+	      density_brick, density_fft, work1,remap);
+    
+    if (differentiation_flag == 1) {
+      poisson_ad(work1, work2, density_fft, fft1, fft2,
+                 nx_pppm, ny_pppm, nz_pppm, nfft,
+                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
+                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+                 energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, 
+		 v1_brick, v2_brick, v3_brick, v4_brick, v5_brick);
+
+      cg->forward_comm(this,FORWARD_AD);
+      
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+	fieldforce_c_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+	fieldforce_c_ad<double,double>(fix->get_double_buffers());
+      } else {
+	fieldforce_c_ad<float,float>(fix->get_single_buffers());
+      }      
+
+      if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM);
+
+    } else {
+      poisson_ik(work1, work2, density_fft, fft1, fft2,
+                 nx_pppm, ny_pppm, nz_pppm, nfft,
+                 nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft,
+                 nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in,
+                 energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2,
+                 vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2,
+                 u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, 
+		 v5_brick);
+
+      cg->forward_comm(this, FORWARD_IK);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+	fieldforce_c_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+	fieldforce_c_ik<double,double>(fix->get_double_buffers());
+      } else {
+	fieldforce_c_ik<float,float>(fix->get_single_buffers());
+      }       
+
+      if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM);
+    }
+    if (evflag_atom) fieldforce_c_peratom();
+  }
+
+  if (function[1]) {
+    // perform calculations for geometric mixing
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				 part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				 nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				 nyhi_out_6, nzhi_out_6, 
+				 fix->get_mixed_buffers());
+      make_rho_g<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				  part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				  nyhi_out_6, nzhi_out_6,
+				  fix->get_double_buffers());
+      make_rho_g<double,double>(fix->get_double_buffers());      
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				nyhi_out_6, nzhi_out_6, 
+				fix->get_single_buffers());
+      make_rho_g<float,float>(fix->get_single_buffers());      
+    }    
+
+
+    cg_6->reverse_comm(this, REVERSE_RHO_G);
+
+    brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6,
+	      density_brick_g, density_fft_g, work1_6,remap_6);
+
+    if (differentiation_flag == 1) {
+
+      poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6,
+                 nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, 
+		 nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, 
+		 nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6,
+                 virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, 
+		 v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
+
+      cg_6->forward_comm(this,FORWARD_AD_G);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_g_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_g_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_g_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G);
+
+    } else {
+      poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
+		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
+		 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,
+		 fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, 
+		 vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, 
+		 v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g);
+
+      cg_6->forward_comm(this,FORWARD_IK_G);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_g_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_g_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_g_ik<float,float>(fix->get_single_buffers());
+      }
+
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G);
+    }
+    if (evflag_atom) fieldforce_g_peratom();
+  }
+
+  if (function[2]) {
+    // perform calculations for arithmetic mixing
+
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				 part2grid_6, nupper_6, nlower_6,
+				 nxlo_out_6, nylo_out_6, nzlo_out_6, 
+				 nxhi_out_6, nyhi_out_6, nzhi_out_6,
+				 fix->get_mixed_buffers());
+      make_rho_a<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				  nyhi_out_6, nzhi_out_6,
+				  fix->get_double_buffers());
+      make_rho_a<double,double>(fix->get_double_buffers());      
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				nyhi_out_6, nzhi_out_6,
+				fix->get_single_buffers());
+      make_rho_a<float,float>(fix->get_single_buffers());      
+    }        
+
+    cg_6->reverse_comm(this, REVERSE_RHO_A);
+
+    brick2fft_a();
+
+    if (differentiation_flag == 1) {
+
+      poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
+		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
+		 nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6,
+                 u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, 
+		 v3_brick_a3, v4_brick_a3, v5_brick_a3);
+      poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, 
+		    v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, 
+		    v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, 
+		    v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6);
+      poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, 
+		    v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, 
+		    v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, 
+		    v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5);
+      poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, 
+		    v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, 
+		    v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, 
+		    v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4);
+
+      cg_6->forward_comm(this, FORWARD_AD_A);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_a_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_a_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_a_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A);
+
+    }  else {
+
+      poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6,
+                 nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, 
+		 nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6,
+                 nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, 
+		 nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6,
+		 fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, 
+		 virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, 
+		 v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3);
+      poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, 
+		    vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, 
+		    vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, 
+		    v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0,
+                    u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, 
+		    v3_brick_a6, v4_brick_a6, v5_brick_a6);
+      poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, 
+		    vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, 
+		    vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, 
+		    v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1,
+                    u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, 
+		    v3_brick_a5, v4_brick_a5, v5_brick_a5);
+      poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, 
+		    vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, 
+		    vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, 
+		    v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2,
+                    u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, 
+		    v3_brick_a4, v4_brick_a4, v5_brick_a4);
+
+      cg_6->forward_comm(this, FORWARD_IK_A);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+	fieldforce_a_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+	fieldforce_a_ik<double,double>(fix->get_double_buffers());
+      } else {
+	fieldforce_a_ik<float,float>(fix->get_single_buffers());
+      }             
+
+      if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A);
+    }
+    if (evflag_atom) fieldforce_a_peratom();
+  }
+  
+  if (function[3]) {
+    // perform calculations if no mixing rule applies
+    
+    if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+      particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				 part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				 nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				 nyhi_out_6, nzhi_out_6, 
+				 fix->get_mixed_buffers());
+      make_rho_none<float,double>(fix->get_mixed_buffers());
+    } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+      particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				  part2grid_6, nupper_6, nlower_6, nxlo_out_6,
+				  nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				  nyhi_out_6, nzhi_out_6,
+				  fix->get_double_buffers());
+      make_rho_none<double,double>(fix->get_double_buffers());      
+    } else {
+      particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, 
+				part2grid_6, nupper_6, nlower_6, nxlo_out_6, 
+				nylo_out_6, nzlo_out_6, nxhi_out_6, 
+				nyhi_out_6, nzhi_out_6,
+				fix->get_single_buffers());
+      make_rho_none<float,float>(fix->get_single_buffers());      
+    }         
+
+    cg_6->reverse_comm(this, REVERSE_RHO_NONE);
+
+    brick2fft_none();
+
+    if (differentiation_flag == 1) {
+
+      int n = 0;
+      for (int k = 0; k<nsplit_alloc/2; k++) {
+        poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1],
+                        u_brick_none[n],u_brick_none[n+1],
+                        v0_brick_none, v1_brick_none, v2_brick_none,
+                        v3_brick_none, v4_brick_none, v5_brick_none);
+        n += 2;
+      }
+
+      cg_6->forward_comm(this,FORWARD_AD_NONE);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_none_ad<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_none_ad<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_none_ad<float,float>(fix->get_single_buffers());
+      }
+
+      if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE);
+
+    } else {
+      int n = 0;
+      for (int k = 0; k<nsplit_alloc/2; k++) {
+
+        poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1],
+                        vdx_brick_none[n], vdy_brick_none[n], 
+			vdz_brick_none[n], vdx_brick_none[n+1], 
+			vdy_brick_none[n+1], vdz_brick_none[n+1],
+                        u_brick_none, v0_brick_none, v1_brick_none, 
+			v2_brick_none, v3_brick_none, v4_brick_none, 
+			v5_brick_none);
+        n += 2;
+      }
+
+      cg_6->forward_comm(this,FORWARD_IK_NONE);
+
+      if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+        fieldforce_none_ik<float,double>(fix->get_mixed_buffers());
+      } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+        fieldforce_none_ik<double,double>(fix->get_double_buffers());
+      } else {
+        fieldforce_none_ik<float,float>(fix->get_single_buffers());
+      }
+
+      if (evflag_atom)
+        cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE);
+    }
+    if (evflag_atom) fieldforce_none_peratom();
+  }
+
+  // update qsum and qsqsum, if atom count has changed and energy needed
+
+  if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) {
+    qsum_qsq();
+    natoms_original = atom->natoms;
+  }
+
+  // sum energy across procs and add in volume-dependent term
+
+  const double qscale = force->qqrd2e * scale;
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy_1 = energy_all;
+    MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy_6 = energy_all;
+
+    energy_1 *= 0.5*volume;
+    energy_6 *= 0.5*volume;
+
+    energy_1 -= g_ewald*qsqsum/MY_PIS +
+      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
+    energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij +
+      1.0/12.0*pow(g_ewald_6,6)*csum;
+    energy_1 *= qscale;
+  }
+
+  // sum virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+    MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i];
+    if (function[1]+function[2]+function[3]){
+      double a =  MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij;
+      virial[0] -= a;
+      virial[1] -= a;
+      virial[2] -= a;
+    }
+  }
+
+  if (eflag_atom) {
+    if (function[0]) {
+      double *q = atom->q;
+      for (i = 0; i < atom->nlocal; i++) {
+        eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]*
+	  qsum / (g_ewald*g_ewald*volume); // Coulomb self-energy correction
+      }
+    }
+    if (function[1] + function[2] + function[3]) {
+      int tmp;
+      for (i = 0; i < atom->nlocal; i++) {
+        tmp = atom->type[i];
+        eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] +
+                      1.0/12.0*pow(g_ewald_6,6)*cii[tmp];
+      }
+    }
+  }
+
+  if (vflag_atom) {
+    if (function[1] + function[2] + function[3]) {
+      int tmp;
+      for (i = 0; i < atom->nlocal; i++) {
+        tmp = atom->type[i];
+        // dispersion self-virial correction
+        for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)*
+				      pow(g_ewald_6,3)*csumi[tmp]; 
+      }
+    }
+  }
+
+
+  // 2d slab correction
+
+  if (slabflag) slabcorr(eflag);
+  if (function[0]) energy += energy_1;
+  if (function[1] + function[2] + function[3]) energy += energy_6;
+
+  // convert atoms back from lamda to box coords
+
+  if (triclinic) domain->lamda2x(atom->nlocal);
+}
+
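+
+Every stage of compute() dispatches three ways on fix->precision(): mixed (float coordinates, double accumulators), all-double, and all-float, so one set of templated kernels serves all three modes. A minimal sketch of the pattern with stand-in names:

    // PrecMode / stage mimic the FixIntel interface; the real code passes
    // IntelBuffers<flt_t,acc_t>* from the matching get_*_buffers() call.
    enum class PrecMode { MIXED, DOUBLE, SINGLE };

    template <class flt_t, class acc_t>
    void stage() { /* templated kernel body */ }

    void dispatch(PrecMode mode) {
      if (mode == PrecMode::MIXED)       stage<float, double>();
      else if (mode == PrecMode::DOUBLE) stage<double, double>();
      else                               stage<float, float>();
    }
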
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t>
+void PPPMDispIntel::particle_map(double delx, double dely, double delz,
+				 double sft, int** p2g, int nup, int nlow,
+				 int nxlo, int nylo, int nzlo,
+				 int nxhi, int nyhi, int nzhi,
+				 IntelBuffers<flt_t,acc_t> *buffers)
+{
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
+    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
+
+  int flag = 0;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\
+	   nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt)
+  #endif
+  {
+    double **x = atom->x;
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delx;
+    const flt_t yi = dely;
+    const flt_t zi = delz;
+    const flt_t fshift = sft;
+
+
+    int iifrom, iito, tid;
+    IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma simd reduction(+:flag)
+    #endif    
+    for (int i = iifrom; i < iito; i++) {
+
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // current particle coord can be outside global and local box
+    // add/subtract OFFSET to avoid int(-0.75) = 0 when we want it to be -1
+
+    int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET;
+    int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET;
+    int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET;
+
+    p2g[i][0] = nx;
+    p2g[i][1] = ny;
+    p2g[i][2] = nz;
+
+    // check that entire stencil around nx,ny,nz will fit in my 3d brick
+
+    if (nx+nlow < nxlo || nx+nup > nxhi ||
+	ny+nlow < nylo || ny+nup > nyhi ||
+	nz+nlow < nzlo || nz+nup > nzhi)
+      flag = 1;
+  }
+  }
+
+  if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp");
+}
+
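
The OFFSET arithmetic works around C++ truncation toward zero: a plain cast turns -0.75 into 0, but the stencil needs -1. Adding a large positive constant before truncating and subtracting it afterwards makes the cast behave like floor for every coordinate that can occur, and fshift folds OFFSET in so the hot loop pays nothing extra. A worked check:

    #include <cstdio>

    int main() {
      const int OFFSET = 16384;
      double u = -0.75;
      printf("%d %d\n", (int)u, (int)(u + OFFSET) - OFFSET);  // prints: 0 -1
      return 0;
    }
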
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // clear 3d density array
+
+  FFT_SCALAR * _noalias global_density = 
+    &(density_brick[nzlo_out][nylo_out][nxlo_out]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  //double *q = atom->q;
+  //double **x = atom->x;
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    double *q = atom->q;
+    double **x = atom->x;
+
+    const int nix = nxhi_out - nxlo_out + 1;
+    const int niy = nyhi_out - nylo_out + 1;
+    
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshift = shift;
+    const flt_t fshiftone = shiftone;
+    const flt_t fdelvolinv = delvolinv;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
+      perthread_density[tid - 1];
+    // clear 3d density array
+    memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+  
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nysum = nlower + ny - nylo_out;
+      int nxsum = nlower + nx - nxlo_out;
+      int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+  
+          for (int l = order-1; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+        }
+      }
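+      // Both branches produce the same per-dimension weights: rho_k(d) is a
+      // polynomial sum_l rho_coeff[l][k]*d^l, evaluated above by Horner's
+      // rule. The use_table branch instead rescales d with half_rho_scale/
+      // half_rho_scale_plus to pick a row of the precomputed rho_lookup
+      // table, trading a small quantization error for speed.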
+  
+      FFT_SCALAR z0 = fdelvolinv * q[i];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            my_density[mzyx] += x0*rho[0][l];
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
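+  // thread 0 accumulated directly into global_density above, so only the
+  // bricks of threads 1..nthr-1 (perthread_density[0..nthr-2]) remain to add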
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for(int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- geometric mixing
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // pointer to this rank's dispersion density brick; cleared per thread below
+
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
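+  // same spreading pattern as make_rho_c, but on the dispersion grid (the
+  // *_6 quantities) and with the per-type coefficient B[type] in place of q[i]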
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    int type;
+    double **x = atom->x;
+    
+    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+    const int niy = nyhi_out_6 - nylo_out_6 + 1;
+    
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshift = shift_6;
+    const flt_t fshiftone = shiftone_6;
+    const flt_t fdelvolinv = delvolinv_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
+      perthread_density[tid - 1];
+
+    // clear 3d density array
+    memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+  
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nysum = nlower_6 + ny - nylo_out_6;
+      int nxsum = nlower_6 + nx - nxlo_out_6;
+      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+  
+          for (int l = order_6-1; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+        }
+      }
+
+      type = atom->type[i];
+      FFT_SCALAR z0 = fdelvolinv * B[type];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            my_density[mzyx] += x0*rho[0][l];
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for(int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+  
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- arithmetic mixing
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // clear 3d density array
+
+  memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+  memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0,
+	 ngrid_6*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
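+  // arithmetic mixing spreads seven densities per atom: one stencil weight w
+  // is computed once and scaled by B[7*type] .. B[7*type+6] into the seven
+  // density_brick_a* arrays; unlike make_rho_c/g, this loop is not threaded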
+
+  int nlocal = atom->nlocal;
+
+    double **x = atom->x;
+    
+    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+    const int niy = nyhi_out_6 - nylo_out_6 + 1;
+    
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshift = shift_6;
+    const flt_t fshiftone = shiftone_6;
+    const flt_t fdelvolinv = delvolinv_6;
+
+    for (int i = 0; i < nlocal; i++) {
+  
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+  
+          for (int l = order_6-1; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+        }
+      }
+
+      const int type = atom->type[i];
+      FFT_SCALAR z0 = fdelvolinv;
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int my = m + nysum;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            FFT_SCALAR w = x0*rho[0][l];
+            density_brick_a0[mz][my][mx] += w*B[7*type];
+            density_brick_a1[mz][my][mx] += w*B[7*type+1];
+            density_brick_a2[mz][my][mx] += w*B[7*type+2];
+            density_brick_a3[mz][my][mx] += w*B[7*type+3];
+            density_brick_a4[mz][my][mx] += w*B[7*type+4];
+            density_brick_a5[mz][my][mx] += w*B[7*type+5];
+            density_brick_a6[mz][my][mx] += w*B[7*type+6];
+          }
+        }
+      }
+    }
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = dispersion "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid --- case when mixing rules don't apply
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]);
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
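+  // the nsplit per-split densities live back to back in one allocation, so
+  // split k of flattened grid point mzyx is my_density[mzyx + k*ngrid_6]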
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    int type;
+    double **x = atom->x;
+    
+    const int nix = nxhi_out_6 - nxlo_out_6 + 1;
+    const int niy = nyhi_out_6 - nylo_out_6 + 1;
+    
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshift = shift_6;
+    const flt_t fshiftone = shiftone_6;
+    const flt_t fdelvolinv = delvolinv_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : 
+      perthread_density[tid - 1];
+    // clear 3d density array
+    memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+  
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nysum = nlower_6 + ny - nylo_out_6;
+      int nxsum = nlower_6 + nx - nxlo_out_6;
+      int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+  
+          for (int l = order_6-1; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+        }
+      }
+
+      type = atom->type[i];
+      FFT_SCALAR z0 = fdelvolinv;
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            FFT_SCALAR w0 = x0*rho[0][l];
+            // weight each split by its per-type coefficient, as in
+            // PPPMDisp::make_rho_none
+            for (int k = 0; k < nsplit; k++)
+              my_density[mzyx + k*ngrid_6] += w0*B[nsplit*type + k];
+          }
+        }
+      }
+    }
+  }
+
+  // reduce all the perthread_densities into global_density
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, global_density) if(!_use_lrt)
+  #endif
+  {
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr);
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      for(int j = 1; j < nthr; j++) {
+        global_density[i] += perthread_density[j-1][i];
+      }
+    }
+  }
+  
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+   for ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
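+  // ik scheme sketch: the field was differentiated in k-space, so the bricks
+  // vdx/vdy/vdz_brick already hold E components; interpolation is just
+  //   ek -= w * vd*_brick[mz][my][mx];  f[i] += fqqrd2es*q[i] * ek;
+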
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *q = atom->q;
+    double **x = atom->x;
+    double **f = atom->f;
+  
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nxsum = nx + nlower;
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho_lookup[idx][k];
+          rho1[k] = rho_lookup[idy][k];
+          rho2[k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1 = rho_coeff[order-1][k];
+          FFT_SCALAR r2 = rho_coeff[order-1][k];
+          FFT_SCALAR r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower] = r1;
+          rho1[k-nlower] = r2;
+          rho2[k-nlower] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
+            eky_arr[l] -= x0*vdy_brick[mz][my][mx];
+            ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
+        }
+      }
+
+      FFT_SCALAR ekx, eky, ekz;
+      ekx = eky = ekz = ZEROF;
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx += ekx_arr[l];
+        eky += eky_arr[l];
+        ekz += ekz_arr[l];
+      }
+
+      // convert E-field to force
+
+      const flt_t qfactor = fqqrd2es * q[i];
+      f[i][0] += qfactor*ekx;
+      f[i][1] += qfactor*eky;
+      if (slabflag != 2) f[i][2] += qfactor*ekz;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+   for ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
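+  // ad scheme sketch: u_brick holds the potential, so E is built from
+  // derivative weights, e.g. Ex ~ sum drho_x*rho_y*rho_z * u_brick, scaled
+  // by hx_inv = nx_pppm/xprd; the sin() terms with sf_coeff subtract the
+  // self-force an atom exerts on itself through the grid
+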
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
+  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
+  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+    
+    double *q = atom->q;
+    double **x = atom->x;
+    double **f = atom->f;    
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm/xprd;
+    const flt_t hy_inv = ny_pppm/yprd;
+    const flt_t hz_inv = nz_pppm/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff[0];
+    const flt_t fsf_coeff1 = sf_coeff[1];
+    const flt_t fsf_coeff2 = sf_coeff[2];
+    const flt_t fsf_coeff3 = sf_coeff[3];
+    const flt_t fsf_coeff4 = sf_coeff[4];
+    const flt_t fsf_coeff5 = sf_coeff[5];
+  
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      int nxsum = nx + nlower;
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+          drho[0][k] = drho_lookup[idx][k];
+          drho[1][k] = drho_lookup[idy][k];
+          drho[2][k] = drho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+  
+          r1 = rho_coeff[order-1][k];
+          r2 = rho_coeff[order-1][k];
+          r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1 * dx;
+            r2 = rho_coeff[l][k] + r2 * dy;
+            r3 = rho_coeff[l][k] + r3 * dz;
+            dr1 = drho_coeff[l][k] + dr1 * dx;
+            dr2 = drho_coeff[l][k] + dr2 * dy;
+            dr3 = drho_coeff[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+          drho[0][k-nlower] = dr1;
+          drho[1][k-nlower] = dr2;
+          drho[2][k-nlower] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
+            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
+            ekz[l] +=  rho[0][l] * ekz_p * u_brick[mz][my][mx];
+          }
+        }
+      }
+  
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+  
+      // convert E-field to force
+  
+      const flt_t qfactor = fqqrd2es * q[i];
+      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
+  
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoqsq;
+      f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf;
+  
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoqsq;
+      f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf;
+  
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoqsq;
+  
+      if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for geometric mixing rule
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
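+  // same ik interpolation as fieldforce_c_ik, but from the dispersion bricks
+  // vd*_brick_g, with the force prefactor lj = B[type] instead of a charge
+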
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double lj;
+    int type;
+    double **x = atom->x;
+    double **f = atom->f;
+  
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx];
+            eky_arr[l] -= x0*vdy_brick_g[mz][my][mx];
+            ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx];
+          }
+        }
+      }
+
+      FFT_SCALAR ekx, eky, ekz;
+      ekx = eky = ekz = ZEROF;
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx += ekx_arr[l];
+        eky += eky_arr[l];
+        ekz += ekz_arr[l];
+      }
+
+      // convert E-field to force
+
+      type = atom->type[i];
+      lj = B[type];
+      f[i][0] += lj*ekx;
+      f[i][1] += lj*eky;
+      if (slabflag != 2) f[i][2] += lj*ekz;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for geometric mixing rule for ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle  
+
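+  // ad analogue of fieldforce_c_ad for geometric mixing; the self-force term
+  // scales with 2*lj*lj (twoljsq below), the dispersion analogue of 2*q*q
+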
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
+  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
+  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+    
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+  
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+  
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx];
+            eky[l] +=  rho[0][l] * eky_p * u_brick_g[mz][my][mx];
+            ekz[l] +=  rho[0][l] * ekz_p * u_brick_g[mz][my][mx];
+          }
+        }
+      }
+  
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+  
+      // convert E-field to force
+
+      const int type = atom->type[i];
+      const flt_t lj = B[type];
+      const flt_t twoljsq = (flt_t)2.0 * lj * lj;
+  
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoljsq;
+      f[i][0] += lj * particle_ekx[i] - sf;
+  
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoljsq;
+      f[i][1] += lj * particle_eky[i] - sf;
+  
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoljsq;
+  
+      if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for arithmetic mixing rule and ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
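+  // arithmetic mixing: one set of stencil weights gathers from all seven
+  // brick sets (21 accumulators), and the force recombines them as
+  //   f_x ~ sum_k lj_k * ekx_k   with lj_k = B[7*type+6-k]
+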
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+    double **x = atom->x;
+    double **f = atom->f;
+  
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+      _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};   
+      
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx];
+            eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx];
+            ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx];
+            ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx];
+            eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx];
+            ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx];
+            ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx];
+            eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx];
+            ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx];
+            ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx];
+            eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx];
+            ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx];
+            ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx];
+            eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx];
+            ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx];
+            ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx];
+            eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx];
+            ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx];
+            ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx];
+            eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx];
+            ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx];
+          }
+        }
+      }
+
+      FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2;
+      FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5;
+      FFT_SCALAR ekx6, eky6, ekz6;
+      ekx0 = eky0 = ekz0 = ZEROF;
+      ekx1 = eky1 = ekz1 = ZEROF;
+      ekx2 = eky2 = ekz2 = ZEROF;
+      ekx3 = eky3 = ekz3 = ZEROF;
+      ekx4 = eky4 = ekz4 = ZEROF;
+      ekx5 = eky5 = ekz5 = ZEROF;
+      ekx6 = eky6 = ekz6 = ZEROF;      
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        ekx0 += ekx0_arr[l];
+        eky0 += eky0_arr[l];
+        ekz0 += ekz0_arr[l];
+        ekx1 += ekx1_arr[l];
+        eky1 += eky1_arr[l];
+        ekz1 += ekz1_arr[l];
+        ekx2 += ekx2_arr[l];
+        eky2 += eky2_arr[l];
+        ekz2 += ekz2_arr[l];
+        ekx3 += ekx3_arr[l];
+        eky3 += eky3_arr[l];
+        ekz3 += ekz3_arr[l];
+        ekx4 += ekx4_arr[l];
+        eky4 += eky4_arr[l];
+        ekz4 += ekz4_arr[l];
+        ekx5 += ekx5_arr[l];
+        eky5 += eky5_arr[l];
+        ekz5 += ekz5_arr[l];
+        ekx6 += ekx6_arr[l];
+        eky6 += eky6_arr[l];
+        ekz6 += ekz6_arr[l];
+      }
+
+      // convert D-field to force
+
+      const int type = atom->type[i];
+      const FFT_SCALAR lj0 = B[7*type+6];
+      const FFT_SCALAR lj1 = B[7*type+5];
+      const FFT_SCALAR lj2 = B[7*type+4];
+      const FFT_SCALAR lj3 = B[7*type+3];
+      const FFT_SCALAR lj4 = B[7*type+2];
+      const FFT_SCALAR lj5 = B[7*type+1];
+      const FFT_SCALAR lj6 = B[7*type];
+      
+      f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + 
+	lj4*ekx4 + lj5*ekx5 + lj6*ekx6;
+      f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + 
+	lj4*eky4 + lj5*eky5 + lj6*eky6;
+      if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + 
+			   lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for arithmetic mixing rule for the ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle  
+
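+  // ad analogue of fieldforce_a_ik: seven per-split fields are interpolated
+  // from the u_brick_a* potentials with derivative weights; the self-force
+  // prefactor becomes the cross-term sum 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 +
+  // 2*lj3*lj3 (the arithmetic-mixing analogue of 2*q*q)
+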
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0;
+  FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0;
+  FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0;
+  FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1;
+  FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1;
+  FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1;  
+  FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2;
+  FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2;
+  FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2;
+  FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3;
+  FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3;
+  FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3;
+  FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4;
+  FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4;
+  FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4;
+  FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5;
+  FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5;
+  FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5;
+  FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6;
+  FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6;
+  FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6;
+  
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+    
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+  
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+  
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};      
+      _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};     
+
+      particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF;
+      particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; 
+      particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF;
+      particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF;
+      particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF;
+      particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF;
+      particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF;
+      
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order_6; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order_6; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            FFT_SCALAR x0 = drho[0][l] * ekx_p;
+            FFT_SCALAR y0 = rho[0][l] * eky_p;
+            FFT_SCALAR z0 = rho[0][l] * ekz_p;
+
+            ekx0[l] +=  x0 * u_brick_a0[mz][my][mx];
+            eky0[l] +=  y0 * u_brick_a0[mz][my][mx];
+            ekz0[l] +=  z0 * u_brick_a0[mz][my][mx];
+            ekx1[l] +=  x0 * u_brick_a1[mz][my][mx];
+            eky1[l] +=  y0 * u_brick_a1[mz][my][mx];
+            ekz1[l] +=  z0 * u_brick_a1[mz][my][mx];
+            ekx2[l] +=  x0 * u_brick_a2[mz][my][mx];
+            eky2[l] +=  y0 * u_brick_a2[mz][my][mx];
+            ekz2[l] +=  z0 * u_brick_a2[mz][my][mx];
+            ekx3[l] +=  x0 * u_brick_a3[mz][my][mx];
+            eky3[l] +=  y0 * u_brick_a3[mz][my][mx];
+            ekz3[l] +=  z0 * u_brick_a3[mz][my][mx];
+            ekx4[l] +=  x0 * u_brick_a4[mz][my][mx];
+            eky4[l] +=  y0 * u_brick_a4[mz][my][mx];
+            ekz4[l] +=  z0 * u_brick_a4[mz][my][mx];
+            ekx5[l] +=  x0 * u_brick_a5[mz][my][mx];
+            eky5[l] +=  y0 * u_brick_a5[mz][my][mx];
+            ekz5[l] +=  z0 * u_brick_a5[mz][my][mx];
+            ekx6[l] +=  x0 * u_brick_a6[mz][my][mx];
+            eky6[l] +=  y0 * u_brick_a6[mz][my][mx];
+            ekz6[l] +=  z0 * u_brick_a6[mz][my][mx];	    	    
+          }
+        }
+      }
+  
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+      	particle_ekx0[i] += ekx0[l];
+      	particle_eky0[i] += eky0[l];
+      	particle_ekz0[i] += ekz0[l];
+      	particle_ekx1[i] += ekx1[l];
+      	particle_eky1[i] += eky1[l];
+      	particle_ekz1[i] += ekz1[l];
+      	particle_ekx2[i] += ekx2[l];
+      	particle_eky2[i] += eky2[l];
+      	particle_ekz2[i] += ekz2[l];
+      	particle_ekx3[i] += ekx3[l];
+      	particle_eky3[i] += eky3[l];
+      	particle_ekz3[i] += ekz3[l];
+      	particle_ekx4[i] += ekx4[l];
+      	particle_eky4[i] += eky4[l];
+      	particle_ekz4[i] += ekz4[l];
+      	particle_ekx5[i] += ekx5[l];
+      	particle_eky5[i] += eky5[l];
+      	particle_ekz5[i] += ekz5[l];
+      	particle_ekx6[i] += ekx6[l];
+      	particle_eky6[i] += eky6[l];
+      	particle_ekz6[i] += ekz6[l];	
+      }
+    }
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx0[i] *= hx_inv;
+      particle_eky0[i] *= hy_inv;
+      particle_ekz0[i] *= hz_inv;
+      particle_ekx1[i] *= hx_inv;
+      particle_eky1[i] *= hy_inv;
+      particle_ekz1[i] *= hz_inv;
+      particle_ekx2[i] *= hx_inv;
+      particle_eky2[i] *= hy_inv;
+      particle_ekz2[i] *= hz_inv;
+      particle_ekx3[i] *= hx_inv;
+      particle_eky3[i] *= hy_inv;
+      particle_ekz3[i] *= hz_inv;
+      particle_ekx4[i] *= hx_inv;
+      particle_eky4[i] *= hy_inv;
+      particle_ekz4[i] *= hz_inv;
+      particle_ekx5[i] *= hx_inv;
+      particle_eky5[i] *= hy_inv;
+      particle_ekz5[i] *= hz_inv;
+      particle_ekx6[i] *= hx_inv;
+      particle_eky6[i] *= hy_inv;
+      particle_ekz6[i] *= hz_inv;      
+  
+      // convert D-field to force
+
+      const int type = atom->type[i];
+      const FFT_SCALAR lj0 = B[7*type+6];
+      const FFT_SCALAR lj1 = B[7*type+5];
+      const FFT_SCALAR lj2 = B[7*type+4];
+      const FFT_SCALAR lj3 = B[7*type+3];
+      const FFT_SCALAR lj4 = B[7*type+2];
+      const FFT_SCALAR lj5 = B[7*type+1];
+      const FFT_SCALAR lj6 = B[7*type];
+  
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] + 
+	lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] + 
+	lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf;      
+  
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] + 
+	lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] + 
+	lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf;
+  
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3;
+      if (slabflag != 2)
+        f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] +
+          lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] +
+          lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf;
+    }
+  }
+}
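+
+/* ----------------------------------------------------------------------
+   a minimal sketch (illustration only, not called by the solver) of the
+   accumulation pattern used in the kernels above: partial sums go into
+   a fixed-size, 64-byte aligned scratch array so the inner stencil loop
+   vectorizes over a compile-time trip count, and the scalar result is
+   reduced in a separate pass; the helper name is hypothetical
+------------------------------------------------------------------------- */
+
+static double sketch_stencil_sum(const double *w, const double *grid)
+{
+  _alignvar(double acc[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++)   // padded width
+    acc[l] += w[l] * grid[l];
+  double sum = 0.0;
+  for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++)   // reduce
+    sum += acc[l];
+  return sum;
+}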
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for no mixing rule and ik scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // loop over my particles, interpolate dispersion field from nearby grid pts
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of particle
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double lj;
+    int type;
+    double **x = atom->x;
+    double **f = atom->f;
+  
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho0[k] = rho6_lookup[idx][k];
+          rho1[k] = rho6_lookup[idy][k];
+          rho2[k] = rho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r2 = rho_coeff_6[order_6-1][k];
+          FFT_SCALAR r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1*dx;
+            r2 = rho_coeff_6[l][k] + r2*dy;
+            r3 = rho_coeff_6[l][k] + r3*dz;
+          }
+
+          rho0[k-nlower_6] = r1;
+          rho1[k-nlower_6] = r2;
+          rho2[k-nlower_6] = r3;
+        }
+      }
+
+
+      _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
+      _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
+      _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64);
+
+      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
+	ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF;
+      }
+
+      for (int k = 0; k < nsplit; k++) {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+	for (int n = 0; n < order_6; n++) {
+	  int mz = n+nzsum;
+	  FFT_SCALAR z0 = rho2[n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count=7
+          #endif   
+	  for (int m = 0; m < order_6; m++) {
+	    int my = m+nysum;
+	    FFT_SCALAR y0 = z0*rho1[m];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma simd
+            #endif   
+	    for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+	      int mx = l+nxsum;
+	      FFT_SCALAR x0 = y0*rho0[l];
+              ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
+		x0*vdx_brick_none[k][mz][my][mx];
+              eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
+		x0*vdy_brick_none[k][mz][my][mx];
+              ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= 
+		x0*vdz_brick_none[k][mz][my][mx];
+	    }
+	  }
+	}
+      }
+
+      _alignvar(FFT_SCALAR ekx[nsplit], 64);
+      _alignvar(FFT_SCALAR eky[nsplit], 64);
+      _alignvar(FFT_SCALAR ekz[nsplit], 64);
+      for (int k = 0; k < nsplit; k++) {
+        ekx[k] = eky[k] = ekz[k] = ZEROF;
+      }
+
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        for (int k = 0; k < nsplit; k++) {
+          ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+          eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+          ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l];
+        }
+      }
+
+      // convert D-field to force
+
+      type = atom->type[i];
+      for (int k = 0; k < nsplit; k++) {
+	lj = B[nsplit*type + k];
+	f[i][0] += lj*ekx[k];
+	f[i][1] += lj*eky[k];
+	if (slabflag != 2) f[i][2] += lj*ekz[k];
+      }
+    }
+  }
+}
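+
+/* ----------------------------------------------------------------------
+   sketch only: the flat indexing used by the none-mixing kernels; one
+   contiguous array holds nsplit accumulators of aligned stencil width,
+   so the innermost loop over l stays unit-stride for every split term
+   k; the helper is hypothetical and shown for illustration
+------------------------------------------------------------------------- */
+
+static inline int sketch_split_index(const int k, const int l)
+{
+  return k*INTEL_P3M_ALIGNED_MAXORDER + l;   // accumulator row k, lane l
+}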
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get dispersion field & force on my particles
+   for no mixing rule and the ad scheme
+------------------------------------------------------------------------- */
+
+template<class flt_t, class acc_t, int use_table>
+void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // loop over my particles, interpolate dispersion field from nearby grid pts
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of particle
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of dispersion field on particle
+
+  int nlocal = atom->nlocal;
+  int nthr = comm->nthreads;
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
+  #endif
+  {
+
+    double *prd;
+    if (triclinic == 0) prd = domain->prd;
+    else prd = domain->prd_lamda;
+    
+    double **x = atom->x;
+    double **f = atom->f;
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv_6;
+    const flt_t yi = delyinv_6;
+    const flt_t zi = delzinv_6;
+    const flt_t fshiftone = shiftone_6;
+
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2]*slab_volfactor;
+
+    const flt_t hx_inv = nx_pppm_6/xprd;
+    const flt_t hy_inv = ny_pppm_6/yprd;
+    const flt_t hz_inv = nz_pppm_6/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff_6[0];
+    const flt_t fsf_coeff1 = sf_coeff_6[1];
+    const flt_t fsf_coeff2 = sf_coeff_6[2];
+    const flt_t fsf_coeff3 = sf_coeff_6[3];
+    const flt_t fsf_coeff4 = sf_coeff_6[4];
+    const flt_t fsf_coeff5 = sf_coeff_6[5];
+  
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid_6[i][0];
+      int ny = part2grid_6[i][1];
+      int nz = part2grid_6[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi;
+  
+      int nxsum = nx + nlower_6;
+      int nysum = ny + nlower_6;
+      int nzsum = nz + nlower_6;
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho6_lookup[idx][k];
+          rho[1][k] = rho6_lookup[idy][k];
+          rho[2][k] = rho6_lookup[idz][k];
+          drho[0][k] = drho6_lookup[idx][k];
+          drho[1][k] = drho6_lookup[idy][k];
+          drho[2][k] = drho6_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower_6; k <= nupper_6; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+  
+          r1 = rho_coeff_6[order_6-1][k];
+          r2 = rho_coeff_6[order_6-1][k];
+          r3 = rho_coeff_6[order_6-1][k];
+          for (int l = order_6-2; l >= 0; l--) {
+            r1 = rho_coeff_6[l][k] + r1 * dx;
+            r2 = rho_coeff_6[l][k] + r2 * dy;
+            r3 = rho_coeff_6[l][k] + r3 * dz;
+            dr1 = drho_coeff_6[l][k] + dr1 * dx;
+            dr2 = drho_coeff_6[l][k] + dr2 * dy;
+            dr3 = drho_coeff_6[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower_6] = r1;
+          rho[1][k-nlower_6] = r2;
+          rho[2][k-nlower_6] = r3;
+          drho[0][k-nlower_6] = dr1;
+          drho[1][k-nlower_6] = dr2;
+          drho[2][k-nlower_6] = dr3;
+        }
+      }
+      _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+      _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64);
+
+      for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        ekx[k] = eky[k] = ekz[k] = ZEROF;
+      }
+
+      for (int k = 0; k < nsplit; k++) { 
+	particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+	for (int n = 0; n < order_6; n++) {
+	  int mz = n + nzsum;
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma loop_count=7
+          #endif   
+	  for (int m = 0; m < order_6; m++) {
+	    int my = m + nysum;
+	    FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+	    FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+	    FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma simd
+            #endif   
+	    for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+	      int mx = l + nxsum;
+	      ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * 
+		u_brick_none[k][mz][my][mx];
+	      eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * eky_p * 
+		u_brick_none[k][mz][my][mx];
+	      ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] +=  rho[0][l] * ekz_p * 
+		u_brick_none[k][mz][my][mx];
+	    }
+	  }
+	}
+      }
+      
+      _alignvar(FFT_SCALAR ekx_tot[nsplit], 64);
+      _alignvar(FFT_SCALAR eky_tot[nsplit], 64);
+      _alignvar(FFT_SCALAR ekz_tot[nsplit], 64);
+      for (int k = 0; k < nsplit; k++) {
+	ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF;
+      }
+      
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){
+	for (int k = 0; k < nsplit; k++) {
+	  ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+	  eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+	  ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l];
+	}
+      }
+
+      for (int k = 0; k < nsplit; k++) {
+	ekx_tot[k] *= hx_inv;
+	eky_tot[k] *= hy_inv;
+	ekz_tot[k] *= hz_inv;
+      }
+      // convert D-field to force
+
+      const int type = atom->type[i];
+  
+      const flt_t s1 = x[i][0] * hx_inv;
+      const flt_t s2 = x[i][1] * hy_inv;
+      const flt_t s3 = x[i][2] * hz_inv;
+      flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf1 += fsf_coeff1 * sin(ffour_pi * s1);
+  
+      flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf2 += fsf_coeff3 * sin(ffour_pi * s2);
+  
+      flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf3 += fsf_coeff5 * sin(ffour_pi * s3); 
+      for (int k = 0; k < nsplit; k++) {
+	const flt_t lj = B[nsplit*type + k];
+	const flt_t twoljsq = lj*lj * B[k] * 2;
+	flt_t sf = sf1*twoljsq;
+	f[i][0] += lj * ekx_tot[k] - sf;
+	sf = sf2*twoljsq;
+	f[i][1] += lj * eky_tot[k] - sf;
+	sf = sf3*twoljsq;
+        if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf;
+      }
+    }
+  }
+}
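+
+/* ----------------------------------------------------------------------
+   sketch of the ad field estimate used above: the ad scheme
+   differentiates the assignment weights instead of the grid data, so
+   the x component combines drho in x with rho in y and z; this is a
+   hypothetical scalar helper for one particle and one component, while
+   the real kernel vectorizes over the padded stencil width
+------------------------------------------------------------------------- */
+
+static double sketch_ad_field_x(const double *drho_x, const double *rho_y,
+                                const double *rho_z, const double ***u,
+                                const int order, const double hx_inv)
+{
+  double ekx = 0.0;
+  for (int n = 0; n < order; n++)
+    for (int m = 0; m < order; m++)
+      for (int l = 0; l < order; l++)
+        ekx += drho_x[l] * rho_y[m] * rho_z[n] * u[n][m][l];
+  return ekx*hx_inv;   // convert from grid to box units
+}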
+
+/* ----------------------------------------------------------------------
+   precompute rho coefficients as a lookup table to save time in make_rho
+   and fieldforce.  Instead of evaluating this polynomial for every atom
+   six times per time step, precompute it for a fixed number of points.
+------------------------------------------------------------------------- */
+
+void PPPMDispIntel::precompute_rho()
+{
+
+  half_rho_scale = (rho_points - 1.)/2.;
+  half_rho_scale_plus = half_rho_scale + 0.5;
+
+  for (int i = 0; i < rho_points; i++) {
+    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int k = nlower; k <= nupper; k++) {
+      FFT_SCALAR r1 = ZEROF;
+      for (int l = order-1; l >= 0; l--) {
+        r1 = rho_coeff[l][k] + r1*dx;
+      }
+      rho_lookup[i][k-nlower] = r1;
+    }
+    for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+      rho_lookup[i][k] = 0;
+    }
+    if (differentiation_flag == 1) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower; k <= nupper; k++) {
+        FFT_SCALAR r1 = ZEROF;
+        for (int l = order-2; l >= 0; l--) {
+          r1 = drho_coeff[l][k] + r1*dx;
+        }
+        drho_lookup[i][k-nlower] = r1;
+      }
+      for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        drho_lookup[i][k] = 0;
+      }
+    }
+  }
+  for (int i = 0; i < rho_points; i++) {
+    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int k = nlower_6; k <= nupper_6; k++) {
+      FFT_SCALAR r1 = ZEROF;
+      for (int l = order_6-1; l >= 0; l--) {
+        r1 = rho_coeff_6[l][k] + r1*dx;
+      }
+      rho6_lookup[i][k-nlower_6] = r1;
+    }
+    for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+      rho6_lookup[i][k] = 0;
+    }
+    if (differentiation_flag == 1) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower_6; k <= nupper_6; k++) {
+        FFT_SCALAR r1 = ZEROF;
+        for (int l = order_6-2; l >= 0; l--) {
+          r1 = drho_coeff_6[l][k] + r1*dx;
+        }
+        drho6_lookup[i][k-nlower_6] = r1;
+      }
+      for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        drho6_lookup[i][k] = 0;
+      }
+    }
+  }
+}
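+
+/* ----------------------------------------------------------------------
+   sketch of the table addressing that pairs with precompute_rho(): the
+   fractional offset dx lies in [-1,1], so scaling by (rho_points-1)/2,
+   shifting by the same amount, and adding 0.5 for rounding selects the
+   nearest precomputed row; hypothetical helper, illustration only
+------------------------------------------------------------------------- */
+
+static inline int sketch_rho_row(const double dx, const int npoints)
+{
+  const double half_scale = (npoints - 1.0)*0.5;
+  return static_cast<int>(dx*half_scale + half_scale + 0.5);
+}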
+
+/* ----------------------------------------------------------------------
+   Returns whether the base PPPM implementation is used in place of the
+   Intel optimizations because of offload
+------------------------------------------------------------------------- */
+
+#ifdef _LMP_INTEL_OFFLOAD
+int PPPMDispIntel::use_base() {
+  return _use_base;
+}
+#endif
diff --git a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h
new file mode 100644
index 0000000000..166152004e
--- /dev/null
+++ b/src/USER-INTEL/pppm_disp_intel.h
@@ -0,0 +1,238 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: William McDoniel (RWTH Aachen University)
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/disp/intel,PPPMDispIntel)
+
+#else
+
+#ifndef LMP_PPPMINTEL_DISP_H
+#define LMP_PPPMINTEL_DISP_H
+
+#include "pppm_disp.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMDispIntel : public PPPMDisp {
+  public:
+    PPPMDispIntel(class LAMMPS *, int, char **);
+    virtual ~PPPMDispIntel();
+    virtual void init();
+    virtual void compute(int, int);
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    int use_base();
+    #endif
+    
+  protected:
+    FixIntel *fix;
+
+    int _use_lrt;
+    FFT_SCALAR **perthread_density;
+    FFT_SCALAR *particle_ekx;
+    FFT_SCALAR *particle_eky;
+    FFT_SCALAR *particle_ekz;
+    FFT_SCALAR *particle_ekx0;
+    FFT_SCALAR *particle_eky0;
+    FFT_SCALAR *particle_ekz0;
+    FFT_SCALAR *particle_ekx1;
+    FFT_SCALAR *particle_eky1;
+    FFT_SCALAR *particle_ekz1;
+    FFT_SCALAR *particle_ekx2;
+    FFT_SCALAR *particle_eky2;
+    FFT_SCALAR *particle_ekz2;
+    FFT_SCALAR *particle_ekx3;
+    FFT_SCALAR *particle_eky3;
+    FFT_SCALAR *particle_ekz3;
+    FFT_SCALAR *particle_ekx4;
+    FFT_SCALAR *particle_eky4;
+    FFT_SCALAR *particle_ekz4;
+    FFT_SCALAR *particle_ekx5;
+    FFT_SCALAR *particle_eky5;
+    FFT_SCALAR *particle_ekz5;
+    FFT_SCALAR *particle_ekx6;
+    FFT_SCALAR *particle_eky6;
+    FFT_SCALAR *particle_ekz6;
+
+    int _use_table;
+    int rho_points;
+    FFT_SCALAR **rho_lookup;
+    FFT_SCALAR **rho6_lookup;
+    FFT_SCALAR **drho_lookup;
+    FFT_SCALAR **drho6_lookup;
+    FFT_SCALAR half_rho_scale, half_rho_scale_plus;
+
+    int _use_packing;
+
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    int _use_base;
+    #endif
+    
+    template<class flt_t, class acc_t>
+    void particle_map(double, double, double,
+                      double, int **, int, int,
+                      int, int, int,
+                      int, int, int,
+                      IntelBuffers<flt_t,acc_t> *buffers);
+    
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_c<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_c<flt_t,acc_t,0>(buffers);
+      }
+    }
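+    // note: this two-level pattern, repeated for each kernel below,
+    // turns the run-time _use_table setting into a compile-time template
+    // argument, so each instantiation is compiled with the table branch
+    // hoisted out of the inner loops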
+  
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_g<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_g<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_a<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_a<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    
+    template<class flt_t, class acc_t, int use_table>
+    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        make_rho_none<flt_t,acc_t,1>(buffers);
+      } else {
+        make_rho_none<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_c_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_c_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_c_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_c_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_g_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_g_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_g_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_g_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_a_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_a_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_a_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_a_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_none_ik<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_none_ik<flt_t,acc_t,0>(buffers);
+      }
+    }
+    
+    template<class flt_t, class acc_t, int use_table>
+    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers);
+    template<class flt_t, class acc_t>
+    void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+      if (_use_table == 1) {
+        fieldforce_none_ad<flt_t,acc_t,1>(buffers);
+      } else {
+        fieldforce_none_ad<flt_t,acc_t,0>(buffers);
+      }
+    }
+
+    void precompute_rho();
+    
+  };
+
+}
+#endif
+#endif
diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp
index c420a23bf4..42bdec46ee 100644
--- a/src/USER-INTEL/pppm_intel.cpp
+++ b/src/USER-INTEL/pppm_intel.cpp
@@ -12,7 +12,9 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Rodrigo Canales (RWTH Aachen University)
+   Contributing authors: William McDoniel (RWTH Aachen University)
+                         Rodrigo Canales (RWTH Aachen University)
+                         Markus Hoehnerbach (RWTH Aachen University)
                          W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
@@ -22,6 +24,7 @@
 #include "pppm_intel.h"
 #include "atom.h"
 #include "error.h"
+#include "fft3d_wrap.h"
 #include "gridcomm.h"
 #include "math_const.h"
 #include "math_special.h"
@@ -54,10 +57,37 @@ enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
 PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
 {
   suffix_flag |= Suffix::INTEL;
+
+  order = 7;   // set default stencil size to 7
+
+  perthread_density = NULL;
+  particle_ekx = particle_eky = particle_ekz = NULL;
+  
+  rho_lookup = drho_lookup = NULL;
+  rho_points = 0;
+  
+  vdxy_brick = vdz0_brick = NULL;
+  work3 = NULL;
+  cg_pack = NULL;
+
+  _use_table = _use_packing = _use_lrt = 0;
 }
 
 PPPMIntel::~PPPMIntel()
 {
+  memory->destroy(perthread_density);
+  memory->destroy(particle_ekx);
+  memory->destroy(particle_eky);
+  memory->destroy(particle_ekz);
+
+  memory->destroy(rho_lookup);
+  memory->destroy(drho_lookup);
+
+  memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
+  memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
+  memory->destroy(work3);
+
+  delete cg_pack;
 }
 
 /* ----------------------------------------------------------------------
@@ -83,17 +113,64 @@ void PPPMIntel::init()
 
   fix->kspace_init_check();
 
+  _use_lrt = fix->lrt();
+
+  // allocate per-thread density arrays, padded at the end for vectorization;
+  // the first thread accumulates directly into the global density array
+  if ((comm->nthreads > 1) && !_use_lrt) {
+    memory->destroy(perthread_density);
+    memory->create(perthread_density, comm->nthreads-1,
+		   ngrid + INTEL_P3M_ALIGNED_MAXORDER,
+                   "pppmintel:perthread_density");
+  }
+  
+  _use_table = fix->pppm_table();
+  if (_use_table) {
+    rho_points = 5000;
+    memory->destroy(rho_lookup);
+    memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+		   "pppmintel:rho_lookup");
+    if(differentiation_flag == 1) {
+      memory->destroy(drho_lookup);
+      memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER,
+		     "pppmintel:drho_lookup");
+    }
+    precompute_rho();
+  }
+
   if (order > INTEL_P3M_MAXORDER)
     error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");
 
-  /*
-  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
-    pack_force_const(force_const_single, fix->get_mixed_buffers());
-  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
-    pack_force_const(force_const_double, fix->get_double_buffers());
-  else
-    pack_force_const(force_const_single, fix->get_single_buffers());
-  */
+  _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) 
+                              && (sizeof(FFT_SCALAR) == sizeof(float))
+                              && (differentiation_flag == 0);
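+  // packing interleaves the x and y field components at even/odd x
+  // indices of a double-width brick (and z with zero padding likewise),
+  // so a single 16-lane single-precision vector load in fieldforce_ik
+  // fetches both components of a stencil row at once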
+  if (_use_packing) {
+    memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
+    memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out);
+    memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, 
+			    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+			    "pppmintel:vdxy_brick");
+    memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out);
+    memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, 
+			    nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1,
+			    "pppmintel:vdz0_brick");
+    memory->destroy(work3);
+    memory->create(work3, 2*nfft_both, "pppmintel:work3");
+
+    // new communicator for the double-size bricks
+    delete cg_pack;
+    int (*procneigh)[2] = comm->procneigh;
+    cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in,
+			   nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1,
+			   nylo_out,nyhi_out,nzlo_out,nzhi_out,
+			   procneigh[0][0],procneigh[0][1],procneigh[1][0],
+			   procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+
+    cg_pack->ghost_notify();
+    cg_pack->setup();
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -154,8 +231,18 @@ void PPPMIntel::compute_first(int eflag, int vflag)
 
   if (atom->nmax > nmax) {
     memory->destroy(part2grid);
+    if (differentiation_flag == 1) {
+      memory->destroy(particle_ekx);
+      memory->destroy(particle_eky);
+      memory->destroy(particle_ekz);
+    }
     nmax = atom->nmax;
     memory->create(part2grid,nmax,3,"pppm:part2grid");
+    if (differentiation_flag == 1) {
+      memory->create(particle_ekx, nmax, "pppmintel:pekx");
+      memory->create(particle_eky, nmax, "pppmintel:peky");
+      memory->create(particle_ekz, nmax, "pppmintel:pekz");
+    }
   }
 
   // find grid points for all my particles
@@ -184,13 +271,19 @@ void PPPMIntel::compute_first(int eflag, int vflag)
   // return gradients (electric fields) in 3d brick decomposition
   // also performs per-atom calculations via poisson_peratom()
 
-  poisson();
+  if (differentiation_flag == 1) poisson_ad();
+  else poisson_ik_intel();
 
   // all procs communicate E-field values
   // to fill ghost cells surrounding their 3d bricks
 
   if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
-  else cg->forward_comm(this,FORWARD_IK);
+  else {
+    if (_use_packing)
+      cg_pack->forward_comm(this,FORWARD_IK);
+    else
+      cg->forward_comm(this,FORWARD_IK);
+  }
 
   // extra per-atom energy/virial communication
 
@@ -297,48 +390,60 @@ void PPPMIntel::compute_second(int eflag, int vflag)
 template<class flt_t, class acc_t>
 void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
 {
-  int nx,ny,nz;
-
   ATOM_T * _noalias const x = buffers->get_x(0);
   int nlocal = atom->nlocal;
+  int nthr;
+  if (_use_lrt)
+    nthr = 1;
+  else
+    nthr = comm->nthreads;
 
   int flag = 0;
 
   if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
     error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
 
-  const flt_t lo0 = boxlo[0];
-  const flt_t lo1 = boxlo[1];
-  const flt_t lo2 = boxlo[2];
-  const flt_t xi = delxinv;
-  const flt_t yi = delyinv;
-  const flt_t zi = delzinv;
-  const flt_t fshift = shift;
-
-  #if defined(LMP_SIMD_COMPILER)
-  #pragma vector aligned
-  #pragma simd
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) reduction(+:flag) if(!_use_lrt)
   #endif
-  for (int i = 0; i < nlocal; i++) {
-
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // current particle coord can be outside global and local box
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET;
-    ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET;
-    nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET;
-
-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-
-    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
-        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
-      flag = 1;
+  {
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshift = shift;
+
+    int iifrom, iito, tid;
+    IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T));
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma vector aligned
+    #pragma simd reduction(+:flag)
+    #endif
+    for (int i = iifrom; i < iito; i++) {
+
+      // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+      // current particle coord can be outside global and local box
+      // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
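+      // e.g. static_cast<int>(-0.75 + OFFSET) - OFFSET == -1, i.e.
+      // floor-like truncation for slightly negative coordinates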
+
+      int nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET;
+      int ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET;
+      int nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET;
+
+      part2grid[i][0] = nx;
+      part2grid[i][1] = ny;
+      part2grid[i][2] = nz;
+
+      // check that entire stencil around nx,ny,nz will fit in my 3d brick
+
+      if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
+          ny+nlower < nylo_out || ny+nupper > nyhi_out ||
+          nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+        flag = 1;
+    }
   }
 
   if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM");
@@ -352,13 +457,11 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
    in global grid
 ------------------------------------------------------------------------- */
 
-template<class flt_t, class acc_t>
+template<class flt_t, class acc_t, int use_table>
 void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
 {
-  // clear 3d density array
-
-  memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0,
-         ngrid*sizeof(FFT_SCALAR));
+  FFT_SCALAR * _noalias global_density =
+    &(density_brick[nzlo_out][nylo_out][nxlo_out]);
 
   // loop over my charges, add their contribution to nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -368,52 +471,129 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
   ATOM_T * _noalias const x = buffers->get_x(0);
   flt_t * _noalias const q = buffers->get_q(0);
   int nlocal = atom->nlocal;
+  int nthr;
+  if (_use_lrt)
+    nthr = 1;
+  else
+    nthr = comm->nthreads;
 
-  const flt_t lo0 = boxlo[0];
-  const flt_t lo1 = boxlo[1];
-  const flt_t lo2 = boxlo[2];
-  const flt_t xi = delxinv;
-  const flt_t yi = delyinv;
-  const flt_t zi = delzinv;
-  const flt_t fshift = shift;
-  const flt_t fshiftone = shiftone;
-  const flt_t fdelvolinv = delvolinv;
-
-  for (int i = 0; i < nlocal; i++) {
-
-    int nx = part2grid[i][0];
-    int ny = part2grid[i][1];
-    int nz = part2grid[i][2];
-    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
-    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
-    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
-
-    flt_t rho[3][INTEL_P3M_MAXORDER];
-
-    for (int k = nlower; k <= nupper; k++) {
-      FFT_SCALAR r1,r2,r3;
-      r1 = r2 = r3 = ZEROF;
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nthr, nlocal, global_density) if(!_use_lrt)
+  #endif
+  {
+    const int nix = nxhi_out - nxlo_out + 1;
+    const int niy = nyhi_out - nylo_out + 1;
+  
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshift = shift;
+    const flt_t fshiftone = shiftone;
+    const flt_t fdelvolinv = delvolinv;
 
-      for (int l = order-1; l >= 0; l--) {
-        r1 = rho_coeff[l][k] + r1*dx;
-        r2 = rho_coeff[l][k] + r2*dy;
-        r3 = rho_coeff[l][k] + r3*dz;
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+    FFT_SCALAR * _noalias my_density = tid == 0 ?
+      global_density : perthread_density[tid - 1];
+    // clear 3d density array
+    memset(my_density, 0, ngrid * sizeof(FFT_SCALAR));
+
+    for (int i = ifrom; i < ito; i++) {
+  
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nysum = nlower + ny - nylo_out;
+      int nxsum = nlower + nx - nxlo_out;
+      int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
+  
+      _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3;
+          r1 = r2 = r3 = ZEROF;
+  
+          for (int l = order-1; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+        }
+      }
+  
+      FFT_SCALAR z0 = fdelvolinv * q[i];
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n*nix*niy + nzsum;
+        FFT_SCALAR y0 = z0*rho[2][n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int mzy = m*nix + mz;
+          FFT_SCALAR x0 = y0*rho[1][m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mzyx = l + mzy;
+            my_density[mzyx] += x0*rho[0][l];
+          }
+        }
       }
-      rho[0][k-nlower] = r1;
-      rho[1][k-nlower] = r2;
-      rho[2][k-nlower] = r3;
     }
+  }
 
-    FFT_SCALAR z0 = fdelvolinv * q[i];
-    for (int n = nlower; n <= nupper; n++) {
-      int mz = n+nz;
-      FFT_SCALAR y0 = z0*rho[2][n-nlower];
-      for (int m = nlower; m <= nupper; m++) {
-        int my = m+ny;
-        FFT_SCALAR x0 = y0*rho[1][m-nlower];
-        for (int l = nlower; l <= nupper; l++) {
-          int mx = l+nx;
-          density_brick[mz][my][mx] += x0*rho[0][l-nlower];
+  // reduce all the perthread_densities into global_density
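+  // each thread owns a contiguous slice of grid points and sums the other
+  // threads' copies into it, so no atomics or locks are needed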
+  if (nthr > 1) {
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(nthr, global_density) if(!_use_lrt)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr);
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int i = ifrom; i < ito; i++) {
+        for(int j = 1; j < nthr; j++) {
+          global_density[i] += perthread_density[j-1][i];
         }
       }
     }
@@ -424,7 +604,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
    interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */
 
-template<class flt_t, class acc_t>
+template<class flt_t, class acc_t, int use_table, int use_packing>
 void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -437,68 +617,151 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
   flt_t * _noalias const q = buffers->get_q(0);
   FORCE_T * _noalias const f = buffers->get_f();
   int nlocal = atom->nlocal;
+  int nthr;
+  if (_use_lrt)
+    nthr = 1;
+  else
+    nthr = comm->nthreads;
 
-  const flt_t lo0 = boxlo[0];
-  const flt_t lo1 = boxlo[1];
-  const flt_t lo2 = boxlo[2];
-  const flt_t xi = delxinv;
-  const flt_t yi = delyinv;
-  const flt_t zi = delzinv;
-  const flt_t fshiftone = shiftone;
-  const flt_t fqqrd2es = qqrd2e * scale;
-
-  #if defined(LMP_SIMD_COMPILER)
-  #pragma vector aligned nontemporal
-  #pragma simd
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
   #endif
-  for (int i = 0; i < nlocal; i++) {
-    int nx = part2grid[i][0];
-    int ny = part2grid[i][1];
-    int nz = part2grid[i][2];
-    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
-    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
-    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
-
-    flt_t rho[3][INTEL_P3M_MAXORDER];
-
-    for (int k = nlower; k <= nupper; k++) {
-      FFT_SCALAR r1 = rho_coeff[order-1][k];
-      FFT_SCALAR r2 = rho_coeff[order-1][k];
-      FFT_SCALAR r3 = rho_coeff[order-1][k];
-      for (int l = order-2; l >= 0; l--) {
-        r1 = rho_coeff[l][k] + r1*dx;
-        r2 = rho_coeff[l][k] + r2*dy;
-        r3 = rho_coeff[l][k] + r3*dz;
+  {
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho0[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+    _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0};
+
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+
+      int nxsum = (use_packing ? 2 : 1) * (nx + nlower);
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          if (use_packing) {
+            rho0[2 * k] = rho_lookup[idx][k];
+            rho0[2 * k + 1] = rho_lookup[idx][k];
+          } else {
+            rho0[k] = rho_lookup[idx][k];
+          }
+          rho1[k] = rho_lookup[idy][k];
+          rho2[k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1 = rho_coeff[order-1][k];
+          FFT_SCALAR r2 = rho_coeff[order-1][k];
+          FFT_SCALAR r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+          if (use_packing) {
+            rho0[2 * (k-nlower)] = r1;
+            rho0[2 * (k-nlower) + 1] = r1;
+          } else {
+            rho0[k-nlower] = r1;
+          }
+          rho1[k-nlower] = r2;
+          rho2[k-nlower] = r3;
+        }
       }
-      rho[0][k-nlower] = r1;
-      rho[1][k-nlower] = r2;
-      rho[2][k-nlower] = r3;
-    }
 
-    FFT_SCALAR ekx, eky, ekz;
-    ekx = eky = ekz = ZEROF;
-    for (int n = nlower; n <= nupper; n++) {
-      int mz = n+nz;
-      FFT_SCALAR z0 = rho[2][n-nlower];
-      for (int m = nlower; m <= nupper; m++) {
-        int my = m+ny;
-        FFT_SCALAR y0 = z0*rho[1][m-nlower];
-        for (int l = nlower; l <= nupper; l++) {
-          int mx = l+nx;
-          FFT_SCALAR x0 = y0*rho[0][l-nlower];
-          ekx -= x0*vdx_brick[mz][my][mx];
-          eky -= x0*vdy_brick[mz][my][mx];
-          ekz -= x0*vdz_brick[mz][my][mx];
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekxy_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < (use_packing ? 2 : 1) *
+		 INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l+nxsum;
+            FFT_SCALAR x0 = y0*rho0[l];
+            if (use_packing) {
+              ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx];
+              ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx];
+            } else {
+              ekx_arr[l] -= x0*vdx_brick[mz][my][mx];
+              eky_arr[l] -= x0*vdy_brick[mz][my][mx];
+              ekz_arr[l] -= x0*vdz_brick[mz][my][mx];
+            }
+          }
         }
       }
-    }
 
-    // convert E-field to force
+      FFT_SCALAR ekx, eky, ekz;
+      ekx = eky = ekz = ZEROF;
+
+      if (use_packing) {
+        for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) {
+          ekx += ekxy_arr[l];
+          eky += ekxy_arr[l+1];
+          ekz += ekz0_arr[l];
+        }
+      } else {
+        for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+          ekx += ekx_arr[l];
+          eky += eky_arr[l];
+          ekz += ekz_arr[l];
+        }
+      }
 
-    const flt_t qfactor = fqqrd2es * q[i];
-    f[i].x += qfactor*ekx;
-    f[i].y += qfactor*eky;
-    if (slabflag != 2) f[i].z += qfactor*ekz;
+      // convert E-field to force
+
+      const flt_t qfactor = fqqrd2es * q[i];
+      f[i].x += qfactor*ekx;
+      f[i].y += qfactor*eky;
+      if (slabflag != 2) f[i].z += qfactor*ekz;
+    }
   }
 }
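+
+/* ----------------------------------------------------------------------
+   sketch of the packed reduction above: with packing enabled, even
+   lanes of the interleaved accumulator hold x contributions and odd
+   lanes hold y, so a single strided pass recovers both sums;
+   hypothetical helper, shown for illustration only
+------------------------------------------------------------------------- */
+
+static void sketch_unpack_xy(const double *ekxy, const int nlanes,
+                             double &ekx, double &eky)
+{
+  ekx = eky = 0.0;
+  for (int l = 0; l < nlanes; l += 2) {
+    ekx += ekxy[l];       // even lane: x component
+    eky += ekxy[l+1];     // odd lane:  y component
+  }
+}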
 
@@ -506,7 +769,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
    interpolate from grid to get electric field & force on my particles for ad
 ------------------------------------------------------------------------- */
 
-template<class flt_t, class acc_t>
+template<class flt_t, class acc_t, int use_table>
 void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -519,118 +782,434 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
   const flt_t * _noalias const q = buffers->get_q(0);
   FORCE_T * _noalias const f = buffers->get_f();
   int nlocal = atom->nlocal;
+  int nthr;
+  if (_use_lrt)
+    nthr = 1;
+  else
+    nthr = comm->nthreads;
+
+  FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx;
+  FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
+  FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
 
-  const flt_t ftwo_pi = MY_PI * 2.0;
-  const flt_t ffour_pi = MY_PI * 4.0;
-
-  const flt_t lo0 = boxlo[0];
-  const flt_t lo1 = boxlo[1];
-  const flt_t lo2 = boxlo[2];
-  const flt_t xi = delxinv;
-  const flt_t yi = delyinv;
-  const flt_t zi = delzinv;
-  const flt_t fshiftone = shiftone;
-  const flt_t fqqrd2es = qqrd2e * scale;
-
-  const double *prd = domain->prd;
-  const double xprd = prd[0];
-  const double yprd = prd[1];
-  const double zprd = prd[2];
-
-  const flt_t hx_inv = nx_pppm/xprd;
-  const flt_t hy_inv = ny_pppm/yprd;
-  const flt_t hz_inv = nz_pppm/zprd;
-
-  const flt_t fsf_coeff0 = sf_coeff[0];
-  const flt_t fsf_coeff1 = sf_coeff[1];
-  const flt_t fsf_coeff2 = sf_coeff[2];
-  const flt_t fsf_coeff3 = sf_coeff[3];
-  const flt_t fsf_coeff4 = sf_coeff[4];
-  const flt_t fsf_coeff5 = sf_coeff[5];
-
-  #if defined(LMP_SIMD_COMPILER)
-  #pragma vector aligned nontemporal
-  #pragma simd
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(nlocal, nthr) if(!_use_lrt)
   #endif
-  for (int i = 0; i < nlocal; i++) {
-    int nx = part2grid[i][0];
-    int ny = part2grid[i][1];
-    int nz = part2grid[i][2];
-    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
-    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
-    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
-
-    flt_t rho[3][INTEL_P3M_MAXORDER];
-    flt_t drho[3][INTEL_P3M_MAXORDER];
-
-    for (int k = nlower; k <= nupper; k++) {
-      FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
-      dr1 = dr2 = dr3 = ZEROF;
-
-      r1 = rho_coeff[order-1][k];
-      r2 = rho_coeff[order-1][k];
-      r3 = rho_coeff[order-1][k];
-      for (int l = order-2; l >= 0; l--) {
-        r1 = rho_coeff[l][k] + r1 * dx;
-        r2 = rho_coeff[l][k] + r2 * dy;
-        r3 = rho_coeff[l][k] + r3 * dz;
-	dr1 = drho_coeff[l][k] + dr1 * dx;
-	dr2 = drho_coeff[l][k] + dr2 * dy;
-	dr3 = drho_coeff[l][k] + dr3 * dz;
+  {
+    const flt_t ftwo_pi = MY_PI * 2.0;
+    const flt_t ffour_pi = MY_PI * 4.0;
+
+    const flt_t lo0 = boxlo[0];
+    const flt_t lo1 = boxlo[1];
+    const flt_t lo2 = boxlo[2];
+    const flt_t xi = delxinv;
+    const flt_t yi = delyinv;
+    const flt_t zi = delzinv;
+    const flt_t fshiftone = shiftone;
+    const flt_t fqqrd2es = qqrd2e * scale;
+
+    const double *prd = domain->prd;
+    const double xprd = prd[0];
+    const double yprd = prd[1];
+    const double zprd = prd[2];
+
+    const flt_t hx_inv = nx_pppm/xprd;
+    const flt_t hy_inv = ny_pppm/yprd;
+    const flt_t hz_inv = nz_pppm/zprd;
+
+    const flt_t fsf_coeff0 = sf_coeff[0];
+    const flt_t fsf_coeff1 = sf_coeff[1];
+    const flt_t fsf_coeff2 = sf_coeff[2];
+    const flt_t fsf_coeff3 = sf_coeff[3];
+    const flt_t fsf_coeff4 = sf_coeff[4];
+    const flt_t fsf_coeff5 = sf_coeff[5];
+  
+    int ifrom, ito, tid;
+    IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr);
+
+    _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+    _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+    for (int i = ifrom; i < ito; i++) {
+      int nx = part2grid[i][0];
+      int ny = part2grid[i][1];
+      int nz = part2grid[i][2];
+      FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
+  
+      int nxsum = nx + nlower;
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+  
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          rho[0][k] = rho_lookup[idx][k];
+          rho[1][k] = rho_lookup[idy][k];
+          rho[2][k] = rho_lookup[idz][k];
+          drho[0][k] = drho_lookup[idx][k];
+          drho[1][k] = drho_lookup[idy][k];
+          drho[2][k] = drho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif   
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
+          dr1 = dr2 = dr3 = ZEROF;
+  
+          r1 = rho_coeff[order-1][k];
+          r2 = rho_coeff[order-1][k];
+          r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1 * dx;
+            r2 = rho_coeff[l][k] + r2 * dy;
+            r3 = rho_coeff[l][k] + r3 * dz;
+            dr1 = drho_coeff[l][k] + dr1 * dx;
+            dr2 = drho_coeff[l][k] + dr2 * dy;
+            dr3 = drho_coeff[l][k] + dr3 * dz;
+          }
+          rho[0][k-nlower] = r1;
+          rho[1][k-nlower] = r2;
+          rho[2][k-nlower] = r3;
+          drho[0][k-nlower] = dr1;
+          drho[1][k-nlower] = dr2;
+          drho[2][k-nlower] = dr3;
+        }
+      }
+  
+      _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+  
+      particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF;
+  
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif   
+      for (int n = 0; n < order; n++) {
+        int mz = n + nzsum;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif   
+        for (int m = 0; m < order; m++) {
+          int my = m + nysum;
+          FFT_SCALAR ekx_p = rho[1][m] * rho[2][n];
+          FFT_SCALAR eky_p = drho[1][m] * rho[2][n];
+          FFT_SCALAR ekz_p = rho[1][m] * drho[2][n];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif   
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
+            eky[l] +=  rho[0][l] * eky_p * u_brick[mz][my][mx];
+            ekz[l] +=  rho[0][l] * ekz_p * u_brick[mz][my][mx];
+          }
+        }
+      }
+  
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+  
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+  
+      // convert E-field to force
+  
+      const flt_t qfactor = fqqrd2es * q[i];
+      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
+  
+      const flt_t s1 = x[i].x * hx_inv;
+      const flt_t s2 = x[i].y * hy_inv;
+      const flt_t s3 = x[i].z * hz_inv;
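+      // the sf terms subtract the spurious self force introduced by
+      // analytic (ad) differentiation, matching the correction in the
+      // base PPPM ad solver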
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoqsq;
+      f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
+  
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoqsq;
+      f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
+  
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoqsq;
+  
+      if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik differentiation
+   In packing mode, the x/y gradient grids are interleaved (and the z
+   gradient zero-padded) to avoid repeated copies and communications
+------------------------------------------------------------------------- */
+
+void PPPMIntel::poisson_ik_intel()
+{
+  if (_use_packing == 0) {
+    poisson_ik();
+    return;
+  }
+
+  int i,j,k,n;
+  double eng;
+
+  // transform charge density (r -> k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n++] = density_fft[i];
+    work1[n++] = ZEROF;
+  }
+
+  fft1->compute(work1,work1,1);
+
+  // global energy and virial contribution
+
+  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  double s2 = scaleinv*scaleinv;
+
+  if (eflag_global || vflag_global) {
+    if (vflag_global) {
+      n = 0;
+      for (i = 0; i < nfft; i++) {
+        eng = s2 * greensfn[i] * (work1[n]*work1[n] +
+                                  work1[n+1]*work1[n+1]);
+        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
+        if (eflag_global) energy += eng;
+        n += 2;
+      }
+    } else {
+      n = 0;
+      for (i = 0; i < nfft; i++) {
+        energy +=
+          s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
+        n += 2;
       }
-      rho[0][k-nlower] = r1;
-      rho[1][k-nlower] = r2;
-      rho[2][k-nlower] = r3;
-      drho[0][k-nlower] = dr1;
-      drho[1][k-nlower] = dr2;
-      drho[2][k-nlower] = dr3;
     }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n++] *= scaleinv * greensfn[i];
+    work1[n++] *= scaleinv * greensfn[i];
+  }
+
+  // extra FFTs for per-atom energy/virial
+
+  if (evflag_atom) poisson_peratom();
+
+  // triclinic system
+
+  if (triclinic) {
+    poisson_ik_triclinic();
+    return;
+  }
+
+  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of the packed vdxy and vdz0 bricks
+
+  // x direction gradient
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work2[n] = fkx[i]*work1[n+1];
+        work2[n+1] = -fkx[i]*work1[n];
+        n += 2;
+      }
+
+  fft2->compute(work2,work2,-1);
+
+  // y direction gradient
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work3[n] = fky[j]*work1[n+1];
+        work3[n+1] = -fky[j]*work1[n];
+        n += 2;
+      }
+
+  fft2->compute(work3,work3,-1);
+
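+  // interleave the x and y gradients into vdxy_brick so the forward
+  // ghost-grid communication can carry both components in one pass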
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxy_brick[k][j][2*i] = work2[n];
+        vdxy_brick[k][j][2*i+1] = work3[n];
+        n += 2;
+      }
+  
+  // z direction gradient
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work2[n] = fkz[k]*work1[n+1];
+        work2[n+1] = -fkz[k]*work1[n];
+        n += 2;
+      }
 
-    FFT_SCALAR ekx, eky, ekz;
-    ekx = eky = ekz = ZEROF;
-    for (int n = nlower; n <= nupper; n++) {
-      int mz = n+nz;
-      for (int m = nlower; m <= nupper; m++) {
-        int my = m+ny;
-        FFT_SCALAR ekx_p = rho[1][m-nlower] * rho[2][n-nlower];
-        FFT_SCALAR eky_p = drho[1][m-nlower] * rho[2][n-nlower];
-        FFT_SCALAR ekz_p = rho[1][m-nlower] * drho[2][n-nlower];
-        for (int l = nlower; l <= nupper; l++) {
-          int mx = l+nx;
-          ekx += drho[0][l-nlower] * ekx_p * u_brick[mz][my][mx];
-          eky += rho[0][l-nlower] * eky_p * u_brick[mz][my][mx];
-          ekz += rho[0][l-nlower] * ekz_p * u_brick[mz][my][mx];
+  fft2->compute(work2,work2,-1);
+
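+  // store the z gradient in the even slots of vdz0_brick and zero the
+  // odd slots so the brick shares the interleaved layout of vdxy_brick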
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdz0_brick[k][j][2*i] = work2[n];
+        vdz0_brick[k][j][2*i+1] = 0.;
+        n += 2;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   precompute rho coefficients as a lookup table to save time in make_rho
+   and fieldforce.  Instead of evaluating the charge-assignment polynomials
+   for every atom 6 times per time step, precompute them at a fixed number
+   of sample points.
+------------------------------------------------------------------------- */
+
+void PPPMIntel::precompute_rho()
+{
+  half_rho_scale = (rho_points - 1.)/2.;
+  half_rho_scale_plus = half_rho_scale + 0.5;
+
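+  // each of the rho_points rows holds the aligned stencil weights for one
+  // sampled dx; columns past the stencil are zero-filled so vector loops
+  // can safely run over the full aligned width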
+  for (int i = 0; i < rho_points; i++) {
+    FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i;
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int k = nlower; k <= nupper; k++) {
+      FFT_SCALAR r1 = ZEROF;
+      for (int l = order-1; l >= 0; l--) {
+        r1 = rho_coeff[l][k] + r1*dx;
+      }
+      rho_lookup[i][k-nlower] = r1;
+    }
+    for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+      rho_lookup[i][k] = 0;
+    }
+    if (differentiation_flag == 1) {
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int k = nlower; k <= nupper; k++) {
+        FFT_SCALAR r1 = ZEROF;
+        for (int l = order-2; l >= 0; l--) {
+          r1 = drho_coeff[l][k] + r1*dx;
         }
+        drho_lookup[i][k-nlower] = r1;
+      }
+      for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+        drho_lookup[i][k] = 0;
       }
     }
-    ekx *= hx_inv;
-    eky *= hy_inv;
-    ekz *= hz_inv;
+  }
+}
 
-    // convert E-field to force
+/* ----------------------------------------------------------------------
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
 
-    const flt_t qfactor = fqqrd2es * q[i];
-    const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
+void PPPMIntel::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if ((flag == FORWARD_IK) && _use_packing) {
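+    // the communication lists (built for cg_pack) index the interleaved
+    // bricks, so each entry moves one value from vdxy_brick and one from
+    // vdz0_brick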
+    FFT_SCALAR *xsrc = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
+    FFT_SCALAR *zsrc = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = xsrc[list[i]];
+      buf[n++] = zsrc[list[i]];
+    }
+  } else {
+    PPPM::pack_forward(flag, buf, nlist, list);
+  }
+}
 
-    const flt_t s1 = x[i].x * hx_inv;
-    const flt_t s2 = x[i].y * hy_inv;
-    const flt_t s3 = x[i].z * hz_inv;
-    flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
-    sf += fsf_coeff1 * sin(ffour_pi * s1);
-    sf *= twoqsq;
-    f[i].x += qfactor * ekx - fqqrd2es * sf;
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
 
-    sf = fsf_coeff2 * sin(ftwo_pi * s2);
-    sf += fsf_coeff3 * sin(ffour_pi * s2);
-    sf *= twoqsq;
-    f[i].y += qfactor * eky - fqqrd2es * sf;
+void PPPMIntel::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if ((flag == FORWARD_IK) && _use_packing) {
+    FFT_SCALAR *xdest = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out];
+    FFT_SCALAR *zdest = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      xdest[list[i]] = buf[n++];
+      zdest[list[i]] = buf[n++];
+    }
+  } else {
+    PPPM::unpack_forward(flag, buf, nlist, list);
+  }
+}
 
-    sf = fsf_coeff4 * sin(ftwo_pi * s3);
-    sf += fsf_coeff5 * sin(ffour_pi * s3);
-    sf *= twoqsq;
+/* ----------------------------------------------------------------------
+   memory usage of local arrays
+------------------------------------------------------------------------- */
 
-    if (slabflag != 2) f[i].z += qfactor * ekz - fqqrd2es * sf;
+double PPPMIntel::memory_usage()
+{
+  double bytes = PPPM::memory_usage();
+  if ((comm->nthreads > 1) && !_use_lrt) {
+    bytes += (comm->nthreads - 1) * (ngrid + INTEL_P3M_ALIGNED_MAXORDER) *
+      sizeof(FFT_SCALAR);
+  }
+  if (differentiation_flag == 1) {
+    bytes += 3 * nmax * sizeof(FFT_SCALAR);
+  }
+  if (_use_table) {
+    bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
+    if (differentiation_flag == 1) {
+      bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR);
+    }
+  }
+  if (_use_packing) {
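+    // packed mode replaces the three vdx/vdy/vdz bricks of base PPPM with
+    // the two interleaved vdxy/vdz0 bricks: add the packed bricks,
+    // subtract the standard ones, and count work3 plus the pack grid comm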
+    bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1)
+               * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR);
+    bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1)
+               * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR);
+    bytes += 2 * nfft_both * sizeof(FFT_SCALAR);
+    bytes += cg_pack->memory_usage();
   }
+  return bytes;
 }
 
 /* ----------------------------------------------------------------------
@@ -640,13 +1219,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
 void PPPMIntel::pack_buffers()
 {
   fix->start_watch(TIME_PACK);
+  int packthreads;
+  if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
+  else packthreads = 1;
   #if defined(_OPENMP)
-  #pragma omp parallel default(none)
+  #pragma omp parallel if(packthreads > 1)
   #endif
   {
     int ifrom, ito, tid;
     IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
-                              comm->nthreads, 
+                              packthreads,
                               sizeof(IntelBuffers<float,double>::atom_t));
     if (fix->precision() == FixIntel::PREC_MODE_MIXED)
       fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);
diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h
index 40669a5561..89bc3998e0 100644
--- a/src/USER-INTEL/pppm_intel.h
+++ b/src/USER-INTEL/pppm_intel.h
@@ -1,4 +1,4 @@
-/* -*- c++ -*- ----------------------------------------------------------
+/* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -12,7 +12,9 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Rodrigo Canales (RWTH Aachen University)
+   Contributing authors: William McDoniel (RWTH Aachen University)
+                         Rodrigo Canales (RWTH Aachen University)
+                         Markus Hoehnerbach (RWTH Aachen University)
                          W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
@@ -36,6 +38,9 @@ class PPPMIntel : public PPPM {
   virtual ~PPPMIntel();
   virtual void init();
   virtual void compute(int, int);
+  virtual void pack_forward(int, FFT_SCALAR *, int, int *);
+  virtual void unpack_forward(int, FFT_SCALAR *, int, int *);
+  virtual double memory_usage();
   void compute_first(int, int);
   void compute_second(int, int);
   void pack_buffers();
@@ -47,18 +52,74 @@ class PPPMIntel : public PPPM {
  protected:
   FixIntel *fix;
 
+  int _use_lrt;
+  FFT_SCALAR **perthread_density;
+  FFT_SCALAR *particle_ekx;
+  FFT_SCALAR *particle_eky;
+  FFT_SCALAR *particle_ekz;
+
+  int _use_table;
+  int rho_points;
+  FFT_SCALAR **rho_lookup;
+  FFT_SCALAR **drho_lookup;
+  FFT_SCALAR half_rho_scale, half_rho_scale_plus;
+
+  int _use_packing;
+  FFT_SCALAR ***vdxy_brick;
+  FFT_SCALAR ***vdz0_brick;
+  FFT_SCALAR *work3;
+  class GridComm *cg_pack;
+
   #ifdef _LMP_INTEL_OFFLOAD
   int _use_base;
   #endif
 
+  template<class flt_t, class acc_t>
+  void test_function(IntelBuffers<flt_t,acc_t> *buffers);
+
+  void precompute_rho();
   template<class flt_t, class acc_t>
   void particle_map(IntelBuffers<flt_t,acc_t> *buffers);
-  template<class flt_t, class acc_t>
+  template<class flt_t, class acc_t, int use_table>
   void make_rho(IntelBuffers<flt_t,acc_t> *buffers);
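+  // the wrappers below map the runtime flags (_use_table, _use_packing)
+  // onto compile-time template parameters so that each code path is
+  // compiled and vectorized separately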
   template<class flt_t, class acc_t>
+  void make_rho(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      make_rho<flt_t,acc_t,1>(buffers);
+    } else {
+      make_rho<flt_t,acc_t,0>(buffers);
+    }
+  }
+  void poisson_ik_intel();
+  template<class flt_t, class acc_t, int use_table, int use_packing>
   void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers);
   template<class flt_t, class acc_t>
+  void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      if (_use_packing == 1) {
+        fieldforce_ik<flt_t, acc_t, 1, 1>(buffers);
+      } else {
+        fieldforce_ik<flt_t, acc_t, 1, 0>(buffers);
+      }
+    } else {
+      if (_use_packing == 1) {
+        fieldforce_ik<flt_t, acc_t, 0, 1>(buffers);
+      } else {
+        fieldforce_ik<flt_t, acc_t, 0, 0>(buffers);
+      }
+    }
+  }
+  template<class flt_t, class acc_t, int use_table>
   void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers);
+  template<class flt_t, class acc_t>
+  void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) {
+    if (_use_table == 1) {
+      fieldforce_ad<flt_t,acc_t,1>(buffers);
+    } else {
+      fieldforce_ad<flt_t,acc_t,0>(buffers);
+    }
+  }
 };
 
 }
diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index afb7852f98..b44870e9b0 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -78,17 +78,17 @@ void VerletLRTIntel::init()
    setup before run
 ------------------------------------------------------------------------- */
 
-void VerletLRTIntel::setup()
+void VerletLRTIntel::setup(int flag)
 {
   if (_intel_kspace == 0) {
-    Verlet::setup();
+    Verlet::setup(flag);
     return;
   } 
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (_intel_kspace->use_base()) {
     _intel_kspace = 0;
-    Verlet::setup();
+    Verlet::setup(flag);
     return;
   }
   #endif
diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h
index a699c20796..0521b161c7 100644
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@@ -42,7 +42,7 @@ class VerletLRTIntel : public Verlet {
   VerletLRTIntel(class LAMMPS *, int, char **);
   virtual ~VerletLRTIntel();
   virtual void init();
-  virtual void setup();
+  virtual void setup(int flag = 1);
   virtual void run(int);
 
  protected:
diff --git a/src/atom.cpp b/src/atom.cpp
index 6fa1cd8ef8..df4db0a842 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -40,6 +40,10 @@
 #include "memory.h"
 #include "error.h"
 
+#ifdef LMP_USER_INTEL
+#include "neigh_request.h"
+#endif
+
 using namespace LAMMPS_NS;
 using namespace MathConst;
 
@@ -1882,6 +1886,53 @@ void Atom::setup_sort_bins()
   bininvy = nbiny / (bboxhi[1]-bboxlo[1]);
   bininvz = nbinz / (bboxhi[2]-bboxlo[2]);
 
+  #ifdef LMP_USER_INTEL
+  int intel_neigh = 0;
+  if (neighbor->nrequest) {
+    if (neighbor->requests[0]->intel) intel_neigh = 1;
+  } else if (neighbor->old_nrequest)
+    if (neighbor->old_requests[0]->intel) intel_neigh = 1;
+  if (intel_neigh && userbinsize == 0.0) {
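+    // the Intel neighbor build is in use and the user did not set a sort
+    // bin size: derive the sorting bins from the neighbor binning so the
+    // two grids are commensurate, which should improve data locality for
+    // the vectorized neighbor lists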
+    if (neighbor->binsizeflag) bininv = 1.0/neighbor->binsize_user;
+
+    double nx_low = neighbor->bboxlo[0];
+    double ny_low = neighbor->bboxlo[1];
+    double nz_low = neighbor->bboxlo[2];
+    double nxbbox = neighbor->bboxhi[0] - nx_low;
+    double nybbox = neighbor->bboxhi[1] - ny_low;
+    double nzbbox = neighbor->bboxhi[2] - nz_low;
+    int nnbinx = static_cast<int> (nxbbox * bininv);
+    int nnbiny = static_cast<int> (nybbox * bininv);
+    int nnbinz = static_cast<int> (nzbbox * bininv);
+    if (domain->dimension == 2) nnbinz = 1;
+
+    if (nnbinx == 0) nnbinx = 1;
+    if (nnbiny == 0) nnbiny = 1;
+    if (nnbinz == 0) nnbinz = 1;
+
+    double binsizex = nxbbox/nnbinx;
+    double binsizey = nybbox/nnbiny;
+    double binsizez = nzbbox/nnbinz;
+
+    bininvx = 1.0 / binsizex;
+    bininvy = 1.0 / binsizey;
+    bininvz = 1.0 / binsizez;
+
+    int lxo = (bboxlo[0] - nx_low) * bininvx;
+    int lyo = (bboxlo[1] - ny_low) * bininvy;
+    int lzo = (bboxlo[2] - nz_low) * bininvz;
+    bboxlo[0] = nx_low + static_cast<double>(lxo) / bininvx;
+    bboxlo[1] = ny_low + static_cast<double>(lyo) / bininvy;
+    bboxlo[2] = nz_low + static_cast<double>(lzo) / bininvz;
+    nbinx = static_cast<int>((bboxhi[0] - bboxlo[0]) * bininvx) + 1;
+    nbiny = static_cast<int>((bboxhi[1] - bboxlo[1]) * bininvy) + 1;
+    nbinz = static_cast<int>((bboxhi[2] - bboxlo[2]) * bininvz) + 1;
+    bboxhi[0] = bboxlo[0] + static_cast<double>(nbinx) / bininvx;
+    bboxhi[1] = bboxlo[1] + static_cast<double>(nbiny) / bininvy;
+    bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz;
+  }
+  #endif
+
   if (1.0*nbinx*nbiny*nbinz > INT_MAX)
     error->one(FLERR,"Too many atom sorting bins");
 
-- 
GitLab