From 9b9f6d6fe2f4559c7123a017171b2698bd3837d5 Mon Sep 17 00:00:00 2001
From: Steve Plimpton <sjplimp@sandia.gov>
Date: Fri, 16 Jun 2017 16:56:28 -0600
Subject: [PATCH] USER-INTEL upgrade from M Brown

---
 doc/src/JPG/user_intel.png | Bin 14684 -> 14487 bytes
 doc/src/accelerate_intel.txt | 112 +-
 doc/src/fix_neb.txt | 261 +-
 doc/src/kspace_modify.txt | 3 +-
 doc/src/kspace_style.txt | 4 +
 doc/src/pair_lj_long.txt | 1 +
 examples/neb/in.neb.hop1 | 2 +-
 .../{in.neb.hop1freeend => in.neb.hop1.end} | 4 +-
 .../{initial.hop1freeend => initial.hop1.end} | 0
 src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi | 2 +-
 .../OPTIONS/Makefile.intel_knl_coprocessor | 2 +-
 src/MAKE/OPTIONS/Makefile.knl | 2 +-
 src/REPLICA/fix_neb.cpp | 188 +-
 src/USER-INTEL/README | 13 +-
 src/USER-INTEL/TEST/README | 57 +-
 src/USER-INTEL/TEST/in.intel.eam | 3 +
 src/USER-INTEL/TEST/in.intel.lc | 2 +
 src/USER-INTEL/TEST/in.intel.lj | 2 +
 src/USER-INTEL/TEST/in.intel.rhodo | 4 +-
 src/USER-INTEL/TEST/in.intel.sw | 2 +
 src/USER-INTEL/TEST/in.intel.tersoff | 2 +
 src/USER-INTEL/TEST/in.intel.water | 2 +
 src/USER-INTEL/TEST/in.lc_generate_restart | 12 +-
 src/USER-INTEL/TEST/run_benchmarks.sh | 86 +
 src/USER-INTEL/angle_charmm_intel.cpp | 113 +-
 src/USER-INTEL/angle_harmonic_intel.cpp | 113 +-
 src/USER-INTEL/bond_fene_intel.cpp | 101 +-
 src/USER-INTEL/bond_harmonic_intel.cpp | 98 +-
 src/USER-INTEL/dihedral_charmm_intel.cpp | 270 +-
 src/USER-INTEL/dihedral_harmonic_intel.cpp | 88 +-
 src/USER-INTEL/dihedral_opls_intel.cpp | 88 +-
 src/USER-INTEL/fix_intel.cpp | 166 +-
 src/USER-INTEL/fix_intel.h | 26 +-
 src/USER-INTEL/improper_cvff_intel.cpp | 176 +-
 src/USER-INTEL/improper_harmonic_intel.cpp | 121 +-
 src/USER-INTEL/intel_buffers.cpp | 132 +-
 src/USER-INTEL/intel_buffers.h | 25 +-
 src/USER-INTEL/intel_preprocess.h | 548 ++-
 src/USER-INTEL/intel_simd.h | 8 +-
 src/USER-INTEL/nbin_intel.cpp | 20 +-
 src/USER-INTEL/npair_full_bin_intel.cpp | 505 +--
 src/USER-INTEL/npair_full_bin_intel.h | 5 +-
 .../npair_half_bin_newtoff_intel.cpp | 451 ---
 src/USER-INTEL/npair_half_bin_newtoff_intel.h | 52 -
 .../npair_half_bin_newton_intel.cpp | 534 +--
 src/USER-INTEL/npair_half_bin_newton_intel.h | 3 -
 .../npair_half_bin_newton_tri_intel.cpp | 435 +--
 .../npair_half_bin_newton_tri_intel.h | 3 -
 src/USER-INTEL/npair_intel.cpp | 872 ++++-
 src/USER-INTEL/npair_intel.h | 8 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.cpp | 195 +-
 src/USER-INTEL/pair_buck_coul_cut_intel.h | 2 +-
 src/USER-INTEL/pair_buck_coul_long_intel.cpp | 364 +-
 src/USER-INTEL/pair_buck_coul_long_intel.h | 4 +-
 src/USER-INTEL/pair_buck_intel.cpp | 199 +-
 src/USER-INTEL/pair_buck_intel.h | 2 +-
 src/USER-INTEL/pair_eam_intel.cpp | 451 +--
 src/USER-INTEL/pair_eam_intel.h | 4 +-
 src/USER-INTEL/pair_gayberne_intel.cpp | 273 +-
 src/USER-INTEL/pair_gayberne_intel.h | 2 +-
 .../pair_lj_charmm_coul_long_intel.cpp | 209 +-
 .../pair_lj_charmm_coul_long_intel.h | 2 +-
 .../pair_lj_cut_coul_long_intel.cpp | 375 +-
 src/USER-INTEL/pair_lj_cut_coul_long_intel.h | 4 +-
 src/USER-INTEL/pair_lj_cut_intel.cpp | 250 +-
 src/USER-INTEL/pair_lj_cut_intel.h | 3 +-
 .../pair_lj_long_coul_long_intel.cpp | 50 +
 src/USER-INTEL/pair_lj_long_coul_long_intel.h | 39 +
 src/USER-INTEL/pair_sw_intel.cpp | 371 +-
 src/USER-INTEL/pair_sw_intel.h | 2 +-
 src/USER-INTEL/pair_tersoff_intel.cpp | 223 +-
 src/USER-INTEL/pair_tersoff_intel.h | 2 +-
 src/USER-INTEL/pppm_disp_intel.cpp | 3034 +++++++++++++++++
 src/USER-INTEL/pppm_disp_intel.h | 238 ++
 src/USER-INTEL/pppm_intel.cpp | 1072 ++++--
 src/USER-INTEL/pppm_intel.h | 67 +-
 src/USER-INTEL/verlet_lrt_intel.cpp | 6 +-
 src/USER-INTEL/verlet_lrt_intel.h | 2 +-
 src/atom.cpp | 51 +
 79 files changed, 8553 insertions(+), 4670 deletions(-)
 rename examples/neb/{in.neb.hop1freeend => in.neb.hop1.end} (91%)
 rename examples/neb/{initial.hop1freeend => initial.hop1.end} (100%)
 create mode 100755 src/USER-INTEL/TEST/run_benchmarks.sh
 delete mode 100644 src/USER-INTEL/npair_half_bin_newtoff_intel.cpp
 delete mode 100644 src/USER-INTEL/npair_half_bin_newtoff_intel.h
 create mode 100644 src/USER-INTEL/pair_lj_long_coul_long_intel.cpp
 create mode 100644 src/USER-INTEL/pair_lj_long_coul_long_intel.h
 create mode 100644 src/USER-INTEL/pppm_disp_intel.cpp
 create mode 100644 src/USER-INTEL/pppm_disp_intel.h

diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 0ebb2d1ae08cdd8ddd0d150f29d0da7b12e5520d..302b50124a0429d0f64df1a9979a5265051f8112 100755
GIT binary patch
(binary image data for user_intel.png omitted)
zliX%PSlce`meIgQi>aF3-t4;PFXYT-wJUJ1$jYzbuI3n8#s+@05gGG|^bP;65xAM^ z9TuAD?R$=f%Ob!f9%d7D0q|I@x6m4SL~IaVp)vQ`4-tm06Hi{|a1|?_b`KkvUcnSz zbC`&J8gtW4<s5wdRU>4QJzgs*?Y*n*D2)b0Cq*8?pmgDNOPfo=s%t_{$K;sMaik<$ zUdimcZYi$)o0L*@&f?m2)<g^4!Ct06zPCE8(!aii`G26H{YfULVmH(pV4cysU##=d zBuRU~_hZ0EhyTm}lDnkNx#xZYc&&MJ22deN0(Pg>=OwWOUIj_tmy^gA)A#&8iV*2b diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt index d629828f12..ed9e4ae833 100644 --- a/doc/src/accelerate_intel.txt +++ b/doc/src/accelerate_intel.txt @@ -30,8 +30,8 @@ Dihedral Styles: charmm, harmonic, opls :l Fixes: nve, npt, nvt, nvt/sllod :l Improper Styles: cvff, harmonic :l Pair Styles: buck/coul/cut, buck/coul/long, buck, eam, gayberne, -charmm/coul/long, lj/cut, lj/cut/coul/long, sw, tersoff :l -K-Space Styles: pppm :l +charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, sw, tersoff :l +K-Space Styles: pppm, pppm/disp :l :ule [Speed-ups to expect:] @@ -42,62 +42,88 @@ precision mode. Performance improvements are shown compared to LAMMPS {without using other acceleration packages} as these are under active development (and subject to performance changes). The measurements were performed using the input files available in -the src/USER-INTEL/TEST directory. These are scalable in size; the -results given are with 512K particles (524K for Liquid Crystal). -Most of the simulations are standard LAMMPS benchmarks (indicated -by the filename extension in parenthesis) with modifications to the -run length and to add a warmup run (for use with offload -benchmarks). +the src/USER-INTEL/TEST directory with the provided run script. +These are scalable in size; the results given are with 512K +particles (524K for Liquid Crystal). Most of the simulations are +standard LAMMPS benchmarks (indicated by the filename extension in +parenthesis) with modifications to the run length and to add a +warmup run (for use with offload benchmarks). :c,image(JPG/user_intel.png) Results are speedups obtained on Intel Xeon E5-2697v4 processors (code-named Broadwell) and Intel Xeon Phi 7250 processors -(code-named Knights Landing) with "18 Jun 2016" LAMMPS built with -Intel Parallel Studio 2016 update 3. Results are with 1 MPI task +(code-named Knights Landing) with "June 2017" LAMMPS built with +Intel Parallel Studio 2017 update 2. Results are with 1 MPI task per physical core. See {src/USER-INTEL/TEST/README} for the raw simulation rates and instructions to reproduce. :line +[Accuracy and order of operations:] + +In most molecular dynamics software, parallelization parameters +(# of MPI, OpenMP, and vectorization) can change the results due +to changing the order of operations with finite-precision +calculations. The USER-INTEL package is deterministic. This means +that the results should be reproducible from run to run with the +{same} parallel configurations and when using determinstic +libraries or library settings (MPI, OpenMP, FFT). However, there +are differences in the USER-INTEL package that can change the +order of operations compared to LAMMPS without acceleration: + +Neighbor lists can be created in a different order :ulb,l +Bins used for sorting atoms can be oriented differently :l +The default stencil order for PPPM is 7. 
+calculate other PPPM parameters to fit the desired accuracy with
+this order :l
+The {newton} setting applies to all atoms, not just atoms shared
+between MPI tasks :l
+Vectorization can change the order for adding pairwise forces :l
+:ule
+
+The precision mode (described below) used with the USER-INTEL
+package can change the {accuracy} of the calculations. For the
+default {mixed} precision option, calculations between pairs or
+triplets of atoms are performed in single precision, intended to
+be within the inherent error of MD simulations. All accumulation
+is performed in double precision to prevent the error from growing
+with the number of atoms in the simulation. {Single} precision
+mode should not be used without appropriate validation.
+
+:line
+
 [Quick Start for Experienced Users:]

 LAMMPS should be built with the USER-INTEL package installed.
 Simulations should be run with 1 MPI task per physical {core},
 not {hardware thread}.

-For Intel Xeon CPUs:
-
 Edit src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi as necessary. :ulb,l
-If using {kspace_style pppm} in the input script, add "neigh_modify
-binsize cutoff" and "kspace_modify diff ad" to the input script for better
-performance. Cutoff should be roughly the neighbor list cutoff. By
-default the binsize is half the neighbor list cutoff. :l
-"-pk intel 0 omp 2 -sf intel" added to LAMMPS command-line :l
+Set the environment variable KMP_BLOCKTIME=0 :l
+"-pk intel 0 omp $t -sf intel" added to LAMMPS command-line :l
+$t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
+For some of the simple 2-body potentials without long-range
+electrostatics, performance and scalability can be better with
+the "newton off" setting added to the input script :l
+If using {kspace_style pppm} in the input script, add
+"kspace_modify diff ad" for better performance :l
 :ule

-For Intel Xeon Phi CPUs for simulations without {kspace_style
-pppm} in the input script :
+For Intel Xeon Phi CPUs:

-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-"-pk intel 0 omp 2 -sf intel" {or} "-pk intel 0 omp 4 -sf intel"
-should be added to the LAMMPS command-line. Choice for best
-performance will depend on the simulation. :l
+Runs should be performed using MCDRAM. :ulb,l
 :ule

-For Intel Xeon Phi CPUs for simulations with {kspace_style
-pppm} in the input script:
-
-Edit src/MAKE/OPTIONS/Makefile.knl as necessary. :ulb,l
-Runs should be performed using MCDRAM. :l
-Add "neigh_modify binsize 3" to the input script for better
-performance. :l
-Add "kspace_modify diff ad" to the input script for better
-performance. :l
-export KMP_AFFINITY=none :l
-"-pk intel 0 omp 3 lrt yes -sf intel" or "-pk intel 0 omp 1 lrt yes
--sf intel" added to LAMMPS command-line. Choice for best performance
-will depend on the simulation. :l
+For simulations using {kspace_style pppm} on Intel CPUs
+supporting AVX-512:
+
+Add "kspace_modify diff ad" to the input script :ulb,l
+The command-line option should be changed to
+"-pk intel 0 omp $r lrt yes -sf intel" where $r is the number of
+threads minus 1. :l
+Do not use thread affinity (set KMP_AFFINITY=none) :l
+The "newton off" setting may provide better scalability :l
 :ule

 For Intel Xeon Phi coprocessors (Offload):

@@ -169,6 +195,10 @@ cat /proc/cpuinfo :pre

 [Building LAMMPS with the USER-INTEL package:]

+NOTE: See the src/USER-INTEL/README file for additional flags that
+might be needed for best performance on Intel server processors
+code-named "Skylake".
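For orientation, the quick-start flags above combine into a short
build-and-run sequence. The sketch below is illustrative only: the
Intel MPI makefile target, a node with 36 physical cores, and the
bundled in.intel.lj test input are assumptions to adapt to your
system:

cd src
make yes-user-intel
make intel_cpu_intelmpi       # builds ./lmp_intel_cpu_intelmpi
export KMP_BLOCKTIME=0        # recommended for USER-INTEL
mpirun -np 36 ./lmp_intel_cpu_intelmpi -in USER-INTEL/TEST/in.intel.lj -pk intel 0 omp 2 -sf intel :pre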
+
 The USER-INTEL package must be installed into the source directory:

 make yes-user-intel :pre

@@ -322,8 +352,8 @@ follow in the input script.

 NOTE: The USER-INTEL package will perform better with modifications
 to the input script when "PPPM"_kspace_style.html is used:
-"kspace_modify diff ad"_kspace_modify.html and "neigh_modify binsize
-3"_neigh_modify.html should be added to the input script.
+"kspace_modify diff ad"_kspace_modify.html should be added to the
+input script.

 Long-Range Thread (LRT) mode is an option to the "package
 intel"_package.html command that can improve performance when using

@@ -342,6 +372,10 @@ would normally perform best with "-pk intel 0 omp 4", instead use
 environment variable "KMP_AFFINITY=none". LRT mode is not supported
 when using offload.

+NOTE: Changing the "newton"_newton.html setting to off can improve
+performance and/or scalability for simple 2-body potentials such as
+lj/cut or when using LRT mode on processors supporting AVX-512.
+
 Not all styles are supported in the USER-INTEL package. You can mix
 the USER-INTEL package with styles from the "OPT"_accelerate_opt.html
 package or the "USER-OMP package"_accelerate_omp.html. Of course,

@@ -467,7 +501,7 @@ supported.

 Brown, W.M., Carrillo, J.-M.Y., Mishra, B., Gavhane, N., Thakker, F.M., De Kraker, A.R., Yamada, M., Ang, J.A., Plimpton, S.J., "Optimizing Classical Molecular Dynamics in LAMMPS," in Intel Xeon Phi Processor High Performance Programming: Knights Landing Edition, J. Jeffers, J. Reinders, A. Sodani, Eds. Morgan Kaufmann. :ulb,l
-Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency. 2016 International Conference for High Performance Computing. In press. :l
+Brown, W. M., Semin, A., Hebenstreit, M., Khvostov, S., Raman, K., Plimpton, S.J. "Increasing Molecular Dynamics Simulation Rates with an 8-Fold Increase in Electrical Power Efficiency."_http://dl.acm.org/citation.cfm?id=3014915 2016 High Performance Computing, Networking, Storage and Analysis, SC16: International Conference (pp. 82-95). :l
 Brown, W.M., Carrillo, J.-M.Y., Gavhane, N., Thakkar, F.M., Plimpton, S.J. Optimizing Legacy Molecular Dynamics Software with Directive-Based Offload. Computer Physics Communications. 2015. 195: p. 95-101. :l
 :ule

diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt
index 94c6ee84fd..7382e6024d 100644
--- a/doc/src/fix_neb.txt
+++ b/doc/src/fix_neb.txt
@@ -14,152 +14,178 @@ fix ID group-ID neb Kspring keyword value :pre

 ID, group-ID are documented in "fix"_fix.html command :ulb,l
 neb = style name of this fix command :l
-Kspring = parallel spring constant (force/distance units or force units) :l
+Kspring = parallel spring constant (force/distance units or force units, see nudge keyword) :l
 zero or more keyword/value pairs may be appended :l
-keyword = {nudg_style} or {perp} or {freend} or {freend_k_spring} :l
-  {nudg_style} value = {neigh} or {idealpos}
-    {neigh} = the parallel nudging force is calculated from the distances to neighbouring replicas (in this case, Kspring is in force/distance units)
-    {idealpos} = the parallel nudging force is proportional to the distance between the replica and its interpolated ideal position (in this case Kspring is in force units)
-  {perp} value {none} or kspring2
-    {none} = no perpendicular spring force is applied
-    {kspring2} = spring constant for the perpendicular nudging force (in force/distance units)
-  {freeend} value = {none} or {ini} or {final} or {finaleini} or {final2eini}
-    {none} = no nudging force is applied to the first and last replicas
-    {ini} = set the first replica to be a free end
-    {final} = set the last replica to be a free end
-    {finaleini} = set the last replica to be a free end and set its target energy as that of the first replica
-    {final2eini} = same as {finaleini} plus prevent intermediate replicas to have a lower energy than the first replica
-  {freeend_kspring} value = kspring3
-    kspring3 = spring constant of the perpendicular spring force (per distance units)
- :pre
+keyword = {nudge} or {perp} or {end} :l
+  {nudge} value = {neigh} or {ideal}
+    {neigh} = parallel nudging force based on distance to neighbor replicas (Kspring = force/distance units)
+    {ideal} = parallel nudging force based on interpolated ideal position (Kspring = force units)
+  {perp} value = {Kspring2}
+    {Kspring2} = spring constant for perpendicular nudging force (force/distance units)
+  {end} values = estyle Kspring3
+    {estyle} = {first} or {last} or {last/efirst} or {last/efirst/middle}
+      {first} = apply force to first replica
+      {last} = apply force to last replica
+      {last/efirst} = apply force to last replica and set its target energy to that of first replica
+      {last/efirst/middle} = same as {last/efirst} plus prevent middle replicas having lower energy than first replica
+    {Kspring3} = spring constant for target energy term (1/distance units) :pre

 [Examples:]

 fix 1 active neb 10.0
-fix 2 all neb 1.0 perp 1.0 freeend final
-fix 1 all neb 1.0 nudg_style idealpos freeend final2eini freend_kspring 1:pre
+fix 2 all neb 1.0 perp 1.0 end last
+fix 2 all neb 1.0 perp 1.0 end first end last
+fix 1 all neb 1.0 nudge ideal end last/efirst 1 :pre

 [Description:]

-Add a nudging force to atoms in the group for a multi-replica
+Add nudging forces to atoms in the group for a multi-replica
 simulation run via the "neb"_neb.html command to perform a nudged
 elastic band (NEB) calculation for finding the transition state.
 Hi-level explanations of NEB are given with the "neb"_neb.html command
 and in "Section_howto 5"_Section_howto.html#howto_5 of the manual.

 The fix neb command must be used with the "neb" command and defines
-how nudging inter-replica forces are computed. A NEB calculation is
+how inter-replica nudging forces are computed. A NEB calculation is
 divided in two stages. In the first stage n replicas are relaxed
-toward a MEP and in a second stage, the climbing image scheme (see
-"(Henkelman2)"_#Henkelman2) is turned on so that the replica having
-the highest energy relaxes toward the saddle point (i.e. the point of
-highest energy along the MEP).
+toward a MEP until convergence. In the second stage, the climbing
+image scheme (see "(Henkelman2)"_#Henkelman2) is enabled, so that the
+replica having the highest energy relaxes toward the saddle point
+(i.e. the point of highest energy along the MEP), and a second
+relaxation is performed.

-One purpose of the nudging forces is to keep the replicas equally
-spaced. During the NEB, the 3N-length vector of interatomic force Fi
-= -Grad(V) of replicas i is altered. For all intermediate replicas
-(i.e. for 1<i<n) but the climbing replica the force vector
-becomes:
+A key purpose of the nudging forces is to keep the replicas equally
+spaced. During the NEB calculation, the 3N-length vector of
+interatomic force Fi = -Grad(V) for each replica I is altered. For
+all intermediate replicas (i.e. for 1 < I < N, except the climbing
+replica) the force vector becomes:

-Fi = -Grad(V) + (Grad(V) dot That) That + Fnudgparallel + Fspringperp :pre
+Fi = -Grad(V) + (Grad(V) dot T') T' + Fnudge_parallel + Fspring_perp :pre

-That is the unit "tangent" vector for replica i and is a function of
-Ri, Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
-roughly in the direction of (Ri+i - Ri-1) (see the
-"(Henkelman1)"_#Henkelman1 paper for details). Ri are the atomic
-coordinates of replica i; Ri-1 and Ri+1 are the coordinates of its
-neighbor replicas. The term (Grad(V) dot That) is used to remove the
+T' is the unit "tangent" vector for replica I and is a function of Ri,
+Ri-1, Ri+1, and the potential energy of the 3 replicas; it points
+roughly in the direction of (Ri+1 - Ri-1); see the
+"(Henkelman1)"_#Henkelman1 paper for details. Ri are the atomic
+coordinates of replica I; Ri-1 and Ri+1 are the coordinates of its
+neighbor replicas. The term (Grad(V) dot T') is used to remove the
 component of the gradient parallel to the path which would tend to
-distribute the replica unevenly along the path. Fnudgparallel is an
-artificial nudging force which is applied only in the tangent direction
-and which maintains the replicas equally spaced (see below for more
-information). Fspringperp is an optinal artificial spring which is
-applied only perpendicular to the tangent and which prevent the paths
-from forming too acute kinks (see below for more information).
-
-The keyword {nudg_style} allow to specify how to parallel
-nudging force is computed. With a value of idealpos, the spring
-force is computed as suggested in "(E)"_#E :
-
-Fnudgparallel=-{Kspring}* (RD-RDideal)/(2 meanDist) :pre
-
-where RD is the "reaction coordinate" see "neb"_neb.html section, and
-RDideal is the ideal RD for which all the images are equally spaced
-(i.e. RDideal = (i-1)*meanDist when the climbing image is off, where i
-is the replica number). The meanDist is the average distance between
-replicas.
+distribute the replica unevenly along the path. Fnudge_parallel is an
+artificial nudging force which is applied only in the tangent
+direction and which maintains the equal spacing between replicas (see
+below for more information). Fspring_perp is an optional artificial
+spring which is applied only perpendicular to the tangent and which
+prevents the paths from forming acute kinks (see below for more
+information).
+
+In the second stage of the NEB calculation, the interatomic force Fi
+for the climbing replica (the replica of highest energy after the
+first stage) is changed to:
+
+Fi = -Grad(V) + 2 (Grad(V) dot T') T' :pre
+
+and the relaxation procedure is continued to a new converged MEP.

 :line

-When {nudg_style} has a value of neigh (or by default), the parallel
-nudging force is computed as in "(Henkelman1)"_#Henkelman1 by
-connecting each intermediate replica with the previous and the next
-image:
-
-Fnudgparallel= {Kspring}* (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
-
-The parallel nudging force associated with the key word idealpos should
-usually be more efficient at keeping the images equally spaced.
+The keyword {nudge} specifies how the parallel nudging force is
+computed. With a value of {neigh}, the parallel nudging force is
+computed as in "(Henkelman1)"_#Henkelman1 by connecting each
+intermediate replica with the previous and the next image:
+
+Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
+
+Note that in this case the specified {Kspring} is in force/distance
+units.
+
+With a value of {ideal}, the spring force is computed as suggested in
+"(WeinenE)"_#WeinenE :
+
+Fnudge_parallel = -{Kspring} * (RD-RDideal) / (2 * meanDist) :pre
+
+where RD is the "reaction coordinate"; see the "neb"_neb.html page.
+RDideal is the ideal RD for which all the images are equally spaced,
+i.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
+I is the replica number. The meanDist is the average distance
+between replicas. Note that in this case the specified {Kspring} is
+in force units.
+
+Note that the {ideal} form of nudging can often be more effective at
+keeping the replicas equally spaced.

 :line

-The keyword {perp} allows to add a spring force perpendicular to the
-path in order to prevent the path from becoming too kinky. It can
-improve significantly the convergence of the NEB when the resolution
-is poor (i.e. when too few images are used) (see "(Maras)"_#Maras1).
-The perpendicular spring force is given by
-
-Fspringperp = {Kspringperp} * f(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
-
-f(Ri-1 Ri R+1) is a smooth scalar function of the angle Ri-1 Ri
-Ri+1. It is equal to 0 when the path is straight and is equal to 1
-when the angle Ri-1 Ri Ri+1 is accute. f(Ri-1 Ri R+1) is defined in
-"(Jonsson)"_#Jonsson
+The keyword {perp} adds a spring force perpendicular to the path in
+order to prevent the path from becoming too kinky. It can
+significantly improve the convergence of the NEB calculation when
+the resolution is poor, i.e. when too few replicas are used; see
+"(Maras)"_#Maras1 for details.
+
+The perpendicular spring force is given by
+
+Fspring_perp = {Kspring2} * F(Ri-1,Ri,Ri+1) (Ri+1 + Ri-1 - 2 Ri) :pre
+
+where {Kspring2} is the specified value. F(Ri-1 Ri Ri+1) is a smooth
+scalar function of the angle Ri-1 Ri Ri+1. It is equal to 0.0 when
+the path is straight and is equal to 1 when the angle Ri-1 Ri Ri+1 is
+acute. F(Ri-1 Ri Ri+1) is defined in "(Jonsson)"_#Jonsson.
+If {Kspring2} is set to 0.0 (the default) then no perpendicular spring
+force is added.

 :line

-By default, the force acting on the first and last replicas is not
-altered so that during the NEB relaxation, these ending replicas relax
-toward local minima. However it is possible to use the key word
-{freeend} to allow either the initial or the final replica to relax
-toward a MEP while constraining its energy. The interatomic force Fi
-for the free end image becomes :
-
-Fi = -Grad(V)+ (Grad(V) dot That + (E-ETarget)*kspring3) That, {when} Grad(V) dot That < 0
-Fi = -Grad(V)+ (Grad(V) dot That + (ETarget- E)*kspring3) That, {when} Grad(V) dot That > 0
-:pre
-
-where E is the energy of the free end replica and ETarget is the
-target energy.
-
-When the value {ini} ({final}) is used after the keyword {freeend},
-the first (last) replica is considered as a free end. The target
-energy is set to the energy of the replica at starting of the NEB
-calculation. When the value {finaleini} or {final2eini} is used the
-last image is considered as a free end and the target energy is equal
-to the energy of the first replica (which can evolve during the NEB
-relaxation). With the value {finaleini}, when the initial path is too
-far from the MEP, an intermediate repilica might relax "faster" and
-get a lower energy than the last replica. The benefit of the free end
-is then lost since this intermediate replica will relax toward a local
-minima. This behavior can be prevented by using the value {final2eini}
-which remove entirely the contribution of the gradient for all
-intermediate replica which have a lower energy than the initial one
-thus preventing these replicae to over-relax. After converging a NEB
-with the {final2eini} value it is recommended to check that all
-intermediate replica have a larger energy than the initial
-replica. Finally note that if the last replica converges toward a
-local minimum with a larger energy than the energy of the first
-replica, a free end neb calculation with the value {finaleini} or
-{final2eini} cannot reach the convergence criteria.
-
-:line
-
-In the second stage of the NEB, the interatomic force Fi for the
-climbing replica (which is the replica of highest energy) becomes:
-
-Fi = -Grad(V) + 2 (Grad(V) dot That) That :pre
+By default, no forces act on the first and last replicas during the
+NEB relaxation, so these replicas simply relax toward their respective
+local minima. By using the keyword {end}, additional forces can be
+applied to the first or last replica, to enable them to relax toward a
+MEP while constraining their energy.
+
+The interatomic force Fi for the specified replica becomes:
+
+Fi = -Grad(V) + (Grad(V) dot T' + (E-ETarget)*Kspring3) T', {when} Grad(V) dot T' < 0
+Fi = -Grad(V) + (Grad(V) dot T' + (ETarget - E)*Kspring3) T', {when} Grad(V) dot T' > 0
+:pre
+
+where E is the current energy of the replica and ETarget is the target
+energy. The "spring" constant on the difference in energies is the
+specified {Kspring3} value.
+
+When {estyle} is specified as {first}, the force is applied to the
+first replica. When {estyle} is specified as {last}, the force is
+applied to the last replica. Note that the {end} keyword can be used
+twice to add forces to both the first and last replicas.
+
+For both these {estyle} settings, the target energy {ETarget} is set
+to the initial energy of the replica (at the start of the NEB
+calculation).
+
+If the {estyle} is specified as {last/efirst} or {last/efirst/middle},
+force is applied to the last replica, but the target energy {ETarget}
+is continuously set to the energy of the first replica, as it evolves
+during the NEB relaxation.
+
+The difference between these two {estyle} options is as follows.
+When {estyle} is specified as {last/efirst}, no change is made to the
+inter-replica force applied to the intermediate replicas (neither
+first nor last). If the initial path is too far from the MEP, an
+intermediate replica may relax "faster" and reach a lower energy than
+the last replica. In this case the intermediate replica will be
+relaxing toward its own local minimum. This behavior can be prevented
+by specifying {estyle} as {last/efirst/middle} which will alter the
+inter-replica force applied to intermediate replicas by removing the
+contribution of the gradient to the inter-replica force. This will
+only be done if a particular intermediate replica has a lower energy
+than the first replica. This should effectively prevent the
+intermediate replicas from over-relaxing.
+
+After converging a NEB calculation using an {estyle} of {last/efirst},
+you should check that all intermediate replicas have a larger energy
+than the first replica. If not, then repeat the calculation with an
+{estyle} of {last/efirst/middle}.
+
+Finally, note that if the last replica converges toward a local
+minimum which has a larger energy than the energy of the first
+replica, a NEB calculation using an {estyle} of {last/efirst} or
+{last/efirst/middle} cannot reach final convergence.

 [Restart, fix_modify, output, run start/stop, minimize info:]

@@ -186,7 +212,8 @@ for more info on packages.

 [Default:]

-The option defaults are nudg_style = neigh, perp = none, freeend = none and freend_kspring = 1.
+The option defaults are nudge = neigh, perp = 0.0, end is not
+specified (no inter-replica force on the end replicas).

 :line

 [(Henkelman2)] Henkelman, Uberuaga, Jonsson, J Chem Phys, 113,
 9901-9904 (2000).

-:link(E)
-[(E)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002)
+:link(WeinenE)
+[(WeinenE)] E, Ren, Vanden-Eijnden, Phys Rev B, 66, 052301 (2002).

 :link(Jonsson)
 [(Jonsson)] Jonsson, Mills and Jacobsen, in Classical and Quantum
-Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti, and Coker
-World Scientific, Singapore, 1998, p. 385
+Dynamics in Condensed Phase Simulations, edited by Berne, Ciccotti,
+and Coker World Scientific, Singapore, 1998, p 385.

 :link(Maras1)
 [(Maras)] Maras, Trushin, Stukowski, Ala-Nissila, Jonsson,
-Comp Phys Comm, 205, 13-21 (2016)
+Comp Phys Comm, 205, 13-21 (2016).

diff --git a/doc/src/kspace_modify.txt b/doc/src/kspace_modify.txt
index b488df9627..66091f4973 100644
--- a/doc/src/kspace_modify.txt
+++ b/doc/src/kspace_modify.txt
@@ -308,7 +308,8 @@ The option defaults are mesh = mesh/disp = 0 0 0, order = order/disp =
 gewald = gewald/disp = 0.0, slab = 1.0, compute = yes, cutoff/adjust =
 yes (MSM), pressure/scalar = yes (MSM), fftbench = yes (PPPM), diff = ik
 (PPPM), mix/disp = pair, force/disp/real = -1.0, force/disp/kspace = -1.0,
-split = 0, tol = 1.0e-6, and disp/auto = no.
+split = 0, tol = 1.0e-6, and disp/auto = no. For pppm/intel, order =
+order/disp = 7.
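As a minimal input-script sketch of how these styles are selected
(the accuracy target of 1.0e-4 is illustrative only):

kspace_style pppm/intel 1.0e-4
kspace_modify diff ad :pre

Alternatively, running with "-sf intel" on the command line converts
a plain "kspace_style pppm" to pppm/intel automatically.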
:line diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt index 371540bd68..4f27c9aa78 100644 --- a/doc/src/kspace_style.txt +++ b/doc/src/kspace_style.txt @@ -33,12 +33,16 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg} accuracy = desired relative error in forces {pppm/gpu} value = accuracy accuracy = desired relative error in forces + {pppm/intel} value = accuracy + accuracy = desired relative error in forces {pppm/kk} value = accuracy accuracy = desired relative error in forces {pppm/omp} value = accuracy accuracy = desired relative error in forces {pppm/cg/omp} value = accuracy accuracy = desired relative error in forces + {pppm/disp/intel} value = accuracy + accuracy = desired relative error in forces {pppm/tip4p/omp} value = accuracy accuracy = desired relative error in forces {pppm/stagger} value = accuracy diff --git a/doc/src/pair_lj_long.txt b/doc/src/pair_lj_long.txt index d559871f9d..da9f37b9c3 100644 --- a/doc/src/pair_lj_long.txt +++ b/doc/src/pair_lj_long.txt @@ -7,6 +7,7 @@ :line pair_style lj/long/coul/long command :h3 +pair_style lj/long/coul/long/intel command :h3 pair_style lj/long/coul/long/omp command :h3 pair_style lj/long/coul/long/opt command :h3 pair_style lj/long/tip4p/long command :h3 diff --git a/examples/neb/in.neb.hop1 b/examples/neb/in.neb.hop1 index b874d1ba32..9b5dcb07ec 100644 --- a/examples/neb/in.neb.hop1 +++ b/examples/neb/in.neb.hop1 @@ -51,7 +51,7 @@ set group nebatoms type 3 group nonneb subtract all nebatoms fix 1 lower setforce 0.0 0.0 0.0 -fix 2 nebatoms neb 1.0 nudg_style idealpos +fix 2 nebatoms neb 1.0 #nudge ideal fix 3 all enforce2d thermo 100 diff --git a/examples/neb/in.neb.hop1freeend b/examples/neb/in.neb.hop1.end similarity index 91% rename from examples/neb/in.neb.hop1freeend rename to examples/neb/in.neb.hop1.end index fa90e9a98c..2f4ba526d8 100644 --- a/examples/neb/in.neb.hop1freeend +++ b/examples/neb/in.neb.hop1.end @@ -15,7 +15,7 @@ variable u uloop 20 lattice hex 0.9 region box block 0 20 0 10 -0.25 0.25 -read_data initial.hop1freeend +read_data initial.hop1.end # LJ potentials @@ -41,7 +41,7 @@ set group nebatoms type 3 group nonneb subtract all nebatoms fix 1 lower setforce 0.0 0.0 0.0 -fix 2 nebatoms neb 1.0 nudg_style idealpos freeend ini +fix 2 nebatoms neb 1.0 nudge ideal end first 1.0 fix 3 all enforce2d thermo 100 diff --git a/examples/neb/initial.hop1freeend b/examples/neb/initial.hop1.end similarity index 100% rename from examples/neb/initial.hop1freeend rename to examples/neb/initial.hop1.end diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi index 2cb37ed9fe..ac8279949a 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi +++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ +CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor index b7f3cd6846..db5de83a06 100644 --- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor +++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc MIC_OPT = -qoffload-arch=mic-avx512 -fp-model fast=2 -CCFLAGS = -g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ +CCFLAGS = -O3 -qopenmp 
-DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \ -xHost -fno-alias -ansi-alias -restrict \ -qoverride-limits $(MIC_OPT) SHFLAGS = -fPIC diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl index 3bc777592e..881c51f0e4 100644 --- a/src/MAKE/OPTIONS/Makefile.knl +++ b/src/MAKE/OPTIONS/Makefile.knl @@ -8,7 +8,7 @@ SHELL = /bin/sh CC = mpiicpc OPTFLAGS = -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -CCFLAGS = -g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \ +CCFLAGS = -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \ -fno-alias -ansi-alias -restrict $(OPTFLAGS) SHFLAGS = -fPIC DEPFLAGS = -M diff --git a/src/REPLICA/fix_neb.cpp b/src/REPLICA/fix_neb.cpp index b17315ca0d..297c101234 100644 --- a/src/REPLICA/fix_neb.cpp +++ b/src/REPLICA/fix_neb.cpp @@ -34,6 +34,9 @@ using namespace FixConst; using namespace MathConst; enum{SINGLE_PROC_DIRECT,SINGLE_PROC_MAP,MULTI_PROC}; + +#define BUFSIZE 8 + /* ---------------------------------------------------------------------- */ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : @@ -45,56 +48,62 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : tagsendall(NULL), tagrecvall(NULL), counts(NULL), displacements(NULL) { + if (narg < 4) error->all(FLERR,"Illegal fix neb command"); - NEBLongRange=false; - StandardNEB=true; - PerpSpring=FreeEndIni=FreeEndFinal=false; - FreeEndFinalWithRespToEIni=FinalAndInterWithRespToEIni=false; + kspring = force->numeric(FLERR,arg[3]); + if (kspring <= 0.0) error->all(FLERR,"Illegal fix neb command"); - kspringPerp=0.0; - kspring2=1.0; - if (narg < 4) - error->all(FLERR,"Illegal fix neb command, argument missing"); + // optional params - kspring = force->numeric(FLERR,arg[3]); - if (kspring <= 0.0) - error->all(FLERR,"Illegal fix neb command." - " The spring force was not provided properly"); + NEBLongRange = false; + StandardNEB = true; + PerpSpring = FreeEndIni = FreeEndFinal = false; + FreeEndFinalWithRespToEIni = FinalAndInterWithRespToEIni = false; + kspringPerp = 0.0; + kspring2 = 1.0; - int iarg =4; + int iarg = 4; while (iarg < narg) { - if (strcmp (arg[iarg],"nudg_style")==0) { - if (strcmp (arg[iarg+1],"idealpos")==0) { - NEBLongRange = true; - iarg+=2;} - else if (strcmp (arg[iarg+1],"neigh")==0) { - NEBLongRange = false; - StandardNEB = true; - iarg+=2;} - else error->all(FLERR,"Illegal fix neb command. Unknown keyword");} - else if (strcmp (arg[iarg],"perp")==0) { - PerpSpring=true; + if (strcmp(arg[iarg],"nudge") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command"); + if (strcmp(arg[iarg+1],"ideal") == 0) { + NEBLongRange = true; + StandardNEB = false; + } else if (strcmp(arg[iarg+1],"neigh") == 0) { + NEBLongRange = false; + StandardNEB = true; + } else error->all(FLERR,"Illegal fix neb command"); + iarg += 2; + + } else if (strcmp(arg[iarg],"perp") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal fix neb command"); + PerpSpring = true; kspringPerp = force->numeric(FLERR,arg[iarg+1]); - if (kspringPerp < 0.0) - error->all(FLERR,"Illegal fix neb command. 
" - "The perpendicular spring force was not provided properly"); - iarg+=2;} - else if (strcmp (arg[iarg],"freeend")==0) { - if (strcmp (arg[iarg+1],"ini")==0) - FreeEndIni=true; - else if (strcmp (arg[iarg+1],"final")==0) - FreeEndFinal=true; - else if (strcmp (arg[iarg+1],"finaleini")==0) - FreeEndFinalWithRespToEIni=true; - else if (strcmp (arg[iarg+1],"final2eini")==0) { - FinalAndInterWithRespToEIni=true; - FreeEndFinalWithRespToEIni=true;} - else if (strcmp (arg[iarg+1],"none")!=0) error->all(FLERR,"Illegal fix neb command. Unknown keyword"); - iarg+=2;} - else if (strcmp (arg[iarg],"freeend_kspring")==0) { - kspring2=force->numeric(FLERR,arg[iarg+1]); - iarg+=2; } - else error->all(FLERR,"Illegal fix neb command. Unknown keyword"); + if (kspringPerp == 0.0) PerpSpring = false; + if (kspringPerp < 0.0) error->all(FLERR,"Illegal fix neb command"); + iarg += 2; + + } else if (strcmp (arg[iarg],"end") == 0) { + if (iarg+3 > narg) error->all(FLERR,"Illegal fix neb command"); + if (strcmp(arg[iarg+1],"first") == 0) { + FreeEndIni = true; + } else if (strcmp(arg[iarg+1],"last") == 0) { + FreeEndFinal = true; + FinalAndInterWithRespToEIni = false; + FreeEndFinalWithRespToEIni = false; + } else if (strcmp(arg[iarg+1],"last/efirst") == 0) { + FreeEndFinal = false; + FinalAndInterWithRespToEIni = false; + FreeEndFinalWithRespToEIni = true; + } else if (strcmp(arg[iarg+1],"last/efirst/middle") == 0) { + FreeEndFinal = false; + FinalAndInterWithRespToEIni = true; + FreeEndFinalWithRespToEIni = true; + } else error->all(FLERR,"Illegal fix neb command"); + kspring2 = force->numeric(FLERR,arg[iarg+2]); + iarg += 3; + + } else error->all(FLERR,"Illegal fix neb command"); } // nreplica = number of partitions @@ -119,12 +128,12 @@ FixNEB::FixNEB(LAMMPS *lmp, int narg, char **arg) : MPI_Group uworldgroup,rootgroup; if (NEBLongRange) { for (int i=0; i<nreplica; i++) - iroots[i]=universe->root_proc[i]; + iroots[i] = universe->root_proc[i]; MPI_Comm_group(uworld, &uworldgroup); MPI_Group_incl(uworldgroup, nreplica, iroots, &rootgroup); MPI_Comm_create(uworld, rootgroup, &rootworld); } - delete[] iroots; + delete [] iroots; // create a new compute pe style // id = fix-ID + pe, compute group = all @@ -256,11 +265,11 @@ void FixNEB::min_post_force(int vflag) double delxp,delyp,delzp,delxn,delyn,delzn; double vIni=0.0; - vprev=vnext=veng=pe->compute_scalar(); + vprev = vnext = veng = pe->compute_scalar(); - if (ireplica < nreplica-1 && me ==0) + if (ireplica < nreplica-1 && me == 0) MPI_Send(&veng,1,MPI_DOUBLE,procnext,0,uworld); - if (ireplica > 0 && me ==0) + if (ireplica > 0 && me == 0) MPI_Recv(&vprev,1,MPI_DOUBLE,procprev,0,uworld,MPI_STATUS_IGNORE); if (ireplica > 0 && me == 0) @@ -297,6 +306,7 @@ void FixNEB::min_post_force(int vflag) } // communicate atoms to/from adjacent replicas to fill xprev,xnext + inter_replica_comm(); // trigger potential energy computation on next timestep @@ -335,10 +345,10 @@ void FixNEB::min_post_force(int vflag) tangent[i][0]=delxp; tangent[i][1]=delyp; tangent[i][2]=delzp; - tlen += tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; - dot += f[i][0]*tangent[i][0] - + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; } } @@ -360,10 +370,10 @@ void FixNEB::min_post_force(int vflag) tangent[i][0]=delxn; tangent[i][1]=delyn; tangent[i][2]=delzn; - tlen += 
tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; - dot += f[i][0]*tangent[i][0] - + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; } } } else { @@ -388,13 +398,13 @@ void FixNEB::min_post_force(int vflag) domain->minimum_image(delxn,delyn,delzn); if (vnext > veng && veng > vprev) { - tangent[i][0]=delxn; - tangent[i][1]=delyn; - tangent[i][2]=delzn; + tangent[i][0] = delxn; + tangent[i][1] = delyn; + tangent[i][2] = delzn; } else if (vnext < veng && veng < vprev) { - tangent[i][0]=delxp; - tangent[i][1]=delyp; - tangent[i][2]=delzp; + tangent[i][0] = delxp; + tangent[i][1] = delyp; + tangent[i][2] = delzp; } else { if (vnext > vprev) { tangent[i][0] = vmax*delxn + vmin*delxp; @@ -408,24 +418,23 @@ void FixNEB::min_post_force(int vflag) } nlen += delxn*delxn + delyn*delyn + delzn*delzn; - tlen += tangent[i][0]*tangent[i][0] - + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; + tlen += tangent[i][0]*tangent[i][0] + + tangent[i][1]*tangent[i][1] + tangent[i][2]*tangent[i][2]; gradlen += f[i][0]*f[i][0] + f[i][1]*f[i][1] + f[i][2]*f[i][2]; dotpath += delxp*delxn + delyp*delyn + delzp*delzn; - dottangrad += tangent[i][0]* f[i][0] - + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2]; - gradnextlen += fnext[i][0]*fnext[i][0] - + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2]; - dotgrad += f[i][0]*fnext[i][0] - + f[i][1]*fnext[i][1] + f[i][2]*fnext[i][2]; - - springF[i][0]=kspringPerp*(delxn-delxp); - springF[i][1]=kspringPerp*(delyn-delyp); - springF[i][2]=kspringPerp*(delzn-delzp); + dottangrad += tangent[i][0]*f[i][0] + + tangent[i][1]*f[i][1] + tangent[i][2]*f[i][2]; + gradnextlen += fnext[i][0]*fnext[i][0] + + fnext[i][1]*fnext[i][1] +fnext[i][2] * fnext[i][2]; + dotgrad += f[i][0]*fnext[i][0] + f[i][1]*fnext[i][1] + + f[i][2]*fnext[i][2]; + + springF[i][0] = kspringPerp*(delxn-delxp); + springF[i][1] = kspringPerp*(delyn-delyp); + springF[i][2] = kspringPerp*(delzn-delzp); } } -#define BUFSIZE 8 double bufin[BUFSIZE], bufout[BUFSIZE]; bufin[0] = nlen; bufin[1] = plen; @@ -459,7 +468,7 @@ void FixNEB::min_post_force(int vflag) // first or last replica has no change to forces, just return - if(ireplica>0 && ireplica<nreplica-1) + if (ireplica > 0 && ireplica < nreplica-1) dottangrad = dottangrad/(tlen*gradlen); if (ireplica == 0) dottangrad = dottangrad/(nlen*gradlen); @@ -468,7 +477,6 @@ void FixNEB::min_post_force(int vflag) if (ireplica < nreplica-1) dotgrad = dotgrad /(gradlen*gradnextlen); - if (FreeEndIni && ireplica == 0) { if (tlen > 0.0) { double dotall; @@ -568,14 +576,15 @@ void FixNEB::min_post_force(int vflag) for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - dot += f[i][0]*tangent[i][0] - + f[i][1]*tangent[i][1] + f[i][2]*tangent[i][2]; - dotSpringTangent += springF[i][0]*tangent[i][0] - +springF[i][1]*tangent[i][1]+springF[i][2]*tangent[i][2];} + dot += f[i][0]*tangent[i][0] + f[i][1]*tangent[i][1] + + f[i][2]*tangent[i][2]; + dotSpringTangent += springF[i][0]*tangent[i][0] + + springF[i][1]*tangent[i][1] + springF[i][2]*tangent[i][2];} } double dotSpringTangentall; - MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1,MPI_DOUBLE,MPI_SUM,world); + MPI_Allreduce(&dotSpringTangent,&dotSpringTangentall,1, + MPI_DOUBLE,MPI_SUM,world); dotSpringTangent=dotSpringTangentall; double dotall; 
MPI_Allreduce(&dot,&dotall,1,MPI_DOUBLE,MPI_SUM,world); @@ -603,12 +612,12 @@ void FixNEB::min_post_force(int vflag) for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - f[i][0] += prefactor*tangent[i][0] - +AngularContr*(springF[i][0] -dotSpringTangent*tangent[i][0]); - f[i][1] += prefactor*tangent[i][1] - + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]); - f[i][2] += prefactor*tangent[i][2] - + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]); + f[i][0] += prefactor*tangent[i][0] + + AngularContr*(springF[i][0] - dotSpringTangent*tangent[i][0]); + f[i][1] += prefactor*tangent[i][1] + + AngularContr*(springF[i][1] - dotSpringTangent*tangent[i][1]); + f[i][2] += prefactor*tangent[i][2] + + AngularContr*(springF[i][2] - dotSpringTangent*tangent[i][2]); } } @@ -827,7 +836,6 @@ void FixNEB::inter_replica_comm() } } - /* ---------------------------------------------------------------------- reallocate xprev,xnext,tangent arrays if necessary reallocate communication arrays if necessary diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README index e32a09c45c..c02014d0ce 100644 --- a/src/USER-INTEL/README +++ b/src/USER-INTEL/README @@ -4,6 +4,7 @@ -------------------------------- W. Michael Brown (Intel) michael.w.brown at intel.com + William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) Markus Höhnerbach (RWTH Aachen University) Stan Moore (Sandia) @@ -14,15 +15,25 @@ ----------------------------------------------------------------------------- -This package is based on the USER-OMP package and provides LAMMPS styles that: +This package provides LAMMPS styles that: 1. include support for single and mixed precision in addition to double. 2. include modifications to support vectorization for key routines + 3. include modifications for data layouts to improve cache efficiency 4. include modifications to support offload to Intel(R) Xeon Phi(TM) coprocessors ----------------------------------------------------------------------------- +For Intel server processors codenamed "Skylake", the following flags should +be added or changed in the Makefile depending on the version: + +2017 update 2 - No changes needed +2017 updates 3 or 4 - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512 +2018 or newer - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high + +----------------------------------------------------------------------------- + When using the suffix command with "intel", intel styles will be used if they exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP package is installed, USER-OMP styles will be used whenever USER-INTEL styles are not available.
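Item 1 in the list above refers to the fact that the styles in this package are templated on a compute type and an accumulator type (the flt_t/acc_t template parameters that appear throughout the sources below), so per-interaction arithmetic can run in single precision while sums are accumulated in double. A minimal sketch of that mixed-precision pattern, with illustrative names only and not code from this package:

    // Illustrative only: per-term math in flt_t, accumulation in acc_t.
    template <class flt_t, class acc_t>
    acc_t accumulate(const flt_t *term, int n) {
      acc_t sum = (acc_t)0.0;
      for (int i = 0; i < n; i++)
        sum += (acc_t)term[i];   // widen each term before adding
      return sum;
    }
    // double precision: accumulate<double,double>(terms, n)
    // mixed precision:  accumulate<float,double>(terms, n)
    // single precision: accumulate<float,float>(terms, n)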
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README index cf14fb3237..758c37bf56 100644 --- a/src/USER-INTEL/TEST/README +++ b/src/USER-INTEL/TEST/README @@ -4,6 +4,7 @@ # in.intel.lj - Atomic fluid (LJ Benchmark) # in.intel.rhodo - Protein (Rhodopsin Benchmark) # in.intel.lc - Liquid Crystal w/ Gay-Berne potential +# in.intel.eam - Copper benchmark with Embedded Atom Method # in.intel.sw - Silicon benchmark with Stillinger-Weber # in.intel.tersoff - Silicon benchmark with Tersoff # in.intel.water - Coarse-grain water benchmark using Stillinger-Weber @@ -11,19 +12,26 @@ ############################################################################# ############################################################################# -# Expected Timesteps/second with turbo on and HT enabled, LAMMPS 18-Jun-2016 +# Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017 +# - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi # # Xeon E5-2697v4 Xeon Phi 7250 # -# in.intel.lj - 162.764 179.148 -# in.intel.rhodo - 11.633 13.668 -# in.intel.lc - 19.136 24.863 -# in.intel.sw - 139.048 152.026 -# in.intel.tersoff - 82.663 92.985 -# in.intel.water - 59.838 85.704 +# in.intel.lj - 199.5 282.3 +# in.intel.rhodo - 12.4 17.5 +# in.intel.lc - 19.0 25.7 +# in.intel.eam - 59.4 92.8 +# in.intel.sw - 132.4 161.9 +# in.intel.tersoff - 83.3 101.1 +# in.intel.water - 53.4 90.3 # ############################################################################# +############################################################################# +# For Skylake server (Xeon) architectures, see notes in the USER-INTEL/README +# for build flags that should be used. +############################################################################# + ############################################################################# # For Haswell (Xeon v3) architectures, depending on the compiler version, # it may give better performance to compile for an AVX target (with -xAVX @@ -42,7 +50,18 @@ # -v m 0.5 # Run for half as long ############################################################################# -# Example for running benchmarks: +############################################################################# +# The LAMMPS newton setting can be controlled from the commandline for the +# benchmarks with the N variable: +# +# -v N on # newton on +# -v N off # newton off +# +# The default is on for all of the benchmarks except for LJ where the off +# setting performs best with the USER-INTEL package. +############################################################################# + +# Example for running benchmarks (see run_benchmarks.sh for script): # Number of physical cores per node not including hyperthreads export LMP_CORES=28 @@ -57,26 +76,35 @@ export LMP_BIN=../../lmp_intel_cpu # LAMMPS root directory export LMP_ROOT=../../../ -source /opt/intel/parallel_studio_xe_2016.2.062/psxevars.sh +source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh +export KMP_BLOCKTIME=0 export I_MPI_PIN_DOMAIN=core export I_MPI_FABRICS=shm # For single node +# ONLY FOR INTEL XEON PHI x200 SERIES PROCESSORS +export I_MPI_SHM_LMT=shm + # Generate the restart file for use with liquid crystal benchmark mpirun -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none # Benchmark to run export bench=in.intel.lj +############################################################################# +# For Intel Xeon Phi x200 series processors best performance is achieved by +using MCDRAM.
In flat mode, this can be achieved with numactl, +MPI environment variables, or other options provided by batch schedulers. +############################################################################# ############################################################################# # To run without an optimization package ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -v N on ############################################################################# # To run with USER-OMP package ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk omp 0 -sf omp -v N on ############################################################################# # To run with USER-INTEL package and no coprocessor @@ -89,6 +117,9 @@ mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 -sf intel mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 1 -sf intel ############################################################################# -# If using PPPM (in.intel.rhodo) on Intel Xeon Phi x200 series processors +# If using PPPM (e.g. in.intel.rhodo) on Intel Xeon Phi x200 series +# or Skylake processors ############################################################################# -mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp 3 lrt yes -sf intel +export KMP_AFFINITY=none +rthreads=$((OMP_NUM_THREADS-1)) +mpirun -np $LMP_CORES $LMP_BIN -in $bench -log none -pk intel 0 omp $rthreads lrt yes -sf intel diff --git a/src/USER-INTEL/TEST/in.intel.eam b/src/USER-INTEL/TEST/in.intel.eam index e9523a5dd1..5a3b3064af 100644 --- a/src/USER-INTEL/TEST/in.intel.eam +++ b/src/USER-INTEL/TEST/in.intel.eam @@ -1,4 +1,6 @@ # bulk Cu lattice + +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 3100 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -13,6 +15,7 @@ variable z index 2 variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" variable xx equal 20*$x diff --git a/src/USER-INTEL/TEST/in.intel.lc b/src/USER-INTEL/TEST/in.intel.lc index 0172ba3b4d..411f5d830d 100644 --- a/src/USER-INTEL/TEST/in.intel.lc +++ b/src/USER-INTEL/TEST/in.intel.lc @@ -3,6 +3,7 @@ # shape: 2 1.5 1 # cutoff 4.0 with skin 0.8 +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 840 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -15,6 +16,7 @@ variable z index 2 variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units lj diff --git a/src/USER-INTEL/TEST/in.intel.lj b/src/USER-INTEL/TEST/in.intel.lj index 8931ca24bc..2b724f6014 100644 --- a/src/USER-INTEL/TEST/in.intel.lj +++ b/src/USER-INTEL/TEST/in.intel.lj @@ -1,5 +1,6 @@ # 3d Lennard-Jones melt +variable N index off # Newton Setting variable w index 10 # Warmup Timesteps variable t index 7900 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -15,6 +16,7 @@ variable yy equal 20*$y variable zz equal 20*$z variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units lj diff --git a/src/USER-INTEL/TEST/in.intel.rhodo b/src/USER-INTEL/TEST/in.intel.rhodo index 7b3b092607..05145d79c0 100644 ---
a/src/USER-INTEL/TEST/in.intel.rhodo +++ b/src/USER-INTEL/TEST/in.intel.rhodo @@ -1,5 +1,6 @@ # Rhodopsin model +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 520 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,10 +17,11 @@ variable z index 2 variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units real -neigh_modify delay 5 every 1 binsize $b +neigh_modify delay 5 every 1 atom_style full bond_style harmonic diff --git a/src/USER-INTEL/TEST/in.intel.sw b/src/USER-INTEL/TEST/in.intel.sw index 077c9bb4fb..494f58dea3 100644 --- a/src/USER-INTEL/TEST/in.intel.sw +++ b/src/USER-INTEL/TEST/in.intel.sw @@ -1,5 +1,6 @@ # bulk Si via Stillinger-Weber +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 6200 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,6 +17,7 @@ variable zz equal 10*$z variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units metal diff --git a/src/USER-INTEL/TEST/in.intel.tersoff b/src/USER-INTEL/TEST/in.intel.tersoff index f0c6a88f75..574b29f674 100644 --- a/src/USER-INTEL/TEST/in.intel.tersoff +++ b/src/USER-INTEL/TEST/in.intel.tersoff @@ -1,5 +1,6 @@ # bulk Si via Tersoff +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 2420 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -16,6 +17,7 @@ variable zz equal 10*$z variable rr equal floor($t*$m) variable root getenv LMP_ROOT +newton $N if "$n > 0" then "processors * * * grid numa" units metal diff --git a/src/USER-INTEL/TEST/in.intel.water b/src/USER-INTEL/TEST/in.intel.water index 1c1fca311f..0643def19e 100644 --- a/src/USER-INTEL/TEST/in.intel.water +++ b/src/USER-INTEL/TEST/in.intel.water @@ -1,5 +1,6 @@ # Coarse-grain water simulation using Stillinger-Weber +variable N index on # Newton Setting variable w index 10 # Warmup Timesteps variable t index 2600 # Main Run Timesteps variable m index 1 # Main Run Timestep Multiplier @@ -11,6 +12,7 @@ variable y index 2 variable z index 2 variable rr equal floor($t*$m) +newton $N if "$n > 0" then "processors * * * grid numa" units real diff --git a/src/USER-INTEL/TEST/in.lc_generate_restart b/src/USER-INTEL/TEST/in.lc_generate_restart index 8ae53c5c8e..30d593f2cd 100644 --- a/src/USER-INTEL/TEST/in.lc_generate_restart +++ b/src/USER-INTEL/TEST/in.lc_generate_restart @@ -4,13 +4,13 @@ # cutoff 4.0 with skin 0.8 # NPT, T=2.4, P=8.0 -variable x index 1 -variable y index 1 -variable z index 1 +variable xt index 1 +variable yt index 1 +variable zt index 1 -variable i equal $x*32 -variable j equal $y*32 -variable k equal $z*32 +variable i equal ${xt}*32 +variable j equal ${yt}*32 +variable k equal ${zt}*32 units lj atom_style ellipsoid diff --git a/src/USER-INTEL/TEST/run_benchmarks.sh b/src/USER-INTEL/TEST/run_benchmarks.sh new file mode 100755 index 0000000000..10bd79e0d1 --- /dev/null +++ b/src/USER-INTEL/TEST/run_benchmarks.sh @@ -0,0 +1,86 @@ +#!/bin/bash + +######################################################################### +# Adjust settings below for your system +######################################################################### + +# --------------------- MPI Launch Command + +export MPI="mpirun" +#export MPI="numactl -p 1 mpirun" # -- Systems w/ MCDRAM in flat mode + +# ------------- Name and location of the LAMMPS binary + 
+export LMP_BIN=../../lmp_intel_cpu_intelmpi +#export LMP_BIN=../../lmp_knl + +# ------------- Directory containing the LAMMPS installation + +export LMP_ROOT=../../../ + +# ------------- Number of physical cores (not HW threads) + +export LMP_CORES=36 # -- For Intel Xeon E5-2697v4 SKU +#export LMP_CORES=68 # -- For Intel Xeon Phi x200 7250 SKU + +# ------------- Number of HW threads to use in tests + +export LMP_THREAD_LIST="2" # -- For 2 threads per core w/ HT enabled +#export LMP_THREAD_LIST="2 4" # -- For 2 threads per core w/ HT enabled + +# ------------- MPI Tuning Parameters + +#export I_MPI_SHM_LMT=shm # -- Uncomment for Xeon Phi x200 series + +# ------------- Library locations for build + +#source /opt/intel/parallel_studio_xe_2017.2.050/psxevars.sh + +######################################################################### +# End settings for your system +######################################################################### + +export WORKLOADS="lj rhodo rhodo_lrt lc sw water eam" +export LMP_ARGS="-pk intel 0 -sf intel -screen none -v d 1" +export RLMP_ARGS="-pk intel 0 lrt yes -sf intel -screen none -v d 1" + +export LOG_DIR_HEADER=`echo $LMP_BIN | sed 's/\.\.\///g' | sed 's/\.\///g'` +export LOG_DIR_HOST=`hostname` +export DATE_STRING=`date +%s` +export LOG_DIR=$LOG_DIR_HOST"_"$LOG_DIR_HEADER"_"$DATE_STRING +mkdir $LOG_DIR + +export I_MPI_PIN_DOMAIN=core +export I_MPI_FABRICS=shm +export KMP_BLOCKTIME=0 + +echo -n "Creating restart file...." +$MPI -np $LMP_CORES $LMP_BIN -in in.lc_generate_restart -log none $LMP_ARGS +echo "Done." +for threads in $LMP_THREAD_LIST +do + export OMP_NUM_THREADS=$threads + for workload in $WORKLOADS + do + export LOGFILE=$LOG_DIR/$workload.$LMP_CORES"c"$threads"t".log + echo "Running $LOGFILE" + cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $LMP_ARGS"; + rthreads=$threads + unset KMP_AFFINITY + $cmd + + # - For benchmarks with PPPM, also try LRT mode + if [ $workload = "rhodo" ]; then + export LOGFILE=$LOG_DIR/$workload"_lrt".$LMP_CORES"c"$threads"t".log + cmd="$MPI -np $LMP_CORES $LMP_BIN -in in.intel.$workload -log $LOGFILE $RLMP_ARGS"; + rthreads=$((threads-1)) + export KMP_AFFINITY=none + export OMP_NUM_THREADS=$rthreads + echo " $cmd" >> $LOG_DIR/commands.info + $cmd + fi + done +done + +# Performance reported by LAMMPS (Timesteps/second ignoring warm-up run) +grep Perf $LOG_DIR/*.log | awk 'BEGIN{n=1}n%2==0{print $0}{n++}' | sed 's/\/day//g' | sed 's/steps\/s/steps_s/g' | sed 's/hours\/ns//g' | sed 's/.*\///g' | sed 's/\.log:Performance://g' | awk '{c=NF-1; print $1,$c}' diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp index aafc765c6b..0c493646e3 100644 --- a/src/USER-INTEL/angle_charmm_intel.cpp +++ b/src/USER-INTEL/angle_charmm_intel.cpp @@ -81,16 +81,16 @@ void AngleCharmmIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -102,7 +102,7 @@ void AngleCharmmIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class 
acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void AngleCharmmIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -126,12 +126,9 @@ void AngleCharmmIntel::eval(const int vflag, const int nthreads = tc; acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeangle = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeangle = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -140,8 +137,12 @@ void AngleCharmmIntel::eval(const int vflag, reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -150,7 +151,17 @@ void AngleCharmmIntel::eval(const int vflag, const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seangle = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = anglelist[n].a; const int i2 = anglelist[n].b; const int i3 = anglelist[n].c; @@ -229,40 +240,58 @@ void AngleCharmmIntel::eval(const int vflag, // apply force to each of 3 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (EVFLAG) { - IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, - f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, - delx2, dely2, delz2, oeangle, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, seangle, + f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, + sv4, sv5); + #else + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, oeangle, + f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeangle += seangle; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeangle; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oeangle; + if 
(VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp index f101fd9e1f..198431d552 100644 --- a/src/USER-INTEL/angle_harmonic_intel.cpp +++ b/src/USER-INTEL/angle_harmonic_intel.cpp @@ -81,16 +81,16 @@ void AngleHarmonicIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -102,7 +102,7 @@ void AngleHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void AngleHarmonicIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -126,12 +126,9 @@ void AngleHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeangle = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeangle = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -140,8 +137,12 @@ void AngleHarmonicIntel::eval(const int vflag, reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -150,7 +151,17 @@ void AngleHarmonicIntel::eval(const int vflag, const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seangle, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seangle = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seangle, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = anglelist[n].a; const int i2 = anglelist[n].b; const int i3 = anglelist[n].c; @@ -211,40 +222,58 @@ void AngleHarmonicIntel::eval(const int vflag, // apply force to each of 3 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= f1x + f3x; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; + } + + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; - } - - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } - - if (EVFLAG) { 
- IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, - f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, - delx2, dely2, delz2, oeangle, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, seangle, f, + NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, + sv5); + #else + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, oeangle, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, + ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeangle += seangle; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeangle; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oeangle; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp index e61ab9be84..430142a72a 100644 --- a/src/USER-INTEL/bond_fene_intel.cpp +++ b/src/USER-INTEL/bond_fene_intel.cpp @@ -77,16 +77,16 @@ void BondFENEIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,10 +96,10 @@ void BondFENEIntel::compute(int eflag, int vflag, } } -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void BondFENEIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nbondlist; if (inum == 0) return; @@ -119,23 +119,23 @@ void BondFENEIntel::eval(const int vflag, const int nthreads = tc; acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oebond = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oebond = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } - #if defined(_OPENMP) #pragma omp parallel default(none) \ shared(f_start,f_stride,fc) \ reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -144,7 +144,17 @@ void BondFENEIntel::eval(const int vflag, const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5; + if 
(EFLAG) sebond = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = bondlist[n].a; const int i2 = bondlist[n].b; const int type = bondlist[n].t; @@ -199,33 +209,48 @@ void BondFENEIntel::eval(const int vflag, // apply force to each of 2 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; - } - - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= delz*fbond; - } - - if (EVFLAG) { - IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += delx*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= delx*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; + } + } + + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, sebond, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, delx, dely, delz, oebond, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oebond += sebond; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oebond; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oebond; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp index 51a33b1cc3..1cccf5fe54 100644 --- a/src/USER-INTEL/bond_harmonic_intel.cpp +++ b/src/USER-INTEL/bond_harmonic_intel.cpp @@ -77,16 +77,16 @@ void BondHarmonicIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,7 +96,7 @@ void BondHarmonicIntel::compute(int eflag, int vflag, } } -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void BondHarmonicIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -119,12 +119,9 @@ void BondHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oebond = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oebond = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -133,8 +130,12 @@ void 
BondHarmonicIntel::eval(const int vflag, reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -143,7 +144,17 @@ void BondHarmonicIntel::eval(const int vflag, const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; - for (int n = nfrom; n < nto; n++) { + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t sebond, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) sebond = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:sebond, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = bondlist[n].a; const int i2 = bondlist[n].b; const int type = bondlist[n].t; @@ -167,33 +178,50 @@ void BondHarmonicIntel::eval(const int vflag, if (EFLAG) ebond = rk*dr; // apply force to each of 2 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; - } - - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= delz*fbond; + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += delx*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; + } + + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x -= delx*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; + } } - if (EVFLAG) { - IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, - delx, dely, delz, oebond, f, NEWTON_BOND, - nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, sebond, f, + NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, + sv4, sv5); + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, oebond, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + ov4, ov5); + #endif } } // for n + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oebond += sebond; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; + } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oebond; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oebond; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp index c07c226611..df8834c283 100644 --- a/src/USER-INTEL/dihedral_charmm_intel.cpp +++ b/src/USER-INTEL/dihedral_charmm_intel.cpp @@ -93,16 +93,16 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, force->pair->vflag_either = force->pair->vflag_global = 1; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, 
buffers, fc); } } else { if (force->newton_bond) @@ -114,7 +114,7 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, #ifndef LMP_USE_AVXCD_DHC -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void DihedralCharmmIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -140,13 +140,10 @@ void DihedralCharmmIntel::eval(const int vflag, acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5; - if (EVFLAG) { - if (EFLAG) - oevdwl = oecoul = oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -156,8 +153,13 @@ void DihedralCharmmIntel::eval(const int vflag, opv0,opv1,opv2,opv3,opv4,opv5) #endif { + #if defined(LMP_SIMD_COMPILER_TEST) int nfrom, nto, tid; IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + int nfrom, npl, nto, tid; + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -169,21 +171,19 @@ void DihedralCharmmIntel::eval(const int vflag, acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5; - if (EVFLAG) { - if (EFLAG) - sevdwl = secoul = sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0; - } + if (EFLAG) sevdwl = secoul = sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0; } #if defined(LMP_SIMD_COMPILER_TEST) #pragma vector aligned #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) - #endif for (int n = nfrom; n < nto; n++) { + #endif + for (int n = nfrom; n < nto; n += npl) { const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -333,14 +333,14 @@ void DihedralCharmmIntel::eval(const int vflag, const flt_t f3y = -sy2 - f4y; const flt_t f3z = -sz2 - f4z; - if (EVFLAG) { + if (EFLAG || VFLAG) { flt_t deng; if (EFLAG) deng = tk * p; - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); } @@ -387,7 +387,7 @@ void DihedralCharmmIntel::eval(const int vflag, f4z -= delz*fpair; } - if (EVFLAG) { + if (EFLAG || VFLAG) { flt_t ev_pre = (flt_t)0; if (NEWTON_BOND || i1 < nlocal) ev_pre += (flt_t)0.5; @@ -412,13 +412,13 @@ void DihedralCharmmIntel::eval(const int vflag, } // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, // delx, dely, delz); - if (vflag) { - spv0 += ev_pre * delx * delx * fpair; - spv1 += ev_pre * dely * dely * fpair; - spv2 += ev_pre * delz * delz * fpair; - spv3 
+= ev_pre * delx * dely * fpair; - spv4 += ev_pre * delx * delz * fpair; - spv5 += ev_pre * dely * delz * fpair; + if (VFLAG && vflag) { + spv0 += ev_pre * delx * delx * fpair; + spv1 += ev_pre * dely * dely * fpair; + spv2 += ev_pre * delz * delz * fpair; + spv3 += ev_pre * delx * dely * fpair; + spv4 += ev_pre * delx * delz * fpair; + spv5 += ev_pre * dely * delz * fpair; } } @@ -440,36 +440,32 @@ void DihedralCharmmIntel::eval(const int vflag, } } } // for n - if (EVFLAG) { - if (EFLAG) { - oedihedral += sedihedral; - oecoul += secoul; - oevdwl += sevdwl; - } - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - opv0 += spv0; opv1 += spv1; opv2 += spv2; - opv3 += spv3; opv4 += spv4; opv5 += spv5; - } - } - } // omp parallel - - if (EVFLAG) { if (EFLAG) { - energy += oedihedral; - force->pair->eng_vdwl += oevdwl; - force->pair->eng_coul += oecoul; + oedihedral += sedihedral; + oecoul += secoul; + oevdwl += sevdwl; } - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - force->pair->virial[0] += opv0; - force->pair->virial[1] += opv1; - force->pair->virial[2] += opv2; - force->pair->virial[3] += opv3; - force->pair->virial[4] += opv4; - force->pair->virial[5] += opv5; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; + opv0 += spv0; opv1 += spv1; opv2 += spv2; + opv3 += spv3; opv4 += spv4; opv5 += spv5; } + } // omp parallel + + if (EFLAG) { + energy += oedihedral; + force->pair->eng_vdwl += oevdwl; + force->pair->eng_coul += oecoul; + } + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + force->pair->virial[0] += opv0; + force->pair->virial[1] += opv1; + force->pair->virial[2] += opv2; + force->pair->virial[3] += opv3; + force->pair->virial[4] += opv4; + force->pair->virial[5] += opv5; } fix->set_reduce_flag(); @@ -488,7 +484,7 @@ authors for more details. 
------------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void DihedralCharmmIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -518,13 +514,10 @@ void DihedralCharmmIntel::eval(const int vflag, acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5; - if (EVFLAG) { - if (EFLAG) - oevdwl = oecoul = oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = oecoul = oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -534,8 +527,9 @@ void DihedralCharmmIntel::eval(const int vflag, opv0,opv1,opv2,opv3,opv4,opv5) #endif { - int nfrom, nto, tid; - IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + int nfrom, npl, nto, tid; + IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads, + swidth); FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -559,26 +553,24 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; SIMD_acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5; - if (EVFLAG) { - if (EFLAG) { - sevdwl = SIMD_set((acc_t)0.0); - secoul = SIMD_set((acc_t)0.0); - sedihedral = SIMD_set((acc_t)0.0); - } - if (vflag) { - sv0 = SIMD_set((acc_t)0.0); - sv1 = SIMD_set((acc_t)0.0); - sv2 = SIMD_set((acc_t)0.0); - sv3 = SIMD_set((acc_t)0.0); - sv4 = SIMD_set((acc_t)0.0); - sv5 = SIMD_set((acc_t)0.0); - spv0 = SIMD_set((acc_t)0.0); - spv1 = SIMD_set((acc_t)0.0); - spv2 = SIMD_set((acc_t)0.0); - spv3 = SIMD_set((acc_t)0.0); - spv4 = SIMD_set((acc_t)0.0); - spv5 = SIMD_set((acc_t)0.0); - } + if (EFLAG) { + sevdwl = SIMD_set((acc_t)0.0); + secoul = SIMD_set((acc_t)0.0); + sedihedral = SIMD_set((acc_t)0.0); + } + if (VFLAG && vflag) { + sv0 = SIMD_set((acc_t)0.0); + sv1 = SIMD_set((acc_t)0.0); + sv2 = SIMD_set((acc_t)0.0); + sv3 = SIMD_set((acc_t)0.0); + sv4 = SIMD_set((acc_t)0.0); + sv5 = SIMD_set((acc_t)0.0); + spv0 = SIMD_set((acc_t)0.0); + spv1 = SIMD_set((acc_t)0.0); + spv2 = SIMD_set((acc_t)0.0); + spv3 = SIMD_set((acc_t)0.0); + spv4 = SIMD_set((acc_t)0.0); + spv5 = SIMD_set((acc_t)0.0); } SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, @@ -588,7 +580,7 @@ void DihedralCharmmIntel::eval(const int vflag, const SIMD_int simd_nlocals4 = SIMD_set(nlocals4); const int ntypes = atom->ntypes + 1; - for (int n = nfrom; n < nto; n += swidth) { + for (int n = nfrom; n < nto; n += npl) { SIMD_mask nmask = n_offset < nto5; SIMD_int i1 = SIMD_gather(nmask, dihedrallist, n_offset); const SIMD_flt_t q1 = SIMD_gather(nmask, q, i1); @@ -601,7 +593,7 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_int type = SIMD_gather(nmask, dihedrallist+4, n_offset); const SIMD_flt_t tweight = SIMD_gather(nmask, weight, type); type = type << 2; - n_offset = n_offset + swidth * 5; + n_offset = n_offset + npl * 5; // 1st bond @@ -747,7 +739,7 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_flt_t f3z = -sz2 - f4z; SIMD_flt_t qdeng; - if (EVFLAG) { + if (EFLAG || VFLAG) { SIMD_flt_t ev_pre; if (NEWTON_BOND) ev_pre = one; else { @@ -774,7 +766,7 @@ void DihedralCharmmIntel::eval(const int vflag, 
SIMD_jeng_update(newton_mask, featom, i3, ieng); } } - if (vflag) { + if (VFLAG && vflag) { sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x)); sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); @@ -816,7 +808,7 @@ void DihedralCharmmIntel::eval(const int vflag, f4y = f4y - dely * fpair; f4z = f4z - delz * fpair; - if (EVFLAG) { + if (EFLAG || VFLAG) { SIMD_flt_t ev_pre; if (NEWTON_BOND) ev_pre = one; else { @@ -848,7 +840,7 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_jeng_update(newton_mask, featom, i4, ieng); } } - if (vflag) { + if (VFLAG && vflag) { spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair); spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); @@ -865,45 +857,41 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_safe_jforce(newton_mask, pforce, i4, f4x, f4y, f4z); } // for n - if (EVFLAG) { - if (EFLAG) { - oedihedral += SIMD_sum(sedihedral); - oecoul += SIMD_sum(secoul); - oevdwl += SIMD_sum(sevdwl); - } - if (vflag) { - ov0 += SIMD_sum(sv0); - ov1 += SIMD_sum(sv1); - ov2 += SIMD_sum(sv2); - ov3 += SIMD_sum(sv3); - ov4 += SIMD_sum(sv4); - ov5 += SIMD_sum(sv5); - opv0 += SIMD_sum(spv0); - opv1 += SIMD_sum(spv1); - opv2 += SIMD_sum(spv2); - opv3 += SIMD_sum(spv3); - opv4 += SIMD_sum(spv4); - opv5 += SIMD_sum(spv5); - } - } - } // omp parallel - - if (EVFLAG) { if (EFLAG) { - energy += oedihedral; - force->pair->eng_vdwl += oevdwl; - force->pair->eng_coul += oecoul; + oedihedral += SIMD_sum(sedihedral); + oecoul += SIMD_sum(secoul); + oevdwl += SIMD_sum(sevdwl); } - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - force->pair->virial[0] += opv0; - force->pair->virial[1] += opv1; - force->pair->virial[2] += opv2; - force->pair->virial[3] += opv3; - force->pair->virial[4] += opv4; - force->pair->virial[5] += opv5; + if (VFLAG && vflag) { + ov0 += SIMD_sum(sv0); + ov1 += SIMD_sum(sv1); + ov2 += SIMD_sum(sv2); + ov3 += SIMD_sum(sv3); + ov4 += SIMD_sum(sv4); + ov5 += SIMD_sum(sv5); + opv0 += SIMD_sum(spv0); + opv1 += SIMD_sum(spv1); + opv2 += SIMD_sum(spv2); + opv3 += SIMD_sum(spv3); + opv4 += SIMD_sum(spv4); + opv5 += SIMD_sum(spv5); } + } // omp parallel + + if (EFLAG) { + energy += oedihedral; + force->pair->eng_vdwl += oevdwl; + force->pair->eng_coul += oecoul; + } + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + force->pair->virial[0] += opv0; + force->pair->virial[1] += opv1; + force->pair->virial[2] += opv2; + force->pair->virial[3] += opv3; + force->pair->virial[4] += opv4; + force->pair->virial[5] += opv5; } fix->set_reduce_flag(); @@ -953,12 +941,14 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, fc.set_ntypes(tp1,bp1,memory); buffers->set_ntypes(tp1); - for (int i = 0; i < tp1; i++) { - for (int j = 0; j < tp1; j++) { - fc.ljp[i][j].lj1 = lj14_1[i][j]; - fc.ljp[i][j].lj2 = lj14_2[i][j]; - fc.ljp[i][j].lj3 = lj14_3[i][j]; - fc.ljp[i][j].lj4 = lj14_4[i][j]; + if (weightflag) { + for (int i = 0; i < tp1; i++) { + for (int j = 0; j < tp1; j++) { + fc.ljp[i][j].lj1 = lj14_1[i][j]; + fc.ljp[i][j].lj2 = lj14_2[i][j]; + fc.ljp[i][j].lj3 = lj14_3[i][j]; + fc.ljp[i][j].lj4 = lj14_4[i][j]; + } } } diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp index 
03ab152f49..94130f4355 100644 --- a/src/USER-INTEL/dihedral_harmonic_intel.cpp +++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp @@ -77,16 +77,16 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag, } else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -96,7 +96,7 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag, } } -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void DihedralHarmonicIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -120,12 +120,9 @@ void DihedralHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -134,8 +131,12 @@ void DihedralHarmonicIntel::eval(const int vflag, reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -144,16 +145,17 @@ void DihedralHarmonicIntel::eval(const int vflag, const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if (EFLAG) - sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } - - for (int n = nfrom; n < nto; n++) { + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -203,6 +205,7 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z); // error check + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { int me = comm->me; @@ -224,6 +227,7 @@ void DihedralHarmonicIntel::eval(const int vflag, me,x[i4].x,x[i4].y,x[i4].z); } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -292,16 +296,27 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t f3y = -sy2 - f4y; const flt_t f3z = -sz2 - f4z; - if (EVFLAG) { + if (EFLAG || VFLAG) { flt_t deng; if (EFLAG) deng = tk * p; - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - vb3z, sedihedral, f, NEWTON_BOND, nlocal, + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, 
i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif } + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; @@ -328,20 +343,19 @@ void DihedralHarmonicIntel::eval(const int vflag, } } } // for n - if (EVFLAG) { - if (EFLAG) oedihedral += sedihedral; - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - } + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) energy += oedihedral; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp index bfd5a53956..3248a8bfc7 100644 --- a/src/USER-INTEL/dihedral_opls_intel.cpp +++ b/src/USER-INTEL/dihedral_opls_intel.cpp @@ -81,16 +81,16 @@ void DihedralOPLSIntel::compute(int eflag, int vflag, } else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -100,7 +100,7 @@ void DihedralOPLSIntel::compute(int eflag, int vflag, } } -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void DihedralOPLSIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -124,12 +124,9 @@ void DihedralOPLSIntel::eval(const int vflag, const int nthreads = tc; acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oedihedral = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -138,8 +135,12 @@ void DihedralOPLSIntel::eval(const int vflag, reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -148,16 +149,17 @@ void DihedralOPLSIntel::eval(const int vflag, const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if 
(EFLAG) - sedihedral = (acc_t)0.0; - if (vflag) { - sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } + if (EFLAG) sedihedral = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; } - - for (int n = nfrom; n < nto; n++) { + #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5) + for (int n = nfrom; n < nto; n ++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = dihedrallist[n].a; const int i2 = dihedrallist[n].b; const int i3 = dihedrallist[n].c; @@ -236,6 +238,7 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t dx = (cx*vb3x + cy*vb3y + cz*vb3z)*cmag*rb3; // error check + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { int me = comm->me; @@ -257,6 +260,7 @@ void DihedralOPLSIntel::eval(const int vflag, me,x[i4].x,x[i4].y,x[i4].z); } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -321,14 +325,25 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t f3y = sy2 - f4y; const flt_t f3z = sz2 - f4z; - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, edihed, i1, i2, i3, i4, f1x, - f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, - vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, - vb3z, sedihedral, f, NEWTON_BOND, nlocal, + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif } + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; @@ -355,20 +370,19 @@ void DihedralOPLSIntel::eval(const int vflag, } } } // for n - if (EVFLAG) { - if (EFLAG) oedihedral += sedihedral; - if (vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - } + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oedihedral += sedihedral; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif } // omp parallel - if (EVFLAG) { - if (EFLAG) energy += oedihedral; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; - } + if (EFLAG) energy += oedihedral; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index edd33eb72b..e132947750 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -61,6 +61,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) int ncops = force->inumeric(FLERR,arg[3]); _nbor_pack_width = 1; + _three_body_neighbor = 0; _precision_mode = PREC_MODE_MIXED; _offload_balance = -1.0; @@ -95,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _allow_separate_buffers = 1; _offload_ghost = -1; _lrt = 0; - + int iarg = 4; while (iarg < narg) { if (strcmp(arg[iarg],"omp") == 0) { @@ -140,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) 
else error->all(FLERR,"Illegal package intel command"); iarg += 2; } - + // undocumented options else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) { @@ -326,12 +327,18 @@ void FixIntel::init() "Currently, cannot use more than one intel style with hybrid."); check_neighbor_intel(); - if (_precision_mode == PREC_MODE_SINGLE) + int off_mode = 0; + if (_offload_balance != 0.0) off_mode = 1; + if (_precision_mode == PREC_MODE_SINGLE) { _single_buffers->zero_ev(); - else if (_precision_mode == PREC_MODE_MIXED) + _single_buffers->grow_ncache(off_mode,_nthreads); + } else if (_precision_mode == PREC_MODE_MIXED) { _mixed_buffers->zero_ev(); - else + _mixed_buffers->grow_ncache(off_mode,_nthreads); + } else { _double_buffers->zero_ev(); + _double_buffers->grow_ncache(off_mode,_nthreads); + } _need_reduce = 0; } @@ -367,8 +374,6 @@ void FixIntel::pair_init_check(const bool cdmessage) { #ifdef INTEL_VMASK atom->sortfreq = 1; - if (neighbor->binsizeflag && atom->userbinsize <= 0.0) - atom->userbinsize = neighbor->binsize_user; #endif _nbor_pack_width = 1; @@ -376,9 +381,8 @@ void FixIntel::pair_init_check(const bool cdmessage) #ifdef _LMP_INTEL_OFFLOAD if (_offload_balance != 0.0) atom->sortfreq = 1; - if (force->newton_pair == 0) - _offload_noghost = 0; - else if (_offload_ghost == 0) + _offload_noghost = 0; + if (force->newton_pair && _offload_ghost == 0) _offload_noghost = 1; set_offload_affinity(); @@ -535,24 +539,24 @@ void FixIntel::pre_reverse(int eflag, int vflag) { if (_force_array_m != 0) { if (_need_reduce) { - reduce_results(_force_array_m); + reduce_results(&_force_array_m[0].x); _need_reduce = 0; } - add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0); + add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0); _force_array_m = 0; } else if (_force_array_d != 0) { if (_need_reduce) { - reduce_results(_force_array_d); + reduce_results(&_force_array_d[0].x); _need_reduce = 0; } - add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0); + add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0); _force_array_d = 0; } else if (_force_array_s != 0) { if (_need_reduce) { - reduce_results(_force_array_s); + reduce_results(&_force_array_s[0].x); _need_reduce = 0; } - add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0); + add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0); _force_array_s = 0; } @@ -563,47 +567,56 @@ void FixIntel::pre_reverse(int eflag, int vflag) /* ---------------------------------------------------------------------- */ -template <class ft> -void FixIntel::reduce_results(ft * _noalias const f_start) +template <class acc_t> +void FixIntel::reduce_results(acc_t * _noalias const f_scalar) { int o_range, f_stride; if (force->newton_pair) o_range = atom->nlocal + atom->nghost; else o_range = atom->nlocal; - IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque); - - #if defined(_OPENMP) - #pragma omp parallel default(none) shared(o_range, f_stride) - #endif - { - int iifrom, iito, tid; - IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, - sizeof(ft)); - - int t_off = f_stride; - if (_results_eatom) { - for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector nontemporal") - _use_simd_pragma("novector") - for (int n = iifrom; n < iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - f_start[n].w += f_start[n + t_off].w; - } - t_off 
+= f_stride; - } + IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque); + + o_range *= 4; + const int f_stride4 = f_stride * 4; + + if (_nthreads <= INTEL_HTHREADS) { + acc_t *f_scalar2 = f_scalar + f_stride4; + if (_nthreads == 4) { + acc_t *f_scalar3 = f_scalar2 + f_stride4; + acc_t *f_scalar4 = f_scalar3 + f_stride4; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; + } else if (_nthreads == 2) { + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n]; } else { + acc_t *f_scalar3 = f_scalar2 + f_stride4; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = 0; n < o_range; n++) + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; + } + } else { + #if defined(_OPENMP) + #pragma omp parallel + #endif + { + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, + sizeof(acc_t)); + + acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector nontemporal") - _use_simd_pragma("novector") - for (int n = iifrom; n < iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - } - t_off += f_stride; + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = iifrom; n < iito; n++) + f_scalar[n] += f_scalar2[n]; + f_scalar2 += f_stride4; } } } @@ -641,40 +654,59 @@ void FixIntel::add_results(const ft * _noalias const f_in, #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { if (offload) { - add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); if (force->newton_pair) { + add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); const acc_t * _noalias const enull = 0; int offset = _offload_nlocal; if (atom->torque) offset *= 2; add_oresults(f_in + offset, enull, eatom, vatom, _offload_min_ghost, _offload_nghost); - } + } else + add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair()); } else { - add_oresults(f_in, ev_global, eatom, vatom, - _host_min_local, _host_used_local); if (force->newton_pair) { + add_oresults(f_in, ev_global, eatom, vatom, + _host_min_local, _host_used_local); const acc_t * _noalias const enull = 0; int offset = _host_used_local; if (atom->torque) offset *= 2; add_oresults(f_in + offset, enull, eatom, vatom, _host_min_ghost, _host_used_ghost); + } else { + int start = host_start_pair(); + add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start); } } stop_watch(TIME_PACK); return; } - if (force->newton_pair && (_offload_noghost == 0 || offload == 0)) - f_length = atom->nlocal + atom->nghost; - else - f_length = atom->nlocal; + int start; + if (offload) { + start = 0; + if (force->newton_pair) { + if (_offload_noghost == 0) + f_length = atom->nlocal + atom->nghost; + else + f_length = atom->nlocal; + } else + f_length = offload_end_pair(); + } else { + if (force->newton_pair) { + start = 0; + f_length = atom->nlocal + atom->nghost; + } else { + start = host_start_pair(); + f_length = atom->nlocal - start; + } + } + add_oresults(f_in, ev_global, eatom, vatom, start, f_length); #else if (force->newton_pair) f_length = atom->nlocal + atom->nghost; else f_length = atom->nlocal; - #endif - add_oresults(f_in, ev_global, eatom, vatom, 0, f_length); + #endif stop_watch(TIME_PACK); } @@ -695,8 +727,11 @@ void FixIntel::add_oresults(const ft * _noalias 
const f_in, "Sphere particles not yet supported for gayberne/intel"); } + int packthreads; + if (_nthreads > INTEL_HTHREADS) packthreads = _nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) + #pragma omp parallel if(packthreads > 1) #endif { #if defined(_OPENMP) @@ -705,7 +740,7 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, const int tid = 0; #endif int ifrom, ito; - IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t)); + IP_PRE_omp_range_align(ifrom, ito, tid, nall, packthreads, sizeof(acc_t)); if (atom->torque) { int ii = ifrom * 2; lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + @@ -833,6 +868,11 @@ void FixIntel::add_off_results(const ft * _noalias const f_in, _offload_nlocal; } + if (atom->torque) + if (f_in[1].w < 0.0) + error->all(FLERR, "Bad matrix inversion in mldivide3"); + add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); + // Load balance? if (_offload_balance < 0.0) { if (neighbor->ago == 0) @@ -860,10 +900,6 @@ void FixIntel::add_off_results(const ft * _noalias const f_in, stop_watch(TIME_IMBALANCE); #endif acc_timers(); - if (atom->torque) - if (f_in[1].w < 0.0) - error->all(FLERR, "Bad matrix inversion in mldivide3"); - add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1); } /* ---------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index f4c02b37b5..92d1311256 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -70,23 +70,32 @@ class FixIntel : public Fix { inline int nbor_pack_width() const { return _nbor_pack_width; } inline void nbor_pack_width(const int w) { _nbor_pack_width = w; } - + inline int three_body_neighbor() { return _three_body_neighbor; } + inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; } + inline int need_zero(const int tid) { if (_need_reduce == 0 && tid > 0) return 1; return 0; } - inline void set_reduce_flag() { _need_reduce = 1; } + inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; } inline int lrt() { if (force->kspace_match("pppm/intel", 0)) return _lrt; else return 0; } + inline int pppm_table() { + if (force->kspace_match("pppm/intel", 0) || + force->kspace_match("pppm/disp/intel",0)) + return INTEL_P3M_TABLE; + else return 0; + } + protected: IntelBuffers<float,float> *_single_buffers; IntelBuffers<float,double> *_mixed_buffers; IntelBuffers<double,double> *_double_buffers; - int _precision_mode, _nthreads, _nbor_pack_width; + int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor; public: inline int* get_overflow_flag() { return _overflow_flag; } @@ -241,7 +250,10 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, } else { nlocal = atom->nlocal; nall = _host_nall; - minlocal = _host_min_local; + if (force->newton) + minlocal = _host_min_local; + else + minlocal = host_start_pair(); } return; } @@ -275,7 +287,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) @@ -303,7 +315,7 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && 
_nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) @@ -331,7 +343,7 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, _results_eatom = eatom; _results_vatom = vatom; #ifndef _LMP_INTEL_OFFLOAD - if (rflag != 2 && _nthreads > 1) _need_reduce = 1; + if (rflag != 2 && _nthreads > 1 && force->newton) _need_reduce = 1; #endif if (_overflow_flag[LMP_OVERFLOW]) diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp index 0fb02420b9..df13cd5d66 100644 --- a/src/USER-INTEL/improper_cvff_intel.cpp +++ b/src/USER-INTEL/improper_cvff_intel.cpp @@ -87,16 +87,16 @@ void ImproperCvffIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -108,7 +108,7 @@ void ImproperCvffIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void ImproperCvffIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -131,12 +131,9 @@ void ImproperCvffIntel::eval(const int vflag, const int nthreads = tc; acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeimproper = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeimproper = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -145,8 +142,12 @@ void ImproperCvffIntel::eval(const int vflag, reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -155,7 +156,17 @@ void ImproperCvffIntel::eval(const int vflag, const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seimproper = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) for (int n = nfrom; n < nto; n++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = improperlist[n].a; const int i2 = improperlist[n].b; const int i3 = improperlist[n].c; @@ -216,7 +227,7 @@ void ImproperCvffIntel::eval(const int vflag, flt_t c = (c0 + c1mag*c2mag) * s12; // error check - + #ifndef LMP_INTEL_USE_SIMDOFF_FIX if (c > PTOLERANCE || c < MTOLERANCE) { int me; MPI_Comm_rank(world,&me); @@ -238,6 +249,7 @@ void ImproperCvffIntel::eval(const int vflag, me,x[i4].x,x[i4].y,x[i4].z); } } + #endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -250,31 +262,36 @@ void ImproperCvffIntel::eval(const 
int vflag, const int m = fc.fc[type].multiplicity; flt_t p, pd; - if (m == 2) { - p = (flt_t)2.0*c*c; - pd = (flt_t)2.0*c; - } else if (m == 3) { - const flt_t rc2 = c*c; - p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; - pd = (flt_t)6.0*rc2 - (flt_t)1.5; - } else if (m == 4) { - const flt_t rc2 = c*c; - p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; - pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; - } else if (m == 6) { - const flt_t rc2 = c*c; - p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; - pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; - } else if (m == 1) { - p = c + (flt_t)1.0; - pd = (flt_t)0.5; - } else if (m == 5) { - const flt_t rc2 = c*c; - p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; - pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; - } else if (m == 0) { - p = (flt_t)2.0; - pd = (flt_t)0.0; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + #pragma simdoff + #endif + { + if (m == 2) { + p = (flt_t)2.0*c*c; + pd = (flt_t)2.0*c; + } else if (m == 3) { + const flt_t rc2 = c*c; + p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; + pd = (flt_t)6.0*rc2 - (flt_t)1.5; + } else if (m == 4) { + const flt_t rc2 = c*c; + p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; + pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; + } else if (m == 6) { + const flt_t rc2 = c*c; + p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; + pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; + } else if (m == 1) { + p = c + (flt_t)1.0; + pd = (flt_t)0.5; + } else if (m == 5) { + const flt_t rc2 = c*c; + p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; + pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; + } else if (m == 0) { + p = (flt_t)2.0; + pd = (flt_t)0.0; + } } if (fc.fc[type].sign == -1) { @@ -317,46 +334,63 @@ void ImproperCvffIntel::eval(const int vflag, // apply force to each of 4 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; - } + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; - } + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } - if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } } - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n - } // omp 
parallel - if (EVFLAG) { - if (EFLAG) - energy += oeimproper; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + #ifdef LMP_INTEL_USE_SIMDOFF_FIX + if (EFLAG) oeimproper += seimproper; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif + } // omp parallel + if (EFLAG) energy += oeimproper; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp index 071ff548ea..cc854091f5 100644 --- a/src/USER-INTEL/improper_harmonic_intel.cpp +++ b/src/USER-INTEL/improper_harmonic_intel.cpp @@ -88,16 +88,16 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag, else evflag = 0; if (evflag) { - if (eflag) { + if (vflag && !eflag) { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,0,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,0,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -109,7 +109,7 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> +template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> void ImproperHarmonicIntel::eval(const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc) @@ -132,12 +132,9 @@ void ImproperHarmonicIntel::eval(const int vflag, const int nthreads = tc; acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - if (EFLAG) - oeimproper = (acc_t)0.0; - if (vflag) { - ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oeimproper = (acc_t)0.0; + if (VFLAG && vflag) { + ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; } #if defined(_OPENMP) @@ -146,8 +143,12 @@ void ImproperHarmonicIntel::eval(const int vflag, reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int nfrom, nto, tid; + int nfrom, npl, nto, tid; + #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads); + #else + IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads); + #endif FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) @@ -156,7 +157,17 @@ void ImproperHarmonicIntel::eval(const int vflag, const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; + #ifdef LMP_INTEL_USE_SIMDOFF + acc_t seimproper, sv0, sv1, sv2, sv3, sv4, sv5; + if (EFLAG) seimproper = (acc_t)0.0; + if (VFLAG && vflag) { + sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; + } + #pragma simd reduction(+:seimproper, sv0, sv1, sv2, sv3, sv4, sv5) for (int n = nfrom; n < nto; n++) { + #else + for (int n = nfrom; n < nto; n += npl) { + #endif const int i1 = improperlist[n].a; const int i2 = improperlist[n].b; const int i3 = improperlist[n].c; @@ -207,7 +218,7 @@ void ImproperHarmonicIntel::eval(const int vflag, flt_t c = (c1*c2 + c0) * s12; // error check - + #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { int me; MPI_Comm_rank(world,&me); @@ -229,6 +240,7 @@ void ImproperHarmonicIntel::eval(const int vflag, me,x[i4].x,x[i4].y,x[i4].z); } } + 
#endif if (c > (flt_t)1.0) c = (flt_t)1.0; if (c < (flt_t)-1.0) c = (flt_t)-1.0; @@ -278,46 +290,63 @@ void ImproperHarmonicIntel::eval(const int vflag, // apply force to each of 4 atoms - if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; - } + #ifdef LMP_INTEL_USE_SIMDOFF + #pragma simdoff + #endif + { + if (NEWTON_BOND || i1 < nlocal) { + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; + } - if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; - } + if (NEWTON_BOND || i2 < nlocal) { + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; + } - if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; - } + if (NEWTON_BOND || i3 < nlocal) { + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; + } - if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + if (NEWTON_BOND || i4 < nlocal) { + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; + } } - if (EVFLAG) { - IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, - vb3z, oeimproper, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); + if (EFLAG || VFLAG) { + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + nlocal, sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + nlocal, ov0, ov1, ov2, ov3, ov4, ov5); + #endif } } // for n - } // omp parallel - if (EVFLAG) { - if (EFLAG) - energy += oeimproper; - if (vflag) { - virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + #ifdef LMP_INTEL_USE_SIMDOFF + if (EFLAG) oeimproper += seimproper; + if (VFLAG && vflag) { + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } + #endif + } // omp parallel + if (EFLAG) energy += oeimproper; + if (VFLAG && vflag) { + virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index c81dffec83..bacc8a8bad 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -12,6 +12,7 @@ Contributing author: W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ +#include <math.h> #include "intel_buffers.h" #include "force.h" #include "memory.h" @@ -28,6 +29,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) : _ntypes = 0; _off_map_listlocal = 0; _ccachex = 0; + _ncache_alloc = 0; #ifdef _LMP_INTEL_OFFLOAD _separate_buffers = 0; _off_f = 0; @@ -36,6 +38,7 @@ IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) : _off_list_alloc = false; _off_threads = 0; _off_ccache = 0; + _off_ncache = 0; _host_nmax = 0; #endif } @@ -111,15 +114,20 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, _buf_local_size = _buf_size; else _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1; - if (lmp->atom->torque) - _buf_local_size *= 2; const int f_stride = get_stride(_buf_local_size); lmp->memory->create(_x, _buf_size,"intel_x"); if (lmp->atom->q != NULL) lmp->memory->create(_q, _buf_size, "intel_q"); if (lmp->atom->ellipsoid != NULL) lmp->memory->create(_quat, _buf_size, "intel_quat"); - lmp->memory->create(_f, f_stride * nthreads, "intel_f"); + #ifdef _LMP_INTEL_OFFLOAD + if (lmp->force->newton_pair) + #else + if (lmp->force->newton_pair || lmp->atom->molecular) + #endif + lmp->memory->create(_f, f_stride * nthreads, "intel_f"); + else + lmp->memory->create(_f, f_stride, "intel_f"); #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { @@ -131,7 +139,10 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, } if (offload_end > 0) { - lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f"); + int fm; + if (lmp->force->newton_pair) fm = _off_threads; + else fm = 1; + lmp->memory->create(_off_f, f_stride * fm, "intel_off_f"); const atom_t * const x = get_x(); const flt_t * const q = get_q(); const vec3_acc_t * f_start = get_off_f(); @@ -140,14 +151,14 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \ - nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\ + nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } else { if (x != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \ - nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\ + nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } @@ -427,6 +438,115 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag, /* ---------------------------------------------------------------------- */ +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::free_ncache() +{ + if (_ncache_alloc) { + flt_t *ncachex = _ncachex; + flt_t *ncachey = _ncachey; + flt_t *ncachez = _ncachez; + int *ncachej = _ncachej; + int *ncachejtype = _ncachejtype; + + #ifdef _LMP_INTEL_OFFLOAD + if (_off_ncache) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachex,ncachey,ncachez,ncachej:alloc_if(0) free_if(1)) \ + nocopy(ncachejtype:alloc_if(0) free_if(1)) + } + _off_ncache = 0; + #endif + + lmp->memory->destroy(ncachex); + lmp->memory->destroy(ncachey); + lmp->memory->destroy(ncachez); + lmp->memory->destroy(ncachej); + lmp->memory->destroy(ncachejtype); + + _ncache_alloc = 0; + } +} + +/* 
---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, + const int nthreads) +{ + const int nsize = get_max_nbors() * 3; + int esize = MIN(sizeof(int), sizeof(flt_t)); + IP_PRE_get_stride(_ncache_stride, nsize, esize, 0); + int nt = MAX(nthreads, _off_threads); + const int vsize = _ncache_stride * nt; + + if (_ncache_alloc) { + if (vsize > _ncache_alloc) + free_ncache(); + #ifdef _LMP_INTEL_OFFLOAD + else if (off_flag && _off_ncache == 0) + free_ncache(); + #endif + else + return; + } + + lmp->memory->create(_ncachex, vsize, "_ncachex"); + lmp->memory->create(_ncachey, vsize, "_ncachey"); + lmp->memory->create(_ncachez, vsize, "_ncachez"); + lmp->memory->create(_ncachej, vsize, "_ncachej"); + lmp->memory->create(_ncachejtype, vsize, "_ncachejtype"); + + _ncache_alloc = vsize; + + #ifdef _LMP_INTEL_OFFLOAD + if (off_flag) { + flt_t *ncachex = _ncachex; + flt_t *ncachey = _ncachey; + flt_t *ncachez = _ncachez; + int *ncachej = _ncachej; + int *ncachejtype = _ncachejtype; + + if (ncachex != NULL && ncachey !=NULL && ncachez != NULL && + ncachej != NULL && ncachejtype != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \ + nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ + nocopy(ncachejtype:length(vsize) alloc_if(1) free_if(0)) + } + _off_ncache = 1; + } + #endif +} + +/* ---------------------------------------------------------------------- */ + +#ifndef _LMP_INTEL_OFFLOAD +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, + const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) +{ + IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0, + ov1, ov2, ov3, ov4, ov5); +} +#endif + +/* ---------------------------------------------------------------------- */ + +#ifndef _LMP_INTEL_OFFLOAD +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, + const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) +{ + int iifrom, iito, tid; + IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2, + ov0, ov1, ov2, ov3, ov4, ov5); +} +#endif + +/* ---------------------------------------------------------------------- */ + template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes) { diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 3462d013a1..9b73a65f60 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -78,6 +78,7 @@ class IntelBuffers { free_nbor_list(); free_nmax(); free_list_local(); + free_ncache(); } inline void grow_list(NeighList *list, const int nlocal, const int nthreads, @@ -106,6 +107,15 @@ class IntelBuffers { inline acc_t * get_ccachef() { return _ccachef; } #endif + void free_ncache(); + void grow_ncache(const int off_flag, const int nthreads); + inline int ncache_stride() { return _ncache_stride; } + inline flt_t * get_ncachex() { return _ncachex; } + inline flt_t * get_ncachey() { return _ncachey; } + inline flt_t * get_ncachez() { return _ncachez; } + inline int * get_ncachej() { return _ncachej; } + inline int * get_ncachejtype() { return _ncachejtype; } + inline int get_max_nbors() { int mn = lmp->neighbor->oneatom * sizeof(int) / 
(INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN); @@ -180,6 +190,15 @@ class IntelBuffers { } } + #ifndef _LMP_INTEL_OFFLOAD + void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, + const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5); + void fdotr_reduce(const int nall, const int nthreads, const int f_stride, + acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, + acc_t &ov4, acc_t &ov5); + #endif + #ifdef _LMP_INTEL_OFFLOAD inline void thr_pack_cop(const int ifrom, const int ito, const int offset, const bool dotype = false) { @@ -263,6 +282,10 @@ class IntelBuffers { int _ccache_stride; flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew; int *_ccachei, *_ccachej; + + int _ncache_stride, _ncache_alloc; + flt_t *_ncachex, *_ncachey, *_ncachez; + int *_ncachej, *_ncachejtype; #ifdef LMP_USE_AVXCD int _ccache_stride3; acc_t * _ccachef; @@ -274,7 +297,7 @@ class IntelBuffers { flt_t *_host_q; quat_t *_host_quat; vec3_acc_t *_off_f; - int _off_map_nmax, _cop, _off_ccache; + int _off_map_nmax, _cop, _off_ccache, _off_ncache; int *_off_map_ilist; int *_off_map_special, *_off_map_nspecial, *_off_map_tag; int *_off_map_numneigh; diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index ad07dfd7c2..93787cd6c8 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -17,6 +17,9 @@ #ifdef __INTEL_COMPILER #define LMP_SIMD_COMPILER +#if (__INTEL_COMPILER_BUILD_DATE > 20160720) +#define LMP_INTEL_USE_SIMDOFF +#endif #endif #ifdef __INTEL_OFFLOAD @@ -65,7 +68,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_MAX_STENCIL 256 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL) #define INTEL_MAX_STENCIL_CHECK 4096 -#define INTEL_P3M_MAXORDER 5 +#define INTEL_P3M_MAXORDER 7 +#define INTEL_P3M_ALIGNED_MAXORDER 8 +// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY) +#define INTEL_P3M_TABLE 1 #ifdef __INTEL_COMPILER #ifdef __AVX__ @@ -87,7 +93,12 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #ifdef __MIC__ #define INTEL_V512 1 #define INTEL_VMASK 1 +#define INTEL_HTHREADS 4 +#endif #endif + +#ifdef __AVX512ER__ +#define INTEL_HTHREADS 4 #endif #ifdef __AVX512CD__ @@ -96,15 +107,22 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #endif #endif +#ifdef __MIC__ +#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH +#else +#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH +#endif + #else #undef INTEL_VECTOR_WIDTH #define INTEL_VECTOR_WIDTH 1 +#define INTEL_COMPILE_WIDTH 1 #endif #define INTEL_DATA_ALIGN 64 -#define INTEL_ONEATOM_FACTOR 2 +#define INTEL_ONEATOM_FACTOR 1 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH #define INTEL_LB_MEAN_WEIGHT 0.1 @@ -112,6 +130,10 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_MAX_HOST_CORE_COUNT 512 #define INTEL_MAX_COI_CORES 36 +#ifndef INTEL_HTHREADS +#define INTEL_HTHREADS 2 +#endif + #define IP_PRE_get_stride(stride, n, datasize, torque) \ { \ int blength = n; \ @@ -125,9 +147,17 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ { \ - const int idelta = 1 + inum/nthreads; \ + int idelta = inum/nthreads; \ + const int imod = inum % nthreads; \ ifrom = tid * idelta; \ - ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; \ + ito = ifrom + idelta; \ + if (tid < imod) { \ + ito+=tid+1; \ + ifrom+=tid; \ + } else { \ + ito+=imod; \ + ifrom+=imod; \ + } \ } #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ @@ -136,12 +166,37 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \ } +#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \ + { \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid; \ + ito = inum; \ + ip = nthr; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ + IP_PRE_omp_range(ifrom, ito, td, inum, nd); \ + ifrom += tm; \ + ip = INTEL_HTHREADS; \ + } else { \ + IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \ + ip = 1; \ + } \ + } + +#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + tid = omp_get_thread_num(); \ + IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \ + } + #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ int chunk_size = INTEL_DATA_ALIGN / datasize; \ - int idelta = static_cast<int>(static_cast<float>(inum) \ - /chunk_size/nthreads) + 1; \ + int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ + /chunk_size/nthreads)); \ idelta *= chunk_size; \ ifrom = tid*idelta; \ ito = ifrom + idelta; \ @@ -168,6 +223,29 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, if (ito > inum) ito = inum; \ } +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = omp_get_thread_num(); \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid*vecsize; \ + ito = inum; \ + ip = nthr*vecsize; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ + IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd, \ + vecsize); \ + ifrom += tm * vecsize; \ + ip = INTEL_HTHREADS * vecsize; \ + } else { \ + IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr, \ + vecsize); \ + ip = vecsize; \ + } \ + } + #else #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ @@ -183,6 +261,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, ito = inum; \ } +#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ + } + +#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ + } + #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ @@ -202,14 +295,215 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, nthreads, vecsize) \ { \ tid = 0; \ - int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ - /vecsize)); \ ifrom = 0; \ ito = inum; \ } +#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = vecsize; \ + } + #endif +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \ + int vwidth; \ + if (sizeof(acc_t) == sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("vector 
aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if (nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + 
_use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ +} + +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == 2) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == 2) { \ + int nt_min = MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ +} + #ifdef _LMP_INTEL_OFFLOAD #include <sys/time.h> @@ -229,17 +523,19 @@ inline double MIC_Wtime() { if (fix->separate_buffers() && ago != 0) { \ fix->start_watch(TIME_PACK); \ if (offload) { \ - _use_omp_pragma("omp parallel default(none) shared(buffers,nlocal,nall)") \ + int packthreads; \ + if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\ + else packthreads = 1; \ + _use_omp_pragma("omp parallel if(packthreads > 1)") \ { \ int ifrom, ito, tid; \ - int nthreads = comm->nthreads; \ IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \ - nthreads, sizeof(flt_t)); \ + packthreads, sizeof(flt_t)); \ buffers->thr_pack_cop(ifrom, ito, 0); \ int nghost = nall - nlocal; \ if (nghost) { \ IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \ - nthreads, sizeof(flt_t)); \ + packthreads, sizeof(flt_t)); \ buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \ fix->offload_min_ghost() - nlocal, \ ago == 1); \ @@ -254,7 +550,7 @@ inline double MIC_Wtime() { } \ } -#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \ +#define IP_PRE_get_transfern(ago, newton, eflag, 
vflag, \ buffers, offload, fix, separate_flag, \ x_size, q_size, ev_size, f_stride) \ { \ @@ -276,17 +572,12 @@ inline double MIC_Wtime() { q_size = 0; \ } \ ev_size = 0; \ - if (evflag) { \ - if (eflag) ev_size = 2; \ - if (vflag) ev_size = 8; \ - } \ - int f_length; \ + if (eflag) ev_size = 2; \ + if (vflag) ev_size = 8; \ if (newton) \ - f_length = nall; \ + f_stride = buffers->get_stride(nall); \ else \ - f_length = nlocal; \ - f_length -= minlocal; \ - f_stride = buffers->get_stride(f_length); \ + f_stride = buffers->get_stride(inum); \ } #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ @@ -337,6 +628,20 @@ inline double MIC_Wtime() { } \ } +#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + _use_omp_pragma("omp barrier"); \ + IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, x, offload, vflag, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ +} + +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) #else @@ -344,7 +649,7 @@ inline double MIC_Wtime() { #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ nlocal, nall) -#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, \ +#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ buffers, offload, fix, separate_flag, \ x_size, q_size, ev_size, f_stride) \ { \ @@ -369,18 +674,54 @@ inline double MIC_Wtime() { #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ f_stride, x, q) +#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads > INTEL_HTHREADS) { \ + _use_omp_pragma("omp barrier"); \ + buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ + } \ +} + +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \ + int lt = nall * 4; \ + buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \ + ov2, ov3, ov4, ov5); \ + } \ + } \ +} #endif -#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz) \ +#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \ { \ if (vflag == 1) { \ - sv0 += ev_pre * delx * delx * fpair; \ - sv1 += ev_pre * dely * dely * fpair; \ - sv2 += ev_pre * delz * delz * fpair; \ - sv3 += ev_pre * delx * dely * fpair; \ - sv4 += ev_pre * delx * delz * fpair; \ - sv5 += ev_pre * dely * delz * fpair; \ + sv0 += delx * delx * fpair; \ + sv1 += dely * dely * fpair; \ + sv2 += delz * delz * fpair; \ + sv3 += delx * dely * fpair; \ + sv4 += delx * delz * fpair; \ + sv5 += dely * delz * fpair; \ + } \ +} + +#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \ +{ \ + if (vflag == 1) { \ + sv0 += dx * fpx; \ + sv1 += dy * fpy; \ + sv2 += dz * fpz; \ + sv3 += dx * fpy; \ + sv4 += dx * fpz; \ + sv5 += dy * fpz; \ } \ } @@ -408,9 +749,10 @@ inline double MIC_Wtime() { } \ } -#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond, \ - delx, dely, delz, obond, force, newton, \ - nlocal, ov0, ov1, ov2, ov3, ov4, ov5) \ +#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \ + fbond, delx, dely, delz, obond, force, \ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ flt_t ev_pre; \ if (newton) ev_pre = (flt_t)1.0; \ 
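[Editorial annotation, not part of the patch: the IP_PRE_fdotr_reduce_omp / IP_PRE_fdotr_reduce macros introduced above fold the per-thread force copies (spaced f_stride apart) back into thread 0's copy and, when vflag == 2, accumulate the six virial components as a force-dot-position sum; the heavily unrolled IP_PRE_fdotr_acc_force_l5 loop computes the same quantities per SIMD lane. A minimal scalar sketch of that reduction, with illustrative names and none of the SIMD pragmas or vector-width unrolling:

    #include <cstddef>

    // Scalar sketch: forces and positions are packed x,y,z,w per atom;
    // per-thread force copies begin "stride" values apart.
    void fdotr_reduce_sketch(double *f, const double *x, int nall,
                             int nthreads, std::size_t stride,
                             double ov[6]) {
      for (int t = 1; t < nthreads; t++)     // fold thread copies into copy 0
        for (int n = 0; n < nall * 4; n++)
          f[n] += f[t * stride + n];
      for (int i = 0; i < nall; i++) {       // virial: sum over atoms of r*f
        const double *fi = f + i * 4;
        const double *xi = x + i * 4;
        ov[0] += fi[0] * xi[0];              // xx
        ov[1] += fi[1] * xi[1];              // yy
        ov[2] += fi[2] * xi[2];              // zz
        ov[3] += fi[1] * xi[0];              // xy
        ov[4] += fi[2] * xi[0];              // xz
        ov[5] += fi[2] * xi[1];              // yz
      }
    }
]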
@@ -421,7 +763,7 @@ inline double MIC_Wtime() { } \ \ if (eflag) { \ - oebond += ev_pre * ebond; \ + obond += ev_pre * ebond; \ if (eatom) { \ flt_t halfeng = ebond * (flt_t)0.5; \ if (newton || i1 < nlocal) f[i1].w += halfeng; \ @@ -429,7 +771,7 @@ inline double MIC_Wtime() { } \ } \ \ - if (vflag) { \ + if (VFLAG && vflag) { \ ov0 += ev_pre * (delx * delx * fbond); \ ov1 += ev_pre * (dely * dely * fbond); \ ov2 += ev_pre * (delz * delz * fbond); \ @@ -439,9 +781,9 @@ inline double MIC_Wtime() { } \ } -#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, \ - f1x, f1y, f1z, f3x, f3y, f3z, delx1, \ - dely1, delz1, delx2, dely2, delz2, \ +#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \ + i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \ + delx1, dely1, delz1, delx2, dely2, delz2, \ oeangle, force, newton, nlocal, ov0, ov1, \ ov2, ov3, ov4, ov5) \ { \ @@ -464,20 +806,20 @@ inline double MIC_Wtime() { } \ } \ \ - if (vflag) { \ - ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ - ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \ - ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ - ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \ - ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \ - ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \ + if (VFLAG && vflag) { \ + ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ + ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \ + ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ + ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \ + ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \ + ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \ } \ } -#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\ - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, \ - f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, \ - vb3x, vb3y, vb3z,oedihedral, force, \ +#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \ + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\ + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \ + vb2z, vb3x, vb3y, vb3z, oedihedral, force,\ newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ ov5) \ { \ @@ -502,7 +844,7 @@ inline double MIC_Wtime() { } \ } \ \ - if (vflag) { \ + if (VFLAG && vflag) { \ ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ @@ -512,96 +854,36 @@ inline double MIC_Wtime() { } \ } -#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp) \ +#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \ { \ - if (evflag) { \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - } \ - if (vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ } \ -} - -#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp) \ -{ \ - if (evflag) { \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - oecoul += secoul; \ - } \ - if (vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ } \ } -#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom, \ - nall, nlocal, minlocal, nthreads, \ - f_start, f_stride, x, offload) \ +#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \ { \ - int o_range; \ - if (newton) \ - o_range = nall; \ - else \ - o_range = nlocal; \ - if (offload == 
0) o_range -= minlocal; \ - IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, \ - sizeof(acc_t)); \ - \ - int t_off = f_stride; \ - if (eflag && eatom) { \ - for (int t = 1; t < nthreads; t++) { \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - f_start[n].x += f_start[n + t_off].x; \ - f_start[n].y += f_start[n + t_off].y; \ - f_start[n].z += f_start[n + t_off].z; \ - f_start[n].w += f_start[n + t_off].w; \ - } \ - t_off += f_stride; \ - } \ - } else { \ - for (int t = 1; t < nthreads; t++) { \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - f_start[n].x += f_start[n + t_off].x; \ - f_start[n].y += f_start[n + t_off].y; \ - f_start[n].z += f_start[n + t_off].z; \ - } \ - t_off += f_stride; \ - } \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ + oecoul += secoul; \ } \ - \ - if (evflag) { \ - if (vflag == 2) { \ - const ATOM_T * _noalias const xo = x + minlocal; \ - _use_simd_pragma("vector nontemporal") \ - _use_simd_pragma("novector") \ - for (int n = iifrom; n < iito; n++) { \ - ov0 += f_start[n].x * xo[n].x; \ - ov1 += f_start[n].y * xo[n].y; \ - ov2 += f_start[n].z * xo[n].z; \ - ov3 += f_start[n].y * xo[n].x; \ - ov4 += f_start[n].z * xo[n].x; \ - ov5 += f_start[n].z * xo[n].y; \ - } \ - } \ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ } \ } diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h index ac13f1edfd..aa03a6f136 100644 --- a/src/USER-INTEL/intel_simd.h +++ b/src/USER-INTEL/intel_simd.h @@ -1778,7 +1778,7 @@ namespace ip_simd { inline void SIMD_iforce_update(const SIMD_mask &m, float *force, const SIMD_int &i, const SIMD_float &fx, const SIMD_float &fy, const SIMD_float &fz, - const int EVFLAG, const int eatom, + const int EFLAG, const int eatom, const SIMD_float &fwtmp) { SIMD_float jfrc; jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, @@ -1793,7 +1793,7 @@ namespace ip_simd { _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); - if (EVFLAG) { + if (EFLAG) { if (eatom) { jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, _MM_SCALE_1); @@ -1806,7 +1806,7 @@ namespace ip_simd { inline void SIMD_iforce_update(const SIMD_mask &m, double *force, const SIMD_int &i, const SIMD_double &fx, const SIMD_double &fy, const SIMD_double &fz, - const int EVFLAG, const int eatom, + const int EFLAG, const int eatom, const SIMD_double &fwtmp) { SIMD_double jfrc; jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, @@ -1821,7 +1821,7 @@ namespace ip_simd { _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); - if (EVFLAG) { + if (EFLAG) { if (eatom) { jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 3, _MM_SCALE_2); diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index c3335b2c26..bff3d53636 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -55,7 +55,7 @@ NBinIntel::~NBinIntel() { nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif -} +} /* ---------------------------------------------------------------------- setup for bin_atoms() @@ -71,7 +71,7 @@ void NBinIntel::bin_atoms_setup(int nall) if (_offload_alloc) { const int * binhead = this->binhead; #pragma offload_transfer 
target(mic:_cop) \ - nocopy(binhead:alloc_if(0) free_if(1)) + nocopy(binhead:alloc_if(0) free_if(1)) } #endif @@ -99,7 +99,7 @@ void NBinIntel::bin_atoms_setup(int nall) const int * _atombin = this->_atombin; const int * _binpacked = this->_binpacked; #pragma offload_transfer target(mic:_cop) \ - nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) + nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif memory->destroy(bins); @@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { const flt_t dx = (INTEL_BIGP - bboxhi[0]); const flt_t dy = (INTEL_BIGP - bboxhi[1]); const flt_t dz = (INTEL_BIGP - bboxhi[2]); - if (dx * dx + dy * dy + dz * dz < - static_cast<flt_t>(neighbor->cutneighmaxsq)) + if (dx * dx + dy * dy + dz * dz < + static_cast<flt_t>(neighbor->cutneighmaxsq)) error->one(FLERR, - "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); + "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); } // ---------- Grow and cast/pack buffers ------------- @@ -174,14 +174,16 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { biga.w = 1; buffers->get_x()[nall] = biga; - const int nthreads = comm->nthreads; + int nthreads; + if (comm->nthreads > INTEL_HTHREADS) nthreads = comm->nthreads; + else nthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(buffers) + #pragma omp parallel if(nthreads > INTEL_HTHREADS) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, - sizeof(ATOM_T)); + sizeof(ATOM_T)); buffers->thr_pack(ifrom, ito, 0); } _fix->stop_watch(TIME_PACK); diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp index 7e0d2abdcb..ae4f599176 100644 --- a/src/USER-INTEL/npair_full_bin_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_intel.cpp @@ -70,483 +70,62 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { #endif buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end, - _fix->nbor_pack_width()); + _fix->nbor_pack_width()); int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD - if (need_ic) { - if (offload_noghost) { - fbi<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end); - fbi<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end); + if (_fix->three_body_neighbor()) { + if (need_ic) { + if (offload_noghost) { + bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal); + } } else { - fbi<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end); - fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); + if (offload_noghost) { + bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal); + } } } else { - if (offload_noghost) { - fbi<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end); - fbi<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end); + if (need_ic) { + if (offload_noghost) { + bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end); + 
bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal); + } } else { - fbi<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end); - fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); + if (offload_noghost) { + bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end); + } else { + bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal); + } } } #else - if (need_ic) - fbi<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); - else - fbi<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); - #endif -} - -template <class flt_t, class acc_t, int offload_noghost, int need_ic> -void NPairFullBinIntel:: -fbi(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers, - const int astart, const int aend, const int offload_end) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - int nall_t = nall; - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - #endif - - const int pack_width = _fix->nbor_pack_width(); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const 
int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width) \ - in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - - IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); - ifrom += astart; - ito += astart; - int e_ito = ito; - if (ito == num) { - int imod = ito % pack_width; - if (imod) e_ito += pack_width - imod; - } - const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - int which; - int pack_offset = maxnbors * pack_width; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = pack_offset + maxnbors * 2; - - int max_chunk = 0; - int lane = 0; - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - const int itype = x[i].w; - const tagint itag = tag[i]; - const int ioffset = ntypes * itype; - - const int ibin = atombin[i]; - int raw_count = pack_offset; - - // loop over all atoms in surrounding bins in stencil including self - // skip i = j - if (exclude) { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - int j = binpacked[jj]; - - if (i == j) j=e_nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - #ifndef 
_LMP_INTEL_OFFLOAD - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - neighptr[raw_count++] = j; - } - } - } else { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - int j = binpacked[jj]; - - if (i == j) j=e_nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - } - - if (raw_count > obound) *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = pack_offset; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) { - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - #ifdef _LMP_INTEL_OFFLOAD - lmin = MIN(lmin,vlmin); - gmin = MIN(gmin,vgmin); - lmax = MAX(lmax,vlmax); - gmax = MAX(gmax,vgmax); - #endif - - int n = lane, n2 = pack_offset; - for (int u = pack_offset; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - const int jtag = tag[pj]; - int flist = 0; - if (itag > jtag) { - if ((itag+jtag) % 2 == 0) flist = 1; - } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) flist = 1; - } else { - if (x[pj].z < ztmp) flist = 1; - else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1; - else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp) - flist = 1; - } - if (flist) { - neighptr[n2++] = j; - } else { - neighptr[n] = j; - n += pack_width; - } - } - } - int ns = (n - lane) / pack_width; - atombin[i] = ns; - for (int u = pack_offset; u < n2; u++) { - neighptr[n] = neighptr[u]; - n += pack_width; - } - - ilist[i] = i; - cnumneigh[i] = ct + lane; - ns += n2 - pack_offset; - numneigh[i] = ns; - - if (ns > max_chunk) max_chunk = ns; - lane++; - if (lane == pack_width) { - ct += max_chunk * pack_width; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - max_chunk = 0; - pack_offset = maxnbors * pack_width; - lane = 0; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < 
overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - const int trip = jnum * pack_width; - for (int jj = 0; jj < trip; jj+=pack_width) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) { - if (jlist[jj] >= nlocal) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - } - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } + if (_fix->three_body_neighbor()) { + if (need_ic) + bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal); + else + bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal); } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } + if (need_ic) + bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal); + else + bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal); } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; #endif } diff --git a/src/USER-INTEL/npair_full_bin_intel.h b/src/USER-INTEL/npair_full_bin_intel.h index f1be71abbc..83f2c3cd4c 100644 --- a/src/USER-INTEL/npair_full_bin_intel.h +++ b/src/USER-INTEL/npair_full_bin_intel.h @@ -15,7 +15,7 @@ NPairStyle(full/bin/intel, NPairFullBinIntel, - NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | + NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL) #else @@ -36,9 +36,6 @@ class NPairFullBinIntel : public NPairIntel { private: template <class flt_t, class acc_t> void fbi(NeighList *, IntelBuffers<flt_t,acc_t> *); - template <class flt_t, class acc_t, int, int> - void fbi(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int, - const int, const int 
offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp b/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp deleted file mode 100644 index 9a40e2a07c..0000000000 --- a/src/USER-INTEL/npair_half_bin_newtoff_intel.cpp +++ /dev/null @@ -1,451 +0,0 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing author: W. Michael Brown (Intel) -------------------------------------------------------------------------- */ - -#include "npair_half_bin_newtoff_intel.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "atom.h" -#include "comm.h" -#include "group.h" - -using namespace LAMMPS_NS; - -/* ---------------------------------------------------------------------- */ - -NPairHalfBinNewtoffIntel::NPairHalfBinNewtoffIntel(LAMMPS *lmp) : - NPairIntel(lmp) {} - -/* ---------------------------------------------------------------------- - binned neighbor list construction with partial Newton's 3rd law - each owned atom i checks own bin and other bins in stencil - pair stored once if i,j are both owned and i < j - pair stored by me if j is ghost (also stored by proc owning j) -------------------------------------------------------------------------- */ - -void NPairHalfBinNewtoffIntel::build(NeighList *list) -{ - if (nstencil > INTEL_MAX_STENCIL_CHECK) - error->all(FLERR, "Too many neighbor bins for USER-INTEL package."); - - #ifdef _LMP_INTEL_OFFLOAD - if (exclude) - error->all(FLERR, "Exclusion lists not yet supported for Intel offload"); - #endif - - if (_fix->precision() == FixIntel::PREC_MODE_MIXED) - hbnni(list, _fix->get_mixed_buffers()); - else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) - hbnni(list, _fix->get_double_buffers()); - else - hbnni(list, _fix->get_single_buffers()); - - _fix->stop_watch(TIME_HOST_NEIGHBOR); -} - -template <class flt_t, class acc_t> -void NPairHalfBinNewtoffIntel:: -hbnni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { - const int nlocal = (includegroup) ? 
atom->nfirst : atom->nlocal; - list->inum = nlocal; - - const int off_end = _fix->offload_end_neighbor(); - int host_start = off_end;; - - #ifdef _LMP_INTEL_OFFLOAD - if (off_end) grow_stencil(); - if (_fix->full_host_list()) host_start = 0; - #endif - - buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end); - - int need_ic = 0; - if (atom->molecular) - dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); - - #ifdef _LMP_INTEL_OFFLOAD - if (need_ic) { - hbnni<flt_t,acc_t,1>(1, list, buffers, 0, off_end); - hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); - } else { - hbnni<flt_t,acc_t,0>(1, list, buffers, 0, off_end); - hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal); - } - #else - if (need_ic) - hbnni<flt_t,acc_t,1>(0, list, buffers, host_start, nlocal); - else - hbnni<flt_t,acc_t,0>(0, list, buffers, host_start, nlocal); - #endif -} - -template <class flt_t, class acc_t, int need_ic> -void NPairHalfBinNewtoffIntel:: -hbnni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers, - const int astart, const int aend) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef 
_LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,pad_width,offload,nall) \ - in(separate_buffers, astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = nall, lmax = -1, gmin = nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - - int which; - - const int list_size = (ito + tid + 1) * maxnbors; - int ct = (ifrom + tid) * maxnbors; - int *neighptr = firstneigh + ct; - - for (int i = ifrom; i < ito; i++) { - int j, k, n, n2, itype, jtype, ibin; - double xtmp, ytmp, ztmp, delx, dely, delz, rsq; - - n = 0; - n2 = maxnbors; - - xtmp = x[i].x; - ytmp = x[i].y; - ztmp = x[i].z; - itype = x[i].w; - const int ioffset = ntypes*itype; - - // loop over all atoms in other bins in stencil including self - // only store pair if i < j - // stores own/own pairs only once - // stores own/ghost pairs on both procs - - ibin = atombin[i]; - - for (k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - if (j <= i) continue; - - jtype = x[j].w; - #ifndef _LMP_INTEL_OFFLOAD - if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - delx = xtmp - x[j].x; - dely = ytmp - x[j].y; - delz = ztmp - x[j].z; - rsq = delx * delx + dely * dely + delz * delz; - if (rsq <= cutneighsq[ioffset + jtype]) { - if (j < nlocal) { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, 
dely, delz); - if (no_special) - neighptr[n++] = -j - 1; - else - neighptr[n++] = j; - } else - neighptr[n++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < lmin) lmin = j; - if (j > lmax) lmax = j; - #endif - } else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[n2++] = -j - 1; - else - neighptr[n2++] = j; - } else - neighptr[n2++] = j; - #ifdef _LMP_INTEL_OFFLOAD - if (j < gmin) gmin = j; - if (j > gmax) gmax = j; - #endif - } - } - } - } - ilist[i] = i; - - cnumneigh[i] = ct; - if (n > maxnbors) *overflow = 1; - for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k]; - - const int edge = (n % pad_width); - if (edge) { - const int pad_end = n + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; n < pad_end; n++) - neighptr[n] = nall; - } - numneigh[i] = n; - while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++; - ct += n; - neighptr += n; - if (ct + n + maxnbors > list_size) { - *overflow = 1; - ct = (ifrom + tid) * maxnbors; - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - 
_fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - #endif -} diff --git a/src/USER-INTEL/npair_half_bin_newtoff_intel.h b/src/USER-INTEL/npair_half_bin_newtoff_intel.h deleted file mode 100644 index 49482f8b3e..0000000000 --- a/src/USER-INTEL/npair_half_bin_newtoff_intel.h +++ /dev/null @@ -1,52 +0,0 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifdef NPAIR_CLASS - -NPairStyle(half/bin/newtoff/intel, - NPairHalfBinNewtoffIntel, - NP_HALF | NP_BIN | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL) - -#else - -#ifndef LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H -#define LMP_NPAIR_HALF_BIN_NEWTOFF_INTEL_H - -#include "npair_intel.h" -#include "fix_intel.h" - -namespace LAMMPS_NS { - -class NPairHalfBinNewtoffIntel : public NPairIntel { - public: - NPairHalfBinNewtoffIntel(class LAMMPS *); - ~NPairHalfBinNewtoffIntel() {} - void build(class NeighList *); - - private: - template <class flt_t, class acc_t> - void hbnni(NeighList *, IntelBuffers<flt_t,acc_t> *); - template <class flt_t, class acc_t, int> - void hbnni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int, - const int); -}; - -} - -#endif -#endif - -/* ERROR/WARNING messages: - - -*/ diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp index 6313ab944f..e7d5995cc5 100644 --- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp @@ -26,7 +26,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : +NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : NPairIntel(lmp) {} /* ---------------------------------------------------------------------- @@ -75,536 +75,32 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { - hbni<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end); - hbni<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal, + off_end); } else { - hbni<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end); - hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); } } else { if (offload_noghost) { - hbni<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end); - 
hbni<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, + off_end); } else { - hbni<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end); - hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal); } } #else - if (need_ic) - hbni<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); + if (need_ic) + bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); else - hbni<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); - #endif -} - -template <class flt_t, class acc_t, int offload_noghost, int need_ic> -void NPairHalfBinNewtonIntel:: -hbni(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers, - const int astart, const int aend, const int offload_end) { - - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - int nall_t = nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias 
const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pad_width) \ - in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \ - in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - - #ifdef OUTER_CHUNK - const int swidth = ip_simd::SIMD_type<flt_t>::width(); - IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth); - ifrom += astart; - ito += astart; - int e_ito = ito; - if (ito == num) { - int imod = ito % swidth; - if (imod) e_ito += swidth - imod; - } - const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - #else - const int swidth = 1; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - const int list_size = (ito + tid * 2 + 2) * maxnbors; - #endif - - int which; - - int pack_offset = maxnbors * swidth; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = pack_offset + maxnbors * 2; - - int max_chunk = 0; - int lane = 0; - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - const int itype = x[i].w; - const int ioffset = ntypes * itype; - - // loop over rest of atoms in i's bin, ghosts are at end of linked list - // if j is owned atom, store it, since j is beyond i in linked list - // if j is ghost, only store if j coords are "above/to the right" of i - - int raw_count = pack_offset; - for (int j = bins[i]; j >= 0; j = bins[j]) { - if (j >= nlocal) { - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) continue; 
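- // [Editorial annotation, not in the original source: the coordinate
- // test below keeps a ghost neighbor j only when it lies "above/to the
- // right" of atom i -- larger z, with ties broken on y and then x -- so
- // an owned/ghost pair is stored by exactly one of the two processes
- // that see it. A scalar sketch of the skip test, with illustrative
- // names:
- //
- //   static bool skip_ghost(double xi, double yi, double zi,
- //                          double xj, double yj, double zj) {
- //     if (zj < zi) return true;                // strictly below i
- //     if (zj == zi) {
- //       if (yj < yi) return true;              // same z, behind in y
- //       if (yj == yi && xj < xi) return true;  // same z and y, left in x
- //     }
- //     return false;                            // otherwise store the pair
- //   }
- // ]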
- #endif - if (x[j].z < ztmp) continue; - if (x[j].z == ztmp) { - if (x[j].y < ytmp) continue; - if (x[j].y == ytmp && x[j].x < xtmp) continue; - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (offload_noghost && i < offload_end) continue; - #endif - - #ifndef _LMP_INTEL_OFFLOAD - if (exclude) { - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - } - #endif - - neighptr[raw_count++] = j; - } - - // loop over all atoms in other bins in stencil, store every pair - - const int ibin = atombin[i]; - if (exclude) { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - #ifndef _LMP_INTEL_OFFLOAD - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - #endif - - neighptr[raw_count++] = j; - } - } - } else { - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - #ifndef _LMP_INTEL_OFFLOAD - #ifdef INTEL_VMASK - #pragma simd - #endif - #endif - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - } - - if (raw_count > obound) *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = pack_offset; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) { - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - #ifdef _LMP_INTEL_OFFLOAD - lmin = MIN(lmin,vlmin); - gmin = MIN(gmin,vgmin); - lmax = MAX(lmax,vlmax); - gmax = MAX(gmax,vgmax); - #endif - - int n = lane, n2 = pack_offset; - for (int u = pack_offset; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - if (pj < nlocal) { - neighptr[n] = j; - n += swidth; - } else - neighptr[n2++] = j; - } - } - int ns = (n - lane) / swidth; - for (int u = pack_offset; u < n2; u++) { - neighptr[n] = neighptr[u]; - n += swidth; - } - - ilist[i] = i; - cnumneigh[i] = ct + lane; - ns += n2 - pack_offset; - #ifndef OUTER_CHUNK - int edge = (ns % pad_width); - if (edge) { - const int pad_end = ns + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; ns < pad_end; ns++) - 
neighptr[ns] = e_nall; - } - #endif - numneigh[i] = ns; - - #ifdef OUTER_CHUNK - if (ns > max_chunk) max_chunk = ns; - lane++; - if (lane == swidth) { - ct += max_chunk * swidth; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - max_chunk = 0; - pack_offset = maxnbors * swidth; - lane = 0; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - #else - ct += ns; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - #endif - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - #ifndef OUTER_CHUNK - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - #else - const int trip = jnum * swidth; - for (int jj = 0; jj < trip; jj+= swidth) { - #endif - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - 
_fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; + bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal); #endif } diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.h b/src/USER-INTEL/npair_half_bin_newton_intel.h index 9b5d0780a1..54a8e24135 100644 --- a/src/USER-INTEL/npair_half_bin_newton_intel.h +++ b/src/USER-INTEL/npair_half_bin_newton_intel.h @@ -36,9 +36,6 @@ class NPairHalfBinNewtonIntel : public NPairIntel { private: template <class flt_t, class acc_t> void hbni(NeighList *, IntelBuffers<flt_t,acc_t> *); - template <class flt_t, class acc_t, int, int> - void hbni(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int, - const int, const int offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp index 5f191e0797..3c36458f06 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp @@ -26,7 +26,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : +NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : NPairIntel(lmp) {} /* ---------------------------------------------------------------------- @@ -75,439 +75,32 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { - hbnti<flt_t,acc_t,1,1>(1, list, buffers, 0, off_end); - hbnti<flt_t,acc_t,1,1>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal, + off_end); } else { - hbnti<flt_t,acc_t,0,1>(1, list, buffers, 0, off_end); - hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal); } } else { if (offload_noghost) { - hbnti<flt_t,acc_t,1,0>(1, list, buffers, 0, off_end); - hbnti<flt_t,acc_t,1,0>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal, + off_end); } else { - hbnti<flt_t,acc_t,0,0>(1, list, buffers, 0, off_end); - hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal); } } #else if (need_ic) - hbnti<flt_t,acc_t,0,1>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal); else - hbnti<flt_t,acc_t,0,0>(0, list, buffers, host_start, nlocal); - #endif -} - -template <class flt_t, class acc_t, int offload_noghost, int need_ic> -void NPairHalfBinNewtonTriIntel:: -hbnti(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers, - const int astart, const int aend, const int offload_end) { - if (aend-astart == 0) return; - - const int nall = atom->nlocal + atom->nghost; - int pad = 1; - int 
nall_t = nall; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) nall_t = atom->nlocal; - if (offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else - #endif - if (INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; - - const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); - const int e_nall = nall_t; - - const int molecular = atom->molecular; - int *ns = NULL; - tagint *s = NULL; - int tag_size = 0, special_size; - if (buffers->need_tag()) tag_size = e_nall; - if (molecular) { - s = atom->special[0]; - ns = atom->nspecial[0]; - special_size = aend; - } else { - s = &buffers->_special_holder; - ns = &buffers->_nspecial_holder; - special_size = 0; - } - const tagint * _noalias const special = s; - const int * _noalias const nspecial = ns; - const int maxspecial = atom->maxspecial; - const tagint * _noalias const tag = atom->tag; - - int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int nstencil = this->nstencil; - const int * _noalias const stencil = this->stencil; - const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; - const int ntypes = atom->ntypes + 1; - const int nlocal = atom->nlocal; - - #ifndef _LMP_INTEL_OFFLOAD - int * const mask = atom->mask; - tagint * const molecule = atom->molecule; - #endif - - int tnum; - int *overflow; - double *timer_compute; - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - timer_compute = _fix->off_watch_neighbor(); - tnum = buffers->get_off_threads(); - overflow = _fix->get_off_overflow_flag(); - _fix->stop_watch(TIME_HOST_NEIGHBOR); - _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else - #endif - { - tnum = comm->nthreads; - overflow = _fix->get_overflow_flag(); - } - const int nthreads = tnum; - const int maxnbors = buffers->get_max_nbors(); - int * _noalias const atombin = buffers->get_atombin(); - const int * _noalias const binpacked = buffers->get_binpacked(); - - const int xperiodic = domain->xperiodic; - const int yperiodic = domain->yperiodic; - const int zperiodic = domain->zperiodic; - const flt_t xprd_half = domain->xprd_half; - const flt_t yprd_half = domain->yprd_half; - const flt_t zprd_half = domain->zprd_half; - - #ifdef _LMP_INTEL_OFFLOAD - const int * _noalias const binhead = this->binhead; - const int * _noalias const bins = this->bins; - const int cop = _fix->coprocessor_number(); - const int separate_buffers = _fix->separate_buffers(); - #pragma offload target(mic:cop) if(offload) \ - in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ - in(tag:length(tag_size) alloc_if(0) free_if(0)) \ - in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ - in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ - in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ - in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ - in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ - in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ - out(numneigh:length(0) alloc_if(0) free_if(0)) \ - in(ilist:length(0) alloc_if(0) free_if(0)) \ - in(atombin:length(aend) alloc_if(0) free_if(0)) \ - in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ - in(maxnbors,nthreads,maxspecial,nstencil,offload_end,pad_width,e_nall) \ - in(offload,separate_buffers, astart, aend, nlocal, molecular, ntypes) \ - 
in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \ - out(overflow:length(5) alloc_if(0) free_if(0)) \ - out(timer_compute:length(1) alloc_if(0) free_if(0)) \ - signal(tag) - #endif - { - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime(); - #endif - - #ifdef _LMP_INTEL_OFFLOAD - overflow[LMP_LOCAL_MIN] = astart; - overflow[LMP_LOCAL_MAX] = aend - 1; - overflow[LMP_GHOST_MIN] = e_nall; - overflow[LMP_GHOST_MAX] = -1; - #endif - - int nstencilp = 0; - int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; - for (int k = 0; k < nstencil; k++) { - binstart[nstencilp] = stencil[k]; - int end = stencil[k] + 1; - for (int kk = k + 1; kk < nstencil; kk++) { - if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; - } else break; - } - binend[nstencilp] = end; - nstencilp++; - } - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) - #endif - { - #ifdef _LMP_INTEL_OFFLOAD - int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; - #endif - - const int num = aend - astart; - int tid, ifrom, ito; - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); - ifrom += astart; - ito += astart; - - int which; - - const int list_size = (ito + tid * 2 + 2) * maxnbors; - int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; - const int obound = maxnbors * 3; - - for (int i = ifrom; i < ito; i++) { - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - const int itype = x[i].w; - const int ioffset = ntypes * itype; - - // loop over all atoms in bins in stencil - // pairs for atoms j "below" i are excluded - // below = lower z or (equal z and lower y) or (equal zy and lower x) - // (equal zyx and j <= i) - // latter excludes self-self interaction but allows superposed atoms - - const int ibin = atombin[i]; - - int raw_count = maxnbors; - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - - #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) continue; - } else if (offload) continue; - } - #endif - - if (x[j].z < ztmp) continue; - if (x[j].z == ztmp) { - if (x[j].y < ytmp) continue; - if (x[j].y == ytmp) { - if (x[j].x < xtmp) continue; - if (x[j].x == xtmp && j <= i) continue; - } - } - - #ifndef _LMP_INTEL_OFFLOAD - if (exclude) { - const int jtype = x[j].w; - if (exclusion(i,j,itype,jtype,mask,molecule)) continue; - } - #endif - - neighptr[raw_count++] = j; - } - } - if (raw_count > obound) - *overflow = 1; - - #if defined(LMP_SIMD_COMPILER) - #ifdef _LMP_INTEL_OFFLOAD - int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - #else - #pragma vector aligned - #pragma simd - #endif - #endif - for (int u = maxnbors; u < raw_count; u++) { - int j = neighptr[u]; - const flt_t delx = xtmp - x[j].x; - const flt_t dely = ytmp - x[j].y; - const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) - neighptr[u] = e_nall; - else { - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - neighptr[u] = -j - 1; - } - - #ifdef _LMP_INTEL_OFFLOAD - if (j < nlocal) 
{ - if (j < vlmin) vlmin = j; - if (j > vlmax) vlmax = j; - } else { - if (j < vgmin) vgmin = j; - if (j > vgmax) vgmax = j; - } - #endif - } - } - - int n = 0, n2 = maxnbors; - for (int u = maxnbors; u < raw_count; u++) { - const int j = neighptr[u]; - int pj = j; - if (pj < e_nall) { - if (need_ic) - if (pj < 0) pj = -pj - 1; - - if (pj < nlocal) - neighptr[n++] = j; - else - neighptr[n2++] = j; - } - } - int ns = n; - for (int u = maxnbors; u < n2; u++) - neighptr[n++] = neighptr[u]; - - ilist[i] = i; - cnumneigh[i] = ct; - ns += n2 - maxnbors; - - int edge = (ns % pad_width); - if (edge) { - const int pad_end = ns + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; ns < pad_end; ns++) - neighptr[ns] = e_nall; - } - numneigh[i] = ns; - - ct += ns; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - - if (*overflow == 1) - for (int i = ifrom; i < ito; i++) - numneigh[i] = 0; - - #ifdef _LMP_INTEL_OFFLOAD - if (separate_buffers) { - #if defined(_OPENMP) - #pragma omp critical - #endif - { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; - } - #pragma omp barrier - } - - int ghost_offset = 0, nall_offset = e_nall; - if (separate_buffers) { - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } - #endif - - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - for (jj = 0; jj < jnum; jj++) - if (jlist[jj] >= nlocal) break; - while (jj < jnum) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - jj++; - } - } - } - #endif - } // end omp - #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) - *timer_compute = MIC_Wtime() - *timer_compute; - #endif - } // end offload - - #ifdef _LMP_INTEL_OFFLOAD - if (offload) { - _fix->stop_watch(TIME_OFFLOAD_LATENCY); - _fix->start_watch(TIME_HOST_NEIGHBOR); - for (int n = 0; n < aend; n++) { - ilist[n] = n; - numneigh[n] = 0; - } - } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + 
cnumneigh[i]; - if (separate_buffers) { - _fix->start_watch(TIME_PACK); - _fix->set_neighbor_host_sizes(); - buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); - _fix->stop_watch(TIME_PACK); - } - } - #else - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; + bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal); #endif } diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h index d144c2fc52..7a7f4c8030 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.h +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.h @@ -36,9 +36,6 @@ class NPairHalfBinNewtonTriIntel : public NPairIntel { private: template <class flt_t, class acc_t> void hbnti(NeighList *, IntelBuffers<flt_t,acc_t> *); - template <class flt_t, class acc_t, int, int> - void hbnti(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int, - const int, const int offload_end = 0); }; } diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index c92ed88774..0412398796 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -48,6 +48,678 @@ NPairIntel::~NPairIntel() { /* ---------------------------------------------------------------------- */ +template <class flt_t, class acc_t, int offload_noghost, int need_ic, + int FULL, int TRI, int THREE> +void NPairIntel::bin_newton(const int offload, NeighList *list, + IntelBuffers<flt_t,acc_t> *buffers, + const int astart, const int aend, + const int offload_end) { + + if (aend-astart == 0) return; + + const int nall = atom->nlocal + atom->nghost; + int pad = 1; + int nall_t = nall; + + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost && offload) nall_t = atom->nlocal; + if (THREE == 0 && offload) { + if (INTEL_MIC_NBOR_PAD > 1) + pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); + } else + #endif + if (THREE == 0 && INTEL_NBOR_PAD > 1) + pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + const int pad_width = pad; + const int pack_width = _fix->nbor_pack_width(); + + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const firstneigh = buffers->firstneigh(list); + const int e_nall = nall_t; + + const int molecular = atom->molecular; + int *ns = NULL; + tagint *s = NULL; + int tag_size = 0, special_size; + if (buffers->need_tag()) tag_size = e_nall; + if (molecular) { + s = atom->special[0]; + ns = atom->nspecial[0]; + special_size = aend; + } else { + s = &buffers->_special_holder; + ns = &buffers->_nspecial_holder; + special_size = 0; + } + const tagint * _noalias const special = s; + const int * _noalias const nspecial = ns; + const int maxspecial = atom->maxspecial; + const tagint * _noalias const tag = atom->tag; + + int * _noalias const ilist = list->ilist; + int * _noalias numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(list); + const int nstencil = this->nstencil; + const int * _noalias const stencil = this->stencil; + const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; + const int ntypes = atom->ntypes + 1; + const int nlocal = atom->nlocal; + + #ifndef _LMP_INTEL_OFFLOAD + int * const mask = atom->mask; + tagint * const molecule = atom->molecule; + #endif + + int tnum; + int *overflow; + double *timer_compute; + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + timer_compute = _fix->off_watch_neighbor(); + tnum = buffers->get_off_threads(); + 
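+    // Descriptive note on what this branch does (grounded in the calls
+    // below): offloaded builds read timers and the overflow flag from the
+    // coprocessor side of the fix, and host neighbor timing is paused
+    // while the offload latency is measured.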
overflow = _fix->get_off_overflow_flag(); + _fix->stop_watch(TIME_HOST_NEIGHBOR); + _fix->start_watch(TIME_OFFLOAD_LATENCY); + } else + #endif + { + tnum = comm->nthreads; + overflow = _fix->get_overflow_flag(); + } + const int nthreads = tnum; + const int maxnbors = buffers->get_max_nbors(); + int * _noalias const atombin = buffers->get_atombin(); + const int * _noalias const binpacked = buffers->get_binpacked(); + + const int xperiodic = domain->xperiodic; + const int yperiodic = domain->yperiodic; + const int zperiodic = domain->zperiodic; + const flt_t xprd_half = domain->xprd_half; + const flt_t yprd_half = domain->yprd_half; + const flt_t zprd_half = domain->zprd_half; + + flt_t * _noalias const ncachex = buffers->get_ncachex(); + flt_t * _noalias const ncachey = buffers->get_ncachey(); + flt_t * _noalias const ncachez = buffers->get_ncachez(); + int * _noalias const ncachej = buffers->get_ncachej(); + int * _noalias const ncachejtype = buffers->get_ncachejtype(); + const int ncache_stride = buffers->ncache_stride(); + + #ifdef _LMP_INTEL_OFFLOAD + const int * _noalias const binhead = this->binhead; + const int * _noalias const bins = this->bins; + const int cop = _fix->coprocessor_number(); + const int separate_buffers = _fix->separate_buffers(); + #pragma offload target(mic:cop) if(offload) \ + in(x:length(e_nall+1) alloc_if(0) free_if(0)) \ + in(tag:length(tag_size) alloc_if(0) free_if(0)) \ + in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \ + in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \ + in(bins,binpacked:length(nall) alloc_if(0) free_if(0)) \ + in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ + in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ + in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + out(numneigh:length(0) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ + in(atombin:length(aend) alloc_if(0) free_if(0)) \ + in(stencil:length(nstencil) alloc_if(0) free_if(0)) \ + in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ + in(ncachejtype:length(0) alloc_if(0) free_if(0)) \ + in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ + in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \ + in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ + in(pack_width) \ + out(overflow:length(5) alloc_if(0) free_if(0)) \ + out(timer_compute:length(1) alloc_if(0) free_if(0)) \ + signal(tag) + #endif + { + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime(); + #endif + + #ifdef _LMP_INTEL_OFFLOAD + overflow[LMP_LOCAL_MIN] = astart; + overflow[LMP_LOCAL_MAX] = aend - 1; + overflow[LMP_GHOST_MIN] = e_nall; + overflow[LMP_GHOST_MAX] = -1; + #endif + + int nstencilp = 0; + int binstart[INTEL_MAX_STENCIL], binend[INTEL_MAX_STENCIL]; + for (int k = 0; k < nstencil; k++) { + binstart[nstencilp] = stencil[k]; + int end = stencil[k] + 1; + for (int kk = k + 1; kk < nstencil; kk++) { + if (stencil[kk-1]+1 == stencil[kk]) { + end++; + k++; + } else break; + } + binend[nstencilp] = end; + nstencilp++; + } + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(numneigh, overflow, nstencilp, binstart, binend) + #endif + { + #ifdef _LMP_INTEL_OFFLOAD + int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1; + #endif + + const int num = aend - astart; + int tid, ifrom, ito; + + if (THREE) { + IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); + } else 
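+      // For three-body (THREE) builds above, each thread's atom range is
+      // aligned to pack_width so neighbors can later be strided across SIMD
+      // lanes; the plain block decomposition below handles all other list
+      // types.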
{
+        IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      }
+      ifrom += astart;
+      ito += astart;
+      int e_ito = ito;
+      if (THREE && ito == num) {
+        int imod = ito % pack_width;
+        if (imod) e_ito += pack_width - imod;
+      }
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
+
+      int which;
+
+      int pack_offset = maxnbors;
+      if (THREE) pack_offset *= pack_width;
+      int ct = (ifrom + tid * 2) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;
+
+      const int toffs = tid * ncache_stride;
+      flt_t * _noalias const tx = ncachex + toffs;
+      flt_t * _noalias const ty = ncachey + toffs;
+      flt_t * _noalias const tz = ncachez + toffs;
+      int * _noalias const tj = ncachej + toffs;
+      int * _noalias const tjtype = ncachejtype + toffs;
+
+      flt_t * _noalias itx;
+      flt_t * _noalias ity;
+      flt_t * _noalias itz;
+      int * _noalias itj;
+      int * _noalias itjtype;
+
+      // loop over all atoms in other bins in stencil, store every pair
+      int istart, icount, ncount, oldbin = -9999999, lane, max_chunk;
+      if (THREE) {
+        lane = 0;
+        max_chunk = 0;
+      }
+      for (int i = ifrom; i < ito; i++) {
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
+        tagint itag;
+        if (THREE) itag = tag[i];
+        const int ioffset = ntypes * itype;
+
+        const int ibin = atombin[i];
+        if (ibin != oldbin) {
+          oldbin = ibin;
+          ncount = 0;
+          for (int k = 0; k < nstencilp; k++) {
+            const int bstart = binhead[ibin + binstart[k]];
+            const int bend = binhead[ibin + binend[k]];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma vector aligned
+            #pragma simd
+            #endif
+            for (int jj = bstart; jj < bend; jj++)
+              tj[ncount++] = binpacked[jj];
+          }
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma simd
+          #endif
+          for (int u = 0; u < ncount; u++) {
+            const int j = tj[u];
+            tx[u] = x[j].x;
+            ty[u] = x[j].y;
+            tz[u] = x[j].z;
+            tjtype[u] = x[j].w;
+          }
+
+          if (FULL == 0 || TRI == 1) {
+            icount = 0;
+            istart = ncount;
+            const int alignb = INTEL_DATA_ALIGN / sizeof(int);
+            int nedge = istart % alignb;
+            if (nedge) istart += (alignb - nedge);
+            itx = tx + istart;
+            ity = ty + istart;
+            itz = tz + istart;
+            itj = tj + istart;
+            itjtype = tjtype + istart;
+
+            const int bstart = binhead[ibin];
+            const int bend = binhead[ibin + 1];
+            #if defined(LMP_SIMD_COMPILER)
+            #pragma vector aligned
+            #pragma simd
+            #endif
+            for (int jj = bstart; jj < bend; jj++) {
+              const int j = binpacked[jj];
+              itj[icount] = j;
+              itx[icount] = x[j].x;
+              ity[icount] = x[j].y;
+              itz[icount] = x[j].z;
+              itjtype[icount] = x[j].w;
+              icount++;
+            }
+            if (icount + istart > obound) *overflow = 1;
+          } else
+            if (ncount > obound) *overflow = 1;
+        }
+
+        // ---------------------- Loop over i bin
+
+        int n = 0;
+        if (FULL == 0 || TRI == 1) {
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma vector aligned
+          #pragma ivdep
+          #endif
+          for (int u = 0; u < icount; u++) {
+            int addme = 1;
+            int j = itj[u];
+
+            // Cutoff Check
+            const flt_t delx = xtmp - itx[u];
+            const flt_t dely = ytmp - ity[u];
+            const flt_t delz = ztmp - itz[u];
+            const int jtype = itjtype[u];
+            const flt_t rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq > cutneighsq[ioffset + jtype]) addme = 0;
+
+            // i bin (half) check and offload ghost check
+            if (j < nlocal) {
+              const int ijmod = (i + j) % 2;
+              if (i > j) {
+                if (ijmod == 0) addme = 0;
+              } else if (i < j) {
+                if (ijmod == 1) addme = 0;
+              } else
+                addme = 0;
+              #ifdef _LMP_INTEL_OFFLOAD
+              if (offload_noghost && i < offload_end) addme = 0;
+              #endif
+            } else {
+              #ifdef _LMP_INTEL_OFFLOAD
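+              // Ghost j in the home bin: skip pairs the coprocessor half
+              // handles when offload_noghost is set, then keep the pair only
+              // if the ghost is "above" i in (z,y,x) order so each
+              // local-ghost pair is stored once. Local j pairs were halved
+              // above by the (i+j) parity rule: the pair is kept at the
+              // smaller index when i+j is even and at the larger when odd,
+              // which balances per-atom list lengths.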
+ if (offload_noghost && offload) addme = 0; + #endif + if (itz[u] < ztmp) addme = 0; + if (itz[u] == ztmp) { + if (ity[u] < ytmp) addme = 0; + if (ity[u] == ytmp && itx[u] < xtmp) addme = 0; + } + } + + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (addme) + neighptr[n++] = j; + } + } // if FULL==0 + + // ---------------------- Loop over other bins + + int n2, *neighptr2; + if (THREE) { + n = pack_offset; + n2 = pack_offset + maxnbors; + neighptr2 = neighptr; + } + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = 0; u < ncount; u++) { + int addme = 1; + int j = tj[u]; + + if (FULL) + if (i == j) addme = 0; + + // Cutoff Check + const flt_t delx = xtmp - tx[u]; + const flt_t dely = ytmp - ty[u]; + const flt_t delz = ztmp - tz[u]; + const int jtype = tjtype[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutneighsq[ioffset + jtype]) addme = 0; + + // Triclinic + if (TRI) { + if (tz[u] < ztmp) addme = 0; + if (tz[u] == ztmp) { + if (ty[u] < ytmp) addme = 0; + if (ty[u] == ytmp) { + if (tx[u] < xtmp) addme = 0; + if (tx[u] == xtmp && j <= i) addme = 0; + } + } + } + + // offload ghost check + #ifdef _LMP_INTEL_OFFLOAD + if (offload_noghost) { + if (j < nlocal) { + if (i < offload_end) addme = 0; + } else if (offload) addme = 0; + } + #endif + + int pj; + if (THREE) pj = j; + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (THREE) { + const int jtag = tag[pj]; + int flist = 0; + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) flist = 1; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) flist = 1; + } else { + if (tz[u] < ztmp) flist = 1; + else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; + else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) + flist = 1; + } + if (addme) { + if (flist) + neighptr2[n2++] = j; + else + neighptr[n++] = j; + } + } else { + if (addme) + neighptr[n++] = j; + } + } // for u + + #ifndef _LMP_INTEL_OFFLOAD + if (exclude) { + int alln = n; + if (THREE) n = pack_offset; + else n = 0; + for (int u = pack_offset; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n++] = j; + } + if (THREE) { + alln = n2; + n2 = pack_offset + maxnbors; + for (int u = pack_offset + maxnbors; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n2++] = j; + } + } + } + #endif + int ns; + if (THREE) { + int alln = n; + ns = n - pack_offset; + atombin[i] = ns; + n = lane; + for (int u = pack_offset; u < alln; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + ns += n2 - pack_offset - maxnbors; + for (int u = pack_offset + maxnbors; u < n2; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + if (ns > maxnbors) *overflow = 1; + } else + if (n > maxnbors) *overflow = 1; + + ilist[i] = i; + cnumneigh[i] = ct; + if (THREE) { + cnumneigh[i] += lane; + numneigh[i] = ns; + } else { + int edge = (n % pad_width); + if (edge) { + const int pad_end = n + (pad_width - edge); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; 
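+              // pad the list to a multiple of pad_width with e_nall
+              // sentinels so vector loops over it need no remainder masking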
n++) + neighptr[n] = e_nall; + } + numneigh[i] = n; + } + + if (THREE) { + if (ns > max_chunk) max_chunk = ns; + lane++; + if (lane == pack_width) { + ct += max_chunk * pack_width; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + max_chunk = 0; + pack_offset = maxnbors * pack_width; + lane = 0; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + } else { + ct += n; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + } + + if (*overflow == 1) + for (int i = ifrom; i < ito; i++) + numneigh[i] = 0; + + #ifdef _LMP_INTEL_OFFLOAD + int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; + int ghost_offset = 0, nall_offset = e_nall; + if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + #if __INTEL_COMPILER+0 > 1499 + #pragma vector aligned + #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) + #endif + for (int jj = 0; jj < jnum; jj++) { + int j = jlist[jj]; + if (need_ic && j < 0) j = -j - 1; + if (j < nlocal) { + if (j < vlmin) vlmin = j; + if (j > vlmax) vlmax = j; + } else { + if (j < vgmin) vgmin = j; + if (j > vgmax) vgmax = j; + } + } + } + lmin = MIN(lmin,vlmin); + gmin = MIN(gmin,vgmin); + lmax = MAX(lmax,vlmax); + gmax = MAX(gmax,vgmax); + + #if defined(_OPENMP) + #pragma omp critical + #endif + { + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; + } + } // if separate_buffers + #endif + + if (molecular) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + if (THREE) { + const int trip = jnum * pack_width; + for (int jj = 0; jj < trip; jj+=pack_width) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj]; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else 
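+                  // plain ghost: shift the index into the packed
+                  // separate-buffer layout (the e_nall padding sentinel was
+                  // mapped to nall_offset above)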
jlist[jj]-=ghost_offset; + } else + #endif + if (which) jlist[jj] = j ^ (which << SBBITS); + } + } + } // for i + } // if molecular + #ifdef _LMP_INTEL_OFFLOAD + else if (separate_buffers) { + for (int i = ifrom; i < ito; ++i) { + int * _noalias jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + int jj = 0; + #pragma vector aligned + #pragma simd + for (jj = 0; jj < jnum; jj++) { + if (jlist[jj] >= nlocal) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + } + } + } + } + #endif + } // end omp + #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) + *timer_compute = MIC_Wtime() - *timer_compute; + #endif + } // end offload + + #ifdef _LMP_INTEL_OFFLOAD + if (offload) { + _fix->stop_watch(TIME_OFFLOAD_LATENCY); + _fix->start_watch(TIME_HOST_NEIGHBOR); + for (int n = 0; n < aend; n++) { + ilist[n] = n; + numneigh[n] = 0; + } + } else { + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + if (separate_buffers) { + _fix->start_watch(TIME_PACK); + _fix->set_neighbor_host_sizes(); + buffers->pack_sep_from_single(_fix->host_min_local(), + _fix->host_used_local(), + _fix->host_min_ghost(), + _fix->host_used_ghost()); + _fix->stop_watch(TIME_PACK); + } + } + #else + #pragma vector aligned + #pragma simd + for (int i = astart; i < aend; i++) + list->firstneigh[i] = firstneigh + cnumneigh[i]; + #endif +} + +/* ---------------------------------------------------------------------- */ + #ifdef _LMP_INTEL_OFFLOAD void NPairIntel::grow_stencil() { @@ -62,6 +734,204 @@ void NPairIntel::grow_stencil() const int maxstencil = ns->get_maxstencil(); #pragma offload_transfer target(mic:_cop) \ in(stencil:length(maxstencil) alloc_if(1) free_if(0)) - } + } } #endif + +/* ---------------------------------------------------------------------- */ + +// ---- Half, no IC + +template void NPairIntel::bin_newton<float, float, 0, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Half, IC + +template void NPairIntel::bin_newton<float, float, 0, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Tri, no IC + +template void NPairIntel::bin_newton<float, float, 0, 0, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 0, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 0, 0, 1, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Tri, IC + +template void NPairIntel::bin_newton<float, float, 0, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template 
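+// (The five int template arguments are, per the definition above:
+//  offload_noghost, need_ic, FULL, TRI, THREE -- e.g. <...,0,1,0,1,0> is a
+//  triclinic half build with the minimum-image check and no ghost splitting
+//  for offload; a host-side triclinic call then reads:
+//    bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal);)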
void NPairIntel::bin_newton<float, double, 0, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Full, no IC + +template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Full, IC + +template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- 3-body, no IC + +template void NPairIntel::bin_newton<float, float, 0, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- 3-body, IC + +template void NPairIntel::bin_newton<float, float, 0, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 0, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 0, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +#ifdef _LMP_INTEL_OFFLOAD + +// ---- Half, no IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 0, 0, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Half, IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 1, 0, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Tri, no IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 0, 0, 1, 0> + (const int, NeighList *, 
IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 0, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 0, 0, 1, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Tri, IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 1, 0, 1, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Full, no IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- Full, IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 0> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- 3-body, no IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 0, 1, 0, 1> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +// ---- 3-body, IC, no ghost + +template void NPairIntel::bin_newton<float, float, 1, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,float> *, const int, const int, + const int); +template void NPairIntel::bin_newton<float, double, 1, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<float,double> *, const int, const int, + const int); +template void NPairIntel::bin_newton<double, double, 1, 1, 1, 0, 1> + (const int, NeighList *, IntelBuffers<double,double> *, const int, const int, + const int); + +#endif diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h index 06d5d79cac..51574a252c 100644 --- a/src/USER-INTEL/npair_intel.h +++ b/src/USER-INTEL/npair_intel.h @@ -25,10 +25,6 @@ #include "intel_simd.h" #endif -#ifdef OUTER_CHUNK -#include "intel_simd.h" -#endif - #ifdef _LMP_INTEL_OFFLOAD #pragma offload_attribute(push,target(mic)) #endif @@ -87,6 +83,10 @@ class NPairIntel : public NPair { protected: FixIntel *_fix; + template <class flt_t, class acc_t, int, int, int, int, int> + void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> 
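+  // runtime arguments: offload flag, list, buffers, astart, aend, and an
+  // optional offload_end bound (see the definition in npair_intel.cpp)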
*, + const int, const int, const int offload_end = 0); + #ifdef _LMP_INTEL_OFFLOAD int _cop; int *_off_map_stencil; diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index 4f34a484cb..cdea9e76c4 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -85,53 +85,47 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckCoulCutIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -165,7 +159,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -208,27 +202,26 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop 
over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -246,10 +239,9 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -319,71 +311,72 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, if (rsq < c_cuti[jtype].cutsq) { #endif const flt_t fpair = (forcecoul + forcebuck) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); - } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, 
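+      // with full lists (NEWTON_PAIR == 0) each thread accumulates forces
+      // only for its own atoms, so no inter-thread force reduction is
+      // needed and the doubled per-pair energy/virial sums are halved
+      // after the parallel region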
nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -395,7 +388,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -406,6 +399,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, void PairBuckCoulCutIntel::init_style() { PairBuckCoulCut::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.h b/src/USER-INTEL/pair_buck_coul_cut_intel.h index 6590cd9c16..42a55ac21f 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.h +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.h @@ -49,7 +49,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index 9319f531e1..a9aee1e53e 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -85,53 +85,47 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, 
offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckCoulLongIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -170,9 +164,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -208,8 +210,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(f_stride,nlocal,minlocal,separate_flag,offload) \ + in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -224,27 +228,34 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - 
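+  // Each thread first gathers its in-cutoff neighbors into aligned scratch
+  // arrays (ccachex/y/z/w, ccachei/j) and then runs the force/energy loop
+  // as a contiguous SIMD reduction over the gathered data, avoiding masked
+  // lanes.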
shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - - for (int i = iifrom; i < iito; ++i) { + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; @@ -262,85 +273,98 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - flt_t forcecoul, forcebuck, evdwl, ecoul; - forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; - - const int sbindex = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; + const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; + + if (rsq < c_forcei[jtype].cutsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcebuck, evdwl, ecoul; + forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r = (flt_t)1.0 / sqrt(r2inv); - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { #endif - #ifdef INTEL_ALLOW_TABLE - if (!ncoultablebits || rsq <= tabinnersq) { - #endif - const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; - - const flt_t grij = g_ewald * r; - const 
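+              // A1..A5 with p = 0.3275911 form the Abramowitz & Stegun
+              // 7.1.26 polynomial fit to erfc(grij), and EWALD_F is
+              // 2/sqrt(pi); together they give the standard real-space
+              // Ewald force factor erfc(g r) + 2 g r exp(-(g r)^2)/sqrt(pi).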
flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; - - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * prefactor; forcecoul -= adjust; if (EFLAG) ecoul -= adjust; - - #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; - - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } } - #endif - #ifdef INTEL_VMASK } - #endif + #endif #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cut_ljsq) { @@ -361,80 +385,74 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, #ifdef INTEL_VMASK } #else - if (rsq > c_forcei[jtype].cutsq) - { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } if (rsq > c_forcei[jtype].cut_ljsq) { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; } #endif - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { - #endif - const flt_t fpair = (forcecoul + forcebuck) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; + const flt_t fpair = (forcecoul + forcebuck) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if 
(eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); - } - #ifdef INTEL_VMASK - } - #endif + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -446,7 +464,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -457,6 +475,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, void PairBuckCoulLongIntel::init_style() { PairBuckCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -484,6 +506,13 @@ template <class flt_t, class acc_t> void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; int ntable = 1; if (ncoultablebits) @@ -518,6 +547,9 @@ void 
PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { + if (cutsq[i][j] < cut_ljsq[i][j]) + error->all(FLERR, + "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].buck1 = buck1[i][j]; diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.h b/src/USER-INTEL/pair_buck_coul_long_intel.h index 57e4517404..ec2cdba177 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.h +++ b/src/USER-INTEL/pair_buck_coul_long_intel.h @@ -40,7 +40,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { private: FixIntel *fix; - int _cop, _lrt; + int _cop, _lrt, _ccache_stride; template <class flt_t> class ForceConst; @@ -48,7 +48,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index 4815d1e025..bbfc7225dd 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -78,57 +78,51 @@ void PairBuckIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } 
} /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -152,7 +146,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -192,27 +186,26 @@ void PairBuckIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -228,10 +221,9 @@ void PairBuckIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -284,69 +276,70 @@ void PairBuckIntel::eval(const int offload, const int vflag, evdwl *= factor_lj; } const flt_t fpair = forcebuck * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const 
flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj - - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -358,7 +351,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -367,6 +360,10 @@ void PairBuckIntel::eval(const int offload, const int vflag, void PairBuckIntel::init_style() { PairBuck::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); diff --git a/src/USER-INTEL/pair_buck_intel.h b/src/USER-INTEL/pair_buck_intel.h index 4f039c3f97..e699a1611e 100644 --- a/src/USER-INTEL/pair_buck_intel.h +++ b/src/USER-INTEL/pair_buck_intel.h @@ -48,7 +48,7 @@ private: void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp index f8c972ab8b..541f9745cb 100644 --- a/src/USER-INTEL/pair_eam_intel.cpp +++ b/src/USER-INTEL/pair_eam_intel.cpp @@ -90,78 +90,58 @@ void PairEAMIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { 
fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if (_onetype) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } @@ -169,8 +149,7 @@ void PairEAMIntel::compute(int eflag, int vflag, /* 
---------------------------------------------------------------------- */ -template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, - class acc_t> +template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairEAMIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -186,7 +165,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, nmax = atom->nmax; int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN; if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t); - memory->create(rho,nmax*comm->nthreads,"pair:rho"); + if (NEWTON_PAIR) + memory->create(rho,nmax*comm->nthreads,"pair:rho"); + else + memory->create(rho,nmax,"pair:rho"); memory->create(fp,nmax,"pair:fp"); // Use single precision allocation for single/mixed mode // Keep double version for single and swap_eam @@ -222,9 +204,17 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -252,16 +242,12 @@ void PairEAMIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(fp_f, f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iito, tid; @@ -270,12 +256,25 @@ void PairEAMIntel::eval(const int offload, const int vflag, iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - double * _noalias const trho = rho + tid*nmax; - if (NEWTON_PAIR) + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) foff = tid * nmax; + else foff = 0; + double * _noalias const trho = rho + foff; + if (NEWTON_PAIR) { + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); memset(trho, 0, nall * sizeof(double)); - else - memset(trho, 0, nlocal * sizeof(double)); + } + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; flt_t oscale; int rhor_joff, frho_ioff; @@ -300,53 +299,67 @@ void PairEAMIntel::eval(const int offload, const int vflag, const flt_t ztmp = x[i].z; acc_t rhoi = (acc_t)0.0; - #if 
defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:rhoi) + int ej = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - int j, jtype; - j = jlist[jj] & NEIGHMASK; - + const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const flt_t rsq = delx*delx + dely*dely + delz*delz; if (rsq < fcutforcesq) { - if (!ONETYPE) jtype = x[j].w; - flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; - int m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - flt_t ra; - ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + - rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; - rhoi += ra; - if (NEWTON_PAIR || j < nlocal) { - if (!ONETYPE) { - const int ioff = jtype * istride + itype * jstride + m; - ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + - rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; - } - trho[j] += ra; - } + trsq[ej]=rsq; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:rhoi) + #endif + for (int jj = 0; jj < ej; jj++) { + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; + int m = static_cast<int> (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + flt_t ra; + ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + + rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; + rhoi += ra; + if (NEWTON_PAIR) { + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + + rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; + } + trho[j] += ra; + } } // for jj - trho[i] += rhoi; + if (NEWTON_PAIR) + trho[i] += rhoi; + else + trho[i] = rhoi; } // for i #if defined(_OPENMP) - if (nthreads > 1) { + if (NEWTON_PAIR && nthreads > 1) { #pragma omp barrier if (tid == 0) { - int rcount; - if (NEWTON_PAIR) rcount = nall; - else rcount = nlocal; + const int rcount = nall; if (nthreads == 2) { double *trho2 = rho + nmax; #pragma vector aligned @@ -431,10 +444,9 @@ void PairEAMIntel::eval(const int offload, const int vflag, #pragma omp barrier #endif - if (tid == 0) { + if (tid == 0) comm->forward_comm_pair(this); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - } else + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); #if defined(_OPENMP) @@ -462,124 +474,142 @@ void PairEAMIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + + int ej = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - int j, jtype; - j = jlist[jj] & NEIGHMASK; - + 
const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const flt_t rsq = delx*delx + dely*dely + delz*delz; - if (rsq < fcutforcesq) { - if (!ONETYPE) jtype = x[j].w; - const flt_t r = sqrt(rsq); - flt_t p = r*frdr + (flt_t)1.0; - int m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - const flt_t rhojp = (rhor_spline_f[joff].a*p + - rhor_spline_f[joff].b)*p + - rhor_spline_f[joff].c; - flt_t rhoip; - if (!ONETYPE) { - const int ioff = jtype * istride + itype * jstride + m; - rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + - rhor_spline_f[ioff].c; - } else - rhoip = rhojp; - const flt_t z2p = (z2r_spline_t[joff].a*p + - z2r_spline_t[joff].b)*p + - z2r_spline_t[joff].c; - const flt_t z2 = ((z2r_spline_t[joff].d*p + - z2r_spline_t[joff].e)*p + - z2r_spline_t[joff].f)*p + - z2r_spline_t[joff].g; - - const flt_t recip = (flt_t)1.0/r; - const flt_t phi = z2*recip; - const flt_t phip = z2p*recip - phi*recip; - const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; - if (!ONETYPE) - oscale = scale_fi[jtype]; - const flt_t fpair = -oscale*psip*recip; - - fxtmp += delx*fpair; - fytmp += dely*fpair; - fztmp += delz*fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx*fpair; - f[j].y -= dely*fpair; - f[j].z -= delz*fpair; - } + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i<nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j<nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - const flt_t evdwl = oscale*phi; - sevdwl += ev_pre * evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - delx, dely, delz); - } - } // if rsq + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + const flt_t r = sqrt(rsq); + flt_t p = r*frdr + (flt_t)1.0; + int m = static_cast<int> (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + const flt_t rhojp = (rhor_spline_f[joff].a*p + + rhor_spline_f[joff].b)*p + + rhor_spline_f[joff].c; + flt_t rhoip; + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + + rhor_spline_f[ioff].c; + } else + rhoip = rhojp; + const flt_t z2p = (z2r_spline_t[joff].a*p + + z2r_spline_t[joff].b)*p + + z2r_spline_t[joff].c; + const flt_t z2 = ((z2r_spline_t[joff].d*p + + z2r_spline_t[joff].e)*p + + z2r_spline_t[joff].f)*p + + z2r_spline_t[joff].g; + + const flt_t recip = (flt_t)1.0/r; + const flt_t phi = z2*recip; + const flt_t phip = z2p*recip - phi*recip; + const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; + if (!ONETYPE) + oscale = scale_fi[jtype]; + const flt_t fpair = -oscale*psip*recip; + + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + 
fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + const flt_t evdwl = oscale*phi; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + sevdwl *= (acc_t)0.5; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for i - if (vflag == 2) { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } - + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } /// omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -591,7 +621,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -604,6 +634,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, void PairEAMIntel::init_style() { PairEAM::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -633,6 +667,13 @@ template <class flt_t, class acc_t> void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,nr,nrho,memory,_cop); buffers->set_ntypes(tp1); diff --git a/src/USER-INTEL/pair_eam_intel.h b/src/USER-INTEL/pair_eam_intel.h index f7fb71ad2c..c7bb3b7bd0 100644 --- a/src/USER-INTEL/pair_eam_intel.h +++ b/src/USER-INTEL/pair_eam_intel.h @@ -41,7 +41,7 @@ class PairEAMIntel : public PairEAM { protected: FixIntel *fix; - int _cop, _onetype; + int _cop, _onetype, _ccache_stride; float *fp_float; template <class flt_t> @@ -53,7 +53,7 @@ class PairEAMIntel : public PairEAM { template <class flt_t, class acc_t> void compute(int 
eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, + template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index c1e3d1b37f..af96fcbb79 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -88,12 +88,16 @@ void PairGayBerneIntel::compute(int eflag, int vflag, const AtomVecEllipsoid::Bonus * const bonus = avec->bonus; const int * const ellipsoid = atom->ellipsoid; QUAT_T * _noalias const quat = buffers->get_quat(); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, + IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads, sizeof(ATOM_T)); if (ago != 0) buffers->thr_pack(ifrom,ito,ago); @@ -114,39 +118,29 @@ void PairGayBerneIntel::compute(int eflag, int vflag, fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairGayBerneIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -167,8 +161,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, if (fix->separate_buffers()) { fix->start_watch(TIME_PACK); if (offload) { - #pragma omp parallel default(none) \ - shared(buffers,nlocal,nall,bonus,ellipsoid) + #pragma omp parallel { int ifrom, ito, tid; int nthreads = comm->nthreads; @@ -258,7 +251,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, 
f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -334,6 +327,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, x[nall].x = (flt_t)INTEL_BIGP; x[nall].y = (flt_t)INTEL_BIGP; x[nall].z = (flt_t)INTEL_BIGP; + x[nall].w = 1; quat[nall].w = (flt_t)1.0; quat[nall].i = (flt_t)0.0; quat[nall].j = (flt_t)0.0; @@ -342,25 +336,25 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, #endif acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0.0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; - } + if (EFLAG) oevdwl = (acc_t)0.0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + if (NEWTON_PAIR == 0) f_start[1].w = 0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal * 2 + (tid * f_stride); - memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal * 2; + else foff = minlocal*-2; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T)); flt_t * _noalias const rsq_form = rsq_formi + tid * max_nbors; flt_t * _noalias const delx_form = delx_formi + tid * max_nbors; @@ -370,7 +364,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, int * _noalias const jlist_form = jlist_formi + tid * max_nbors; int ierror = 0; - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { // const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -401,14 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; - } bool multiple_forms = false; int packed_j = 0; - for (int jj = 0; jj < jnum; jj++) { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { int jm = jlist[jj]; int j = jm & NEIGHMASK; const int jtype = x[j].w; @@ -573,7 +570,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ME_cross3(tempv, tempv2, dUr); flt_t dUr2_0, dUr2_1, dUr2_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { ME_vecmat(kappa, g2, tempv2); ME_cross3(tempv, tempv2, dUr2); } @@ -588,7 +585,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, dchi_2 *= temp1; flt_t dchi2_0, dchi2_1, dchi2_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { ME_vecmat(iota, b2, tempv); ME_cross3(tempv, iota, dchi2); dchi2_0 *= temp1; @@ -630,7 +627,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // compute d_eta for particle 2 flt_t deta2_0, deta2_1, deta2_2; - if (NEWTON_PAIR || j < nlocal) 
{ + if (NEWTON_PAIR) { deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0; ME_compute_eta_torque(g12, a2, shape2, temp); @@ -672,7 +669,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * (flt_t)-1.0; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * (flt_t)-1.0; rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * @@ -714,7 +711,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, t2tmp += ttor_1; t3tmp += ttor_2; - if (NEWTON_PAIR || j < nlocal) { + if (NEWTON_PAIR) { rtor_0 *= factor_lj; rtor_1 *= factor_lj; rtor_2 *= factor_lj; @@ -728,34 +725,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, f[jp].z += rtor_2; } - if (EVFLAG) { - flt_t ev_pre = (flt_t)0.0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - evdwl = factor_lj * one_eng; - sevdwl += ev_pre * evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j*2].w += (flt_t)0.5 * evdwl; - } + if (EFLAG) { + evdwl = factor_lj * one_eng; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j*2].w += (flt_t)0.5 * evdwl; } + } + if (NEWTON_PAIR == 0) { if (vflag == 1) { - ev_pre *= (flt_t)-1.0; - sv0 += ev_pre * delx_form[jj] * fforce_0; - sv1 += ev_pre * dely_form[jj] * fforce_1; - sv2 += ev_pre * delz_form[jj] * fforce_2; - sv3 += ev_pre * delx_form[jj] * fforce_1; - sv4 += ev_pre * delx_form[jj] * fforce_2; - sv5 += ev_pre * dely_form[jj] * fforce_2; + sv0 += delx_form[jj] * fforce_0; + sv1 += dely_form[jj] * fforce_1; + sv2 += delz_form[jj] * fforce_2; + sv3 += delx_form[jj] * fforce_1; + sv4 += delx_form[jj] * fforce_2; + sv5 += dely_form[jj] * fforce_2; } - } // EVFLAG + } // EVFLAG #ifdef INTEL_VMASK } #endif @@ -767,19 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ierror = 2; int ip = i * 2; - f[ip].x += fxtmp; - f[ip].y += fytmp; - f[ip].z += fztmp; - ip++; - f[ip].x += t1tmp; - f[ip].y += t2tmp; - f[ip].z += t3tmp; - - if (EVFLAG) { - if (EFLAG) { - if (eatom) f[i * 2].w += fwtmp; - oevdwl += sevdwl; - } + if (NEWTON_PAIR) { + f[ip].x += fxtmp; + f[ip].y += fytmp; + f[ip].z += fztmp; + ip++; + f[ip].x += t1tmp; + f[ip].y += t2tmp; + f[ip].z += t3tmp; + } else { + f[ip].x = fxtmp; + f[ip].y = fytmp; + f[ip].z = fztmp; + ip++; + f[ip].x = t1tmp; + f[ip].y = t2tmp; + f[ip].z = t3tmp; + } + + if (EFLAG) { + oevdwl += sevdwl; + if (eatom) f[i * 2].w += fwtmp; + } + if (NEWTON_PAIR == 0) { if (vflag == 1) { ov0 += sv0; ov1 += sv1; @@ -791,56 +790,31 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } } // for i int o_range; - if (NEWTON_PAIR) + if (NEWTON_PAIR) { o_range = nall; - else - o_range = nlocal; - if (offload == 0) o_range -= minlocal; - IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, - sizeof(FORCE_T)); - const int two_iito = iito * 2; - - acc_t *facc = &(f_start[0].x); - const int sto = two_iito * 4; - const int fst4 = f_stride * 4; - #if defined(_OPENMP) - #pragma omp barrier - #endif - int t_off = f_stride; - if (EFLAG && eatom) { - for (int t = 1; t < nthreads; t++) { - #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal - #pragma novector - #endif - for (int n = iifrom * 2; n < two_iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - 
f_start[n].z += f_start[n + t_off].z; - f_start[n].w += f_start[n + t_off].w; - } - t_off += f_stride; - } - } else { - for (int t = 1; t < nthreads; t++) { + if (offload == 0) o_range -= minlocal; + IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, + sizeof(FORCE_T)); + const int sto = iito * 8; + const int fst4 = f_stride * 4; + #if defined(_OPENMP) + #pragma omp barrier + #endif + acc_t *f_scalar = &f_start[0].x; + acc_t *f_scalar2 = f_scalar + fst4; + for (int t = 1; t < nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal - #pragma novector + #pragma vector aligned + #pragma simd #endif - for (int n = iifrom * 2; n < two_iito; n++) { - f_start[n].x += f_start[n + t_off].x; - f_start[n].y += f_start[n + t_off].y; - f_start[n].z += f_start[n + t_off].z; - } - t_off += f_stride; + for (int n = iifrom * 8; n < sto; n++) + f_scalar[n] += f_scalar2[n]; + f_scalar2 += fst4; } - } - if (EVFLAG) { if (vflag==2) { - const ATOM_T * _noalias const xo = x + minlocal; + const ATOM_T * _noalias const xo = x + minlocal; #if defined(LMP_SIMD_COMPILER) - #pragma vector nontemporal #pragma novector #endif for (int n = iifrom; n < iito; n++) { @@ -852,26 +826,33 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ov4 += f_start[nt2].z * xo[n].x; ov5 += f_start[nt2].z * xo[n].y; } - } + } } if (ierror) f_start[1].w = ierror; } // omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)-0.5; + ov1 *= (acc_t)-0.5; + ov2 *= (acc_t)-0.5; + ov3 *= (acc_t)-0.5; + ov4 *= (acc_t)-0.5; + ov5 *= (acc_t)-0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) @@ -884,7 +865,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2); else fix->add_result_array(f_start, 0, offload, 0, 0, 2); @@ -895,6 +876,10 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, void PairGayBerneIntel::init_style() { PairGayBerne::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h index aaed31d567..07dfba14d1 100644 --- a/src/USER-INTEL/pair_gayberne_intel.h +++ b/src/USER-INTEL/pair_gayberne_intel.h @@ -43,7 +43,7 @@ class PairGayBerneIntel : public PairGayBerne { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp 
b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index ce6e40141f..7548b6eea3 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -82,54 +82,48 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } // -------------------- Regular version - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -182,7 +176,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -236,25 +230,24 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ 
- shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); flt_t cutboth = cut_coulsq; const int toffs = tid * ccache_stride; @@ -265,7 +258,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { // const int i = ilist[ii]; const int itype = x[i].w; @@ -284,10 +277,9 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } int ej = 0; #if defined(LMP_SIMD_COMPILER) @@ -421,77 +413,76 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, #ifdef INTEL_VMASK } #else - if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } #endif - const flt_t delx = tdelx[jj]; - const flt_t dely = tdely[jj]; - const flt_t delz = tdelz[jj]; const flt_t fpair = (forcecoul + forcelj) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - delx, dely, delz); } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // 
for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -503,7 +494,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -514,6 +505,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, void PairLJCharmmCoulLongIntel::init_style() { PairLJCharmmCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -541,11 +536,6 @@ template <class flt_t, class acc_t> void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { - int tp1 = atom->ntypes + 1; - int ntable = 1; - if (ncoultablebits) - for (int i = 0; i < ncoultablebits; i++) ntable *= 2; - int off_ccache = 0; #ifdef _LMP_INTEL_OFFLOAD if (_cop >= 0) off_ccache = 1; @@ -553,6 +543,11 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, buffers->grow_ccache(off_ccache, comm->nthreads, 1); _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; + int ntable = 1; + if (ncoultablebits) + for (int i = 0; i < ncoultablebits; i++) ntable *= 2; + fc.set_ntypes(tp1, ntable, memory, _cop); buffers->set_ntypes(tp1); flt_t **cutneighsq = buffers->get_cutneighsq(); diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h index 6a207d8400..cafc412a91 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -48,7 +48,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, 
const int astart, const int aend); diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index f26ff724c8..8a0bed2c01 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -83,57 +83,50 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + if (force->newton_pair) { + eval<1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0>(0, ovflag, buffers, fc, host_start, inum); } } } /* ---------------------------------------------------------------------- */ -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -167,9 +160,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + flt_t * _noalias const ccachex = buffers->get_ccachex(); + flt_t * _noalias const ccachey = buffers->get_ccachey(); + flt_t * _noalias const ccachez = buffers->get_ccachez(); + flt_t * _noalias const ccachew = buffers->get_ccachew(); + int * _noalias const ccachei = buffers->get_ccachei(); + int * _noalias const ccachej = buffers->get_ccachej(); + const int ccache_stride = _ccache_stride; + // Determine how much data to transfer int 
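
The compute() rewrite above shows the dispatch pattern this patch applies to every style: the old EVFLAG template parameter is gone, eval() is instantiated only on EFLAG and NEWTON_PAIR, and the virial path is selected by the runtime ovflag argument, halving the number of template instantiations. A hedged sketch of the idiom with illustrative names (not the actual LAMMPS signatures):

  #include <cstdio>

  // Compile-time flags: each of the four instantiations keeps only the
  // branches it needs; ovflag stays a runtime argument as in the patch.
  template <int EFLAG, int NEWTON_PAIR>
  void eval_sketch(int ovflag) {
    if (EFLAG) printf("accumulate pair energies\n");
    if (NEWTON_PAIR == 0 && ovflag) printf("full-list virial handling\n");
  }

  void dispatch(bool eflag, bool newton_pair, bool vflag, bool vflag_fdotr) {
    int ovflag = 0;
    if (vflag_fdotr) ovflag = 2;
    else if (vflag) ovflag = 1;
    if (eflag) {
      if (newton_pair) eval_sketch<1,1>(ovflag);
      else eval_sketch<1,0>(ovflag);
    } else {
      if (newton_pair) eval_sketch<0,1>(ovflag);
      else eval_sketch<0,0>(ovflag);
    }
  }

  int main() { dispatch(true, false, true, false); return 0; }
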
x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -204,8 +205,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ + in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ + in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(f_stride,nlocal,minlocal,separate_flag,offload) \ + in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -220,27 +223,34 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - - for (int i = iifrom; i < iito; ++i) { + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + + const int toffs = tid * ccache_stride; + flt_t * _noalias const tdelx = ccachex + toffs; + flt_t * _noalias const tdely = ccachey + toffs; + flt_t * _noalias const tdelz = ccachez + toffs; + flt_t * _noalias const trsq = ccachew + toffs; + int * _noalias const tj = ccachei + toffs; + int * _noalias const tjtype = ccachej + toffs; + + for (int i = iifrom; i < iito; i += iip) { const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -258,86 +268,98 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { - flt_t forcecoul, forcelj, evdwl, ecoul; - forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; - - const int sbindex = jlist[jj] >> SBBITS & 3; const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp 
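
The ccachex/ccachey/... pointers fetched above feed the two-pass inner loop that follows: a first pass compresses in-cutoff neighbors into thread-local scratch arrays, and a second, branch-free pass over the compressed list does the force work and vectorizes cleanly. A standalone sketch of the filter-then-compute pattern, with local buffers standing in for the ccache:

  #include <vector>
  #include <cstdio>

  // Pass 1 compresses neighbors inside the cutoff into contiguous scratch
  // (the role of tdelx/tdely/tdelz/trsq in the patch); pass 2 then
  // vectorizes cleanly because every iteration does useful work.
  void accumulate_forces(int jnum, const double *dx, const double *dy,
                         const double *dz, double cutsq,
                         double &fx, double &fy, double &fz) {
    std::vector<double> tdelx(jnum), tdely(jnum), tdelz(jnum), trsq(jnum);
    int ej = 0;
    for (int jj = 0; jj < jnum; jj++) {     // pass 1: cutoff filter
      const double rsq = dx[jj]*dx[jj] + dy[jj]*dy[jj] + dz[jj]*dz[jj];
      if (rsq < cutsq) {
        tdelx[ej] = dx[jj]; tdely[ej] = dy[jj]; tdelz[ej] = dz[jj];
        trsq[ej] = rsq; ej++;
      }
    }
    fx = fy = fz = 0.0;
    for (int jj = 0; jj < ej; jj++) {       // pass 2: vector-friendly
      const double fpair = 1.0 / trsq[jj];  // stand-in for the real kernel
      fx += fpair * tdelx[jj];
      fy += fpair * tdely[jj];
      fz += fpair * tdelz[jj];
    }
  }

  int main() {
    const double dx[3] = {0.9, 2.0, 0.4}, dy[3] = {0.1, 2.0, 0.2},
                 dz[3] = {0.1, 2.0, 0.3};
    double fx, fy, fz;
    accumulate_forces(3, dx, dy, dz, 1.5, fx, fy, fz);
    printf("%g %g %g\n", fx, fy, fz);       // second neighbor filtered out
    return 0;
  }
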
- x[j].z; - const int jtype = x[j].w; + const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq < c_forcei[jtype].cutsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) + #endif + for (int jj = 0; jj < ej; jj++) { + flt_t forcecoul, forcelj, evdwl, ecoul; + forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; + + const int j = tj[jj] & NEIGHMASK; + const int sbindex = tj[jj] >> SBBITS & 3; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { + #ifdef INTEL_ALLOW_TABLE + if (!ncoultablebits || rsq <= tabinnersq) { #endif - #ifdef INTEL_ALLOW_TABLE - if (!ncoultablebits || rsq <= tabinnersq) { - #endif - const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; - - const flt_t r = (flt_t)1.0 / sqrt(r2inv); - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; - - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t r = (flt_t)1.0 / sqrt(r2inv); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + + #ifdef INTEL_ALLOW_TABLE + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * prefactor; forcecoul -= adjust; if (EFLAG) ecoul -= adjust; - - #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; - - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * 
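
For context, the A1..A5 constants in the hunk above are the coefficients of the Abramowitz and Stegun 7.1.26 polynomial approximation to erfc(x) (absolute error about 1.5e-7), used for the real-space Ewald term when table lookup is disabled; EWALD_F is 2/sqrt(pi), which enters the derivative term of the force. A standalone double-precision sketch of the same approximation (the kernel itself templates on flt_t):

  #include <cmath>
  #include <cstdio>

  // Abramowitz & Stegun 7.1.26: erfc(x) ~ t*(A1 + t*(A2 + t*(A3 + t*(A4
  // + t*A5)))) * exp(-x*x), with t = 1/(1 + p*x) and p = 0.3275911.
  double erfc_approx(double x) {
    const double A1 =  0.254829592;
    const double A2 = -0.284496736;
    const double A3 =  1.421413741;
    const double A4 = -1.453152027;
    const double A5 =  1.061405429;
    const double INV_EWALD_P = 1.0 / 0.3275911;
    const double t = INV_EWALD_P / (INV_EWALD_P + x);
    return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * exp(-x * x);
  }

  int main() {
    for (double x = 0.0; x <= 2.0; x += 0.5)
      printf("x=%.1f approx=%.7f libm=%.7f\n", x, erfc_approx(x), erfc(x));
    return 0;
  }
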
detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } - } - #endif - #ifdef INTEL_VMASK + } } - #endif + #endif #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cut_ljsq) { @@ -357,80 +379,79 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, #ifdef INTEL_VMASK } #else - if (rsq > c_forcei[jtype].cutsq) - { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } if (rsq > c_forcei[jtype].cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } #endif - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cutsq) { - #endif - const flt_t fpair = (forcecoul + forcelj) * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; + const flt_t fpair = (forcecoul + forcelj) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - sevdwl += ev_pre * evdwl; - secoul += ev_pre * ecoul; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR || j < nlocal) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz); - } - #ifdef INTEL_VMASK - } - #endif + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= 
(acc_t)0.5; + ov5 *= (acc_t)0.5; + } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -442,7 +463,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -453,6 +474,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, void PairLJCutCoulLongIntel::init_style() { PairLJCutCoulLong::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); @@ -480,6 +505,13 @@ template <class flt_t, class acc_t> void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { + int off_ccache = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_cop >= 0) off_ccache = 1; + #endif + buffers->grow_ccache(off_ccache, comm->nthreads, 1); + _ccache_stride = buffers->ccache_stride(); + int tp1 = atom->ntypes + 1; int ntable = 1; if (ncoultablebits) @@ -514,6 +546,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { + if (cutsq[i][j] < cut_ljsq[i][j]) + error->all(FLERR, + "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].lj1 = lj1[i][j]; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h index dad73d18bd..2b7d87c040 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h @@ -42,13 +42,13 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { private: FixIntel *fix; - int _cop, _lrt; + int _cop, _lrt, _ccache_stride; template <class flt_t> class ForceConst; template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index dd08dc023c..8620646343 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -75,85 +75,64 @@ void PairLJCutIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if 
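
The 0.5 scalings just above are the bookkeeping cost of the new NEWTON_PAIR == 0 path: with a full neighbor list every i-j pair is visited from both atoms, so pairwise energies and virials are accumulated twice and are halved once at the end rather than branching per pair. A tiny sketch of the accounting, using a stand-in symmetric pair energy:

  #include <cstdio>

  // With a half list each pair contributes once; with a full list it
  // contributes from both i and j, so the total is scaled by 0.5 at the end.
  int main() {
    const int n = 4;
    double e[4][4] = {};
    for (int i = 0; i < n; i++)
      for (int j = 0; j < n; j++)
        if (i != j) e[i][j] = 1.0;   // stand-in symmetric pair energy

    double half_list = 0.0, full_list = 0.0;
    for (int i = 0; i < n; i++)
      for (int j = i + 1; j < n; j++) half_list += e[i][j];  // newton on
    for (int i = 0; i < n; i++)
      for (int j = 0; j < n; j++) full_list += e[i][j];      // newton off
    full_list *= 0.5;                // the oevdwl/ov* *= 0.5 step
    printf("half=%g full*0.5=%g\n", half_list, full_list);
    return 0;
  }
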
(_onetype) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - if (force->newton_pair) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum); - } + if (eflag) { + if (force->newton_pair) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - if (force->newton_pair) { - eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum); - } + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } } -template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, - class acc_t> +template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairLJCutIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -181,7 +160,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, 
f_stride); @@ -200,25 +179,24 @@ void PairLJCutIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; - FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride); - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + int foff; + if (NEWTON_PAIR) foff = tid * f_stride - minlocal; + else foff = -minlocal; + FORCE_T * _noalias const f = f_start + foff; + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); flt_t cutsq, lj1, lj2, lj3, lj4, offset; if (ONETYPE) { @@ -229,7 +207,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, lj4 = lj34[3].lj4; offset = ljc12o[3].offset; } - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { int itype, ptr_off; const FC_PACKED1_T * _noalias ljc12oi; const FC_PACKED2_T * _noalias lj34i; @@ -250,10 +228,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -301,83 +278,84 @@ void PairLJCutIntel::eval(const int offload, const int vflag, else fpair = forcelj * r2inv; - fxtmp += delx * fpair; - fytmp += dely * fpair; - fztmp += delz * fpair; - if (NEWTON_PAIR || j < nlocal) { - f[j].x -= delx * fpair; - f[j].y -= dely * fpair; - f[j].z -= delz * fpair; - } - - if (EVFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_PAIR || i<nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_PAIR || j<nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - if (!ONETYPE) { - lj3 = lj34i[jtype].lj3; - lj4 = lj34i[jtype].lj4; - offset = ljc12oi[jtype].offset; - } - evdwl = r6inv * (lj3 * r6inv - lj4); - #ifdef INTEL_VMASK - evdwl -= offset; - #else - if (rsq < cutsq) evdwl -= offset; - #endif - if (!ONETYPE) evdwl *= factor_lj; - sevdwl += ev_pre*evdwl; - if (eatom) { - if (NEWTON_PAIR || i < nlocal) - fwtmp += 0.5 * evdwl; - if (NEWTON_PAIR || j < nlocal) - f[j].w += 0.5 * evdwl; - } + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + if (!ONETYPE) { + lj3 = lj34i[jtype].lj3; + lj4 = lj34i[jtype].lj4; + offset = ljc12oi[jtype].offset; + } + evdwl = r6inv * (lj3 * r6inv - lj4); + #ifdef INTEL_VMASK + evdwl -= offset; + #else + if (rsq < cutsq) evdwl -= offset; + #endif + if (!ONETYPE) evdwl *= factor_lj; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += 
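
For reference, the forcelj/r6inv expressions in this kernel are the standard 12-6 Lennard-Jones form with the usual LAMMPS precomputed coefficients (lj1/lj2 for the force, lj3/lj4 for the energy). A standalone sketch, assuming those coefficient definitions:

  #include <cmath>
  #include <cstdio>

  // 12-6 LJ with LAMMPS-style precomputed coefficients:
  // lj1 = 48*eps*sig^12, lj2 = 24*eps*sig^6 (force);
  // lj3 =  4*eps*sig^12, lj4 =  4*eps*sig^6 (energy).
  int main() {
    const double eps = 1.0, sig = 1.0, rsq = 1.3;
    const double lj1 = 48.0 * eps * pow(sig, 12.0);
    const double lj2 = 24.0 * eps * pow(sig, 6.0);
    const double lj3 =  4.0 * eps * pow(sig, 12.0);
    const double lj4 =  4.0 * eps * pow(sig, 6.0);
    const double r2inv = 1.0 / rsq;
    const double r6inv = r2inv * r2inv * r2inv;
    const double forcelj = r6inv * (lj1 * r6inv - lj2);
    const double fpair = forcelj * r2inv;  // |F|/r, multiplied by del{x,y,z}
    const double evdwl = r6inv * (lj3 * r6inv - lj4);
    printf("fpair=%g evdwl=%g\n", fpair, evdwl);
    return 0;
  }
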
(flt_t)0.5 * evdwl; } + } - IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - delx, dely, delz); - } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } // if rsq #endif } // for jj - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; + + IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + if (NEWTON_PAIR == 0) { + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -389,7 +367,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -400,6 +378,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag, void PairLJCutIntel::init_style() { PairLJCut::init_style(); + if (force->newton_pair == 0) { + neighbor->requests[neighbor->nrequest-1]->half = 0; + neighbor->requests[neighbor->nrequest-1]->full = 1; + } neighbor->requests[neighbor->nrequest-1]->intel = 1; int ifix = modify->find_fix("package_intel"); diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h index a9c77324f3..b577a04658 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.h +++ b/src/USER-INTEL/pair_lj_cut_intel.h @@ -45,8 +45,7 @@ class PairLJCutIntel : public PairLJCut { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, - class acc_t> + template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp new file mode 100644 index 0000000000..99c7045098 --- /dev/null +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - 
Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include <math.h> +#include "pair_lj_long_coul_long_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "group.h" +#include "kspace.h" +#include "memory.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "suffix.h" + + +using namespace LAMMPS_NS; + +#define C_FORCE_T typename ForceConst<flt_t>::c_force_t +#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t +#define TABLE_T typename ForceConst<flt_t>::table_t + +PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) : + PairLJLongCoulLong(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + + +PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel() +{ +} diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.h b/src/USER-INTEL/pair_lj_long_coul_long_intel.h new file mode 100644 index 0000000000..b8e4e68928 --- /dev/null +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.h @@ -0,0 +1,39 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel) + +#else + +#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H +#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H + +#include "pair_lj_long_coul_long.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + class PairLJLongCoulLongIntel : public PairLJLongCoulLong { + public: + PairLJLongCoulLongIntel(class LAMMPS *); + virtual ~PairLJLongCoulLongIntel(); + + }; +} +#endif +#endif diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 09e00fd867..835f78664a 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -109,85 +109,59 @@ void PairSWIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom, ito, ago); } fix->stop_watch(TIME_PACK); } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; if (_onetype) { if (_spq) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } else { if (_spq) { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,0,1,1>(1, ovflag, 
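
The new pair_lj_long_coul_long_intel.h above follows the standard LAMMPS registration convention: included with PAIR_CLASS defined it expands only the PairStyle(...) macro; otherwise it declares the class. A deliberately simplified sketch of how such a macro-driven factory can work (the struct names below are local stand-ins; the real machinery is generated into style headers at build time):

  #include <cstring>
  #include <cstdio>

  struct Pair { virtual ~Pair() {} };
  struct PairLJLongCoulLongIntelSketch : public Pair {};

  // Each style header contributes one PairStyle(key,Class) line; expanding
  // them inside the factory maps the style string to a constructor call.
  #define PairStyle(key, Class) \
    if (strcmp(style, #key) == 0) return new Class();

  Pair *pair_factory(const char *style) {
    PairStyle(lj/long/coul/long/intel, PairLJLongCoulLongIntelSketch)
    return nullptr;  // unknown style name
  }

  int main() {
    Pair *p = pair_factory("lj/long/coul/long/intel");
    printf("%s\n", p ? "style found" : "style missing");
    delete p;
    return 0;
  }
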
buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } else { - eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); - } + if (eflag) { + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } @@ -196,7 +170,7 @@ void PairSWIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ #ifndef LMP_USE_AVXCD -template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t> +template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, const int astart, @@ -235,7 +209,7 @@ void PairSWIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -276,19 +250,15 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads); iifrom += astart; iito += astart; @@ -328,7 +298,7 @@ void PairSWIntel::eval(const int offload, const int vflag, } } - for (int i = iifrom; i < iito; ++i) { + for (int i = iifrom; i < iito; i += iip) { int itype, itype_offset; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; @@ -344,14 +314,13 @@ void PairSWIntel::eval(const int offload, const int vflag, const int jnumhalf = numneighhalf[i]; acc_t fxtmp, 
fytmp, fztmp, fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl; fxtmp = fytmp = fztmp = (acc_t)0.0; - if (EVFLAG) { - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - } + if (EFLAG) fwtmp = sevdwl = (acc_t)0; int ejnum = 0, ejnumhalf = 0; + #pragma vector aligned + #pragma ivdep for (int jj = 0; jj < jnum; jj++) { int j = jlist[jj]; j &= NEIGHMASK; @@ -390,8 +359,7 @@ void PairSWIntel::eval(const int offload, const int vflag, #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; @@ -399,9 +367,6 @@ void PairSWIntel::eval(const int offload, const int vflag, if (EFLAG) fjtmp = (acc_t)0.0; int ijtype; - const flt_t delx = tdelx[jj]; - const flt_t dely = tdely[jj]; - const flt_t delz = tdelz[jj]; if (!ONETYPE) ijtype = tjtype[jj] + itype_offset; const flt_t rsq1 = trsq[jj]; @@ -440,29 +405,31 @@ void PairSWIntel::eval(const int offload, const int vflag, const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * rainvsq) * expsrainv * rinvsq1; - fxtmp -= delx * fpair; - fytmp -= dely * fpair; - fztmp -= delz * fpair; - fjxtmp += delx * fpair; - fjytmp += dely * fpair; - fjztmp += delz * fpair; + const flt_t delx = tdelx[jj]; + const flt_t dely = tdely[jj]; + const flt_t delz = tdelz[jj]; + const flt_t fpx = fpair * delx; + fxtmp -= fpx; + fjxtmp += fpx; + const flt_t fpy = fpair * dely; + fytmp -= fpy; + fjytmp += fpy; + const flt_t fpz = fpair * delz; + fztmp -= fpz; + fjztmp += fpz; - if (EVFLAG) { - if (EFLAG) { - flt_t evdwl; - if (!ONETYPE) { - c5 = p2e[ijtype].c5; - c6 = p2e[ijtype].c6; - } - evdwl = (c5 * rp - c6 * rq) * expsrainv; - sevdwl += evdwl; - if (eatom) { - fwtmp += (acc_t)0.5 * evdwl; - fjtmp += (acc_t)0.5 * evdwl; - } - } - IP_PRE_ev_tally_nbor(vflag, (flt_t)1.0, fpair, - -delx, -dely, -delz); + if (EFLAG) { + flt_t evdwl; + if (!ONETYPE) { + c5 = p2e[ijtype].c5; + c6 = p2e[ijtype].c6; + } + evdwl = (c5 * rp - c6 * rq) * expsrainv; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + fjtmp += (flt_t)0.5 * evdwl; + } } /*---------------------------------------------*/ @@ -533,17 +500,13 @@ void PairSWIntel::eval(const int offload, const int vflag, fjytmp += fjy; fjztmp += fjz; - if (EVFLAG) { - if (EFLAG) { - const flt_t evdwl = facrad * (flt_t)0.5; - sevdwl += evdwl; - if (eatom) { - fwtmp += (acc_t)0.33333333 * evdwl; - fjtmp += (acc_t)0.33333333 * facrad; - } + if (EFLAG) { + const flt_t evdwl = facrad * (flt_t)0.5; + sevdwl += evdwl; + if (eatom) { + fwtmp += (acc_t)0.33333333 * evdwl; + fjtmp += (acc_t)0.33333333 * facrad; } - IP_PRE_ev_tally_nbor3v(vflag, fjx, fjy, fjz, - delx, dely, delz); } } // for kk const int j = tj[jj]; @@ -557,34 +520,31 @@ void PairSWIntel::eval(const int offload, const int vflag, f[i].x += fxtmp; f[i].y += fytmp; f[i].z += fztmp; - IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp); + + if (EFLAG) { + f[i].w += fwtmp; + oevdwl += sevdwl; + } } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, 
ov5); } // end omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -595,7 +555,7 @@ void PairSWIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -614,7 +574,7 @@ authors for more details. ------------------------------------------------------------------------- */ -template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t> +template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, const int astart, @@ -659,7 +619,7 @@ void PairSWIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -701,19 +661,17 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { - int iifrom, iito, tid; - IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth); + int iifrom, iip, iito, tid; + IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads, + swidth); + iifrom += astart; iito += astart; @@ -760,7 +718,7 @@ void PairSWIntel::eval(const int offload, const int vflag, 144,160,176,192,208,224,240); ilist = ilist + iifrom; acc_t * const dforce = &(f[0].x); - for (int i = iifrom; i < iito; i += swidth) { + for (int i = iifrom; i < iito; i += iip) { SIMD_mask imask = ilist < iito; SIMD_flt_t xtmp, ytmp, ztmp; SIMD_int itype, itype_offset; @@ -793,20 +751,10 @@ void PairSWIntel::eval(const int offload, const int vflag, if (EFLAG) fwtmp2 = SIMD_set((acc_t)0); } - SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - if (EVFLAG) { - if (EFLAG) { - fwtmp = SIMD_set((acc_t)0); - sevdwl = SIMD_set((acc_t)0); - } - if (vflag==1) { - sv0 = SIMD_set((acc_t)0); - sv1 = SIMD_set((acc_t)0); - sv2 = SIMD_set((acc_t)0); - sv3 = SIMD_set((acc_t)0); - sv4 = SIMD_set((acc_t)0); - sv5 = SIMD_set((acc_t)0); - } + SIMD_acc_t sevdwl; + if (EFLAG) { + fwtmp = SIMD_set((acc_t)0); + sevdwl = SIMD_set((acc_t)0); } SIMD_int ejnum = SIMD_set(0); @@ -930,19 +878,15 @@ void PairSWIntel::eval(const int offload, const int 
vflag, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, fztmp2, fjxtmp2, fjytmp2, fjztmp2); - if (EVFLAG) { - if (EFLAG) { - if (!ONETYPE) { - c5 = SIMD_gather(&(p2e[0].c5), ijtype); - c6 = SIMD_gather(&(p2e[0].c6), ijtype); - } - SIMD_flt_t evdwl; - evdwl = (c5 * rp - c6 * rq) * expsrainv; - SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2); - } - SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely, - delz, sv0, sv1, sv2, sv3, sv4, sv5); + if (EFLAG) { + if (!ONETYPE) { + c5 = SIMD_gather(&(p2e[0].c5), ijtype); + c6 = SIMD_gather(&(p2e[0].c6), ijtype); + } + SIMD_flt_t evdwl; + evdwl = (c5 * rp - c6 * rq) * expsrainv; + SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2); } } @@ -1012,21 +956,15 @@ void PairSWIntel::eval(const int offload, const int vflag, fztmp2, fjxtmp2, fjytmp2, fjztmp2, tf + kcoffset * 3, swidth); - if (EVFLAG) { - if (EFLAG) { - SIMD_int k; - if (eatom) { - k = SIMD_load(tj + kcoffset); - k = k << 4; - } - SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2, k, dforce); + if (EFLAG) { + SIMD_int k; + if (eatom) { + k = SIMD_load(tj + kcoffset); + k = k << 4; } - SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz, - delx, dely, delz, delr2x, delr2y, delr2z, - sv0, sv1, sv2, sv3, sv4, sv5); + SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2, k, dforce); } - } // for kk if (is_same<flt_t,acc_t>::value == 1) SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp); @@ -1087,52 +1025,34 @@ void PairSWIntel::eval(const int offload, const int vflag, } // for jj second loop SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, - EVFLAG, eatom, fwtmp); + EFLAG, eatom, fwtmp); if (is_same<flt_t,acc_t>::value == 0) { imask = imask >> 8; SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, - fztmp2, EVFLAG, eatom, fwtmp2); - } - if (EVFLAG) { - if (EFLAG) oevdwl += SIMD_sum(sevdwl); - if (vflag == 1) { - ov0 += SIMD_sum(sv0); - ov1 += SIMD_sum(sv1); - ov2 += SIMD_sum(sv2); - ov3 += SIMD_sum(sv3); - ov4 += SIMD_sum(sv4); - ov5 += SIMD_sum(sv5); - } + fztmp2, EFLAG, eatom, fwtmp2); } - ilist = ilist + swidth; + if (EFLAG) oevdwl += SIMD_sum(sevdwl); + ilist = ilist + iip; } // for ii - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(1, EVFLAG, EFLAG, vflag, eatom, nall, nlocal, - minlocal, nthreads, f_start, f_stride, x, - offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end omp - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -1143,7 +1063,7 @@ void PairSWIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else 
fix->add_result_array(f_start, 0, offload); @@ -1212,6 +1132,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, #ifdef LMP_USE_AVXCD fix->nbor_pack_width(SIMD_type<flt_t>::width()); #endif + fix->three_body_neighbor(1); int off_ccache = 0; #ifdef _LMP_INTEL_OFFLOAD diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h index 8723803a35..b55022328f 100644 --- a/src/USER-INTEL/pair_sw_intel.h +++ b/src/USER-INTEL/pair_sw_intel.h @@ -46,7 +46,7 @@ class PairSWIntel : public PairSW { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t> + template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend, const int pad_width); diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index 88354ec4d0..f59a6b7c96 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -119,32 +119,30 @@ void PairTersoffIntel::compute(int eflag, int vflag, if (ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); + int packthreads; + if (nthreads > INTEL_HTHREADS) packthreads = nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - nthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - if (evflag || vflag_fdotr) { - int ovflag = 0; - if (vflag_fdotr) ovflag = 2; - else if (vflag) ovflag = 1; - if (eflag) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); - } else { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); - } + int ovflag = 0; + if (vflag_fdotr) ovflag = 2; + else if (vflag) ovflag = 1; + if (eflag) { + eval<1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,1>(1, 0, buffers, fc, 0, offload_end); - eval<0,0,1>(0, 0, buffers, fc, host_start, inum); + eval<0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0>(0, ovflag, buffers, fc, host_start, inum); } } @@ -202,7 +200,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> ); // perform the actual computation - template<bool EVFLAG, bool EFLAG> + template<bool EFLAG> static void kernel( int iito, int iifrom, int eatom, int vflag, const int * _noalias const numneigh, @@ -213,11 +211,11 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5 + acc_t *evdwl ); // perform one step of calculation, pass in i-j pairs of atoms (is, js) - template<int EVFLAG, int EFLAG> + template<int EFLAG> static void kernel_step( int eatom, int vflag, const int * _noalias const numneigh, @@ -228,13 +226,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> 
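
Another change that recurs across the styles in this patch is visible in the tersoff hunk above: buffer packing now computes packthreads and opens the region with #pragma omp parallel if(packthreads > 1), so hosts with few hardware threads per core skip the fork/join cost and pack serially, while many-thread hosts such as KNL keep packing in parallel. INTEL_HTHREADS appears to be the hyper-threading threshold from intel_preprocess.h. A minimal sketch of the conditional-parallel idiom (the threshold value here is illustrative):

  #include <omp.h>
  #include <cstdio>

  int main() {
    const int INTEL_HTHREADS_SKETCH = 2;   // stand-in for the real constant
    const int nthreads = omp_get_max_threads();
    const int packthreads = (nthreads > INTEL_HTHREADS_SKETCH) ? nthreads : 1;

    // With if(packthreads > 1) false, the region runs on the master thread
    // only: omp_get_num_threads() reports 1 inside it.
    #pragma omp parallel if(packthreads > 1)
    {
      #pragma omp single
      printf("packing with %d thread(s)\n", omp_get_num_threads());
    }
    return 0;
  }
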
const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - avec *vsevdwl, avec *vsv0, avec * vsv1, avec *vsv2, avec* vsv3, avec *vsv4, avec *vsv5, - int compress_idx, iarr is, iarr js, bvec vmask_repulsive + avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive ); // perform one step of calculation, as opposed to the previous method now // with fixed i and a number of js - template<int EVFLAG, int EFLAG> + template<int EFLAG> static void kernel_step_const_i( int eatom, int vflag, const int * _noalias const numneigh, const int * _noalias const cnumneigh, @@ -243,8 +240,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - avec *vsevdwl, avec *vsv0, avec *vsv1, avec *vsv2, avec *vsv3, avec *vsv4, avec *vsv5, - int compress_idx, int i, iarr js, bvec vmask_repulsive + avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive ); }; @@ -257,7 +253,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // Dispatch to correct kernel instantiation and perform all the work necessary // for offloading. In this routine we enter the Phi. // This method is nearly identical to what happens in the other /intel styles -template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> +template <int EFLAG, class flt_t, class acc_t> void PairTersoffIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, @@ -292,7 +288,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; - IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag, + IP_PRE_get_transfern(ago, 1, EFLAG, vflag, buffers, offload, fix, separate_flag, x_size, q_size, ev_size, f_stride); @@ -330,20 +326,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag, #endif #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, + f_stride, x, 0); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EFLAG) oevdwl = oecoul = (acc_t)0; + if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(f_start,f_stride,nlocal,nall,minlocal) \ - reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iito, tid; @@ -355,10 +347,10 @@ void PairTersoffIntel::eval(const int offload, const int vflag, memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); { - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - sevdwl = sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = 0.; + acc_t sevdwl; + sevdwl = 0.; #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \ - firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl, &sv0, &sv1, &sv2, &sv3, &sv4, &sv5 + firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl // Pick the variable i algorithm under specific conditions // do use scalar algorithm with very short vectors int VL =
lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL; @@ -366,50 +358,34 @@ void PairTersoffIntel::eval(const int offload, const int vflag, lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops; bool use_scalar = VL < 4; if (use_scalar) { - IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EVFLAG,EFLAG>(ARGS); + IntelKernelTersoff<flt_t,acc_t,lmp_intel::NONE,false>::kernel<EFLAG>(ARGS); } else if (pack_i) { - IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EVFLAG,EFLAG>(ARGS); + IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,true >::kernel<EFLAG>(ARGS); } else { - IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EVFLAG,EFLAG>(ARGS); - } - if (EVFLAG) { - if (EFLAG) oevdwl += sevdwl; - if (vflag == 1) { - ov0 += sv0; - ov1 += sv1; - ov2 += sv2; - ov3 += sv3; - ov4 += sv4; - ov5 += sv5; - } + IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS); } + if (EFLAG) oevdwl += sevdwl; } - #ifndef _LMP_INTEL_OFFLOAD - if (vflag == 2) - #endif - { - #if defined(_OPENMP) - #pragma omp barrier - #endif - IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG, EFLAG, vflag, eatom, nall, - nlocal, minlocal, nthreads, f_start, f_stride, - x, offload); - } + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region - if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = 0.0; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, + ov0, ov1, ov2, ov3, ov4, ov5); + + if (EFLAG) { + ev_global[0] = oevdwl; + ev_global[1] = 0.0; + } + if (vflag) { + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #ifdef _LMP_INTEL_OFFLOAD @@ -424,7 +400,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, else fix->stop_watch(TIME_HOST_PAIR); - if (EVFLAG) + if (EFLAG || vflag) fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag); else fix->add_result_array(f_start, 0, offload); @@ -457,6 +433,7 @@ void PairTersoffIntel::init_style() fix = static_cast<FixIntel *>(modify->fix[ifix]); fix->pair_init_check(); + fix->three_body_neighbor(1); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); #endif @@ -663,7 +640,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, static const int N_CACHE = 8; template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> -template<int EVFLAG, int EFLAG> +template<int EFLAG> void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( int eatom, int vflag, const int * _noalias const numneigh, const int * _noalias const cnumneigh, @@ -673,12 +650,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, avec *vsevdwl, - avec *vsv0, - avec *vsv1, - avec *vsv2, - avec* vsv3, - avec *vsv4, - avec *vsv5, int compress_idx, iarr is, iarr js, @@ -829,21 +800,11 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair; vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair; - if (EVFLAG) { - if (EFLAG) { - *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); - if 
(eatom) { - v::store(fw, (v_0_5 * vevdwl)); - } + if (EFLAG) { + *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); + if (eatom) { + v::store(fw, (v_0_5 * vevdwl)); } - if (vflag == 1) { - *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair); - *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair); - *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair); - *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair); - *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair); - *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair); - } } { while (cache_idx-- > 0) { @@ -933,7 +894,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } @@ -945,7 +906,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } @@ -954,7 +915,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( // Specialized kernel step for fixed i, means that we don't have to use the // convoluted iteration scheme above, as the loop variables are uniform. template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> -template<int EVFLAG, int EFLAG> +template<int EFLAG> void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( int eatom, int vflag, const int * _noalias const numneigh, const int * _noalias const cnumneigh, @@ -964,12 +925,6 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, avec *vsevdwl, - avec *vsv0, - avec *vsv1, - avec *vsv2, - avec* vsv3, - avec *vsv4, - avec *vsv5, int compress_idx, int i, iarr js, @@ -1097,22 +1052,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair); vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair); - if (EVFLAG) { - if (EFLAG) { - *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); - if (eatom) { - vfwtmp = v_0_5 * vevdwl; - v::store(fw, vfwtmp); - } + if (EFLAG) { + *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); + if (eatom) { + vfwtmp = v_0_5 * vevdwl; + v::store(fw, vfwtmp); } - if (vflag == 1) { - *vsv0 = v::acc_mask_add(*vsv0, vmask, *vsv0, vdx_ij * vdx_ij * vfpair); - *vsv1 = v::acc_mask_add(*vsv1, vmask, *vsv1, vdy_ij * vdy_ij * vfpair); - *vsv2 = v::acc_mask_add(*vsv2, vmask, *vsv2, vdz_ij * vdz_ij * vfpair); - *vsv3 = v::acc_mask_add(*vsv3, vmask, *vsv3, vdx_ij * vdy_ij * vfpair); - *vsv4 = v::acc_mask_add(*vsv4, vmask, *vsv4, vdx_ij * vdz_ij * vfpair); - *vsv5 = v::acc_mask_add(*vsv5, vmask, *vsv5, vdy_ij * vdz_ij * vfpair); - } } while (cache_idx-- > 0) { fvec vfkx = vprefactor * vfkx_cache[cache_idx]; @@ -1169,20 +1114,20 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( f[t_].x += fx[t]; f[t_].y += fy[t]; f[t_].z += fz[t]; - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[t_].w += fw[t]; } } f[i].x += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfxtmp, v::zero())); f[i].y += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfytmp, v::zero())); f[i].z += 
v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfztmp, v::zero())); - if (EVFLAG && EFLAG && eatom) { + if (EFLAG && eatom) { f[i].z += v::acc_reduce_add(v::acc_mask_add(v::acc_zero(), vmask, vfwtmp, v::zero())); } } template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> -template<bool EVFLAG, bool EFLAG> +template<bool EFLAG> void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( int iito, int iifrom, int eatom, int vflag, const int * _noalias const numneigh, @@ -1193,14 +1138,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - acc_t *evdwl, acc_t *ov0, acc_t * ov1, acc_t *ov2, acc_t* ov3, acc_t *ov4, acc_t *ov5 + acc_t *evdwl ) { int compress_idx = 0; int ii, jj; iarr is, js; avec vsevdwl = v::acc_zero(); - avec vsv0 = v::acc_zero(), vsv1 = v::acc_zero(), vsv2 = v::acc_zero(); - avec vsv3 = v::acc_zero(), vsv4 = v::acc_zero(), vsv5 = v::acc_zero(); ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4)); ivec vj, v_NEIGHMASK(NEIGHMASK); bvec vmask_repulsive(0); @@ -1237,11 +1180,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (pack_i) { if (compress_idx == v::VL) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - kernel_step<EVFLAG,EFLAG>( + kernel_step<EFLAG>( eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); compress_idx = 0; @@ -1250,11 +1193,11 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( } else { if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - kernel_step_const_i<EVFLAG,EFLAG>( + kernel_step_const_i<EFLAG>( eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, i, js, vmask_repulsive ); compress_idx = 0; @@ -1265,26 +1208,16 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( } if (compress_idx > 0) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); - IntelKernelTersoff::kernel_step<EVFLAG,EFLAG>( + IntelKernelTersoff::kernel_step<EFLAG>( eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, &vsv0, &vsv1, &vsv2, &vsv3, &vsv4, &vsv5, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); } - if (EVFLAG) { - if (EFLAG) { - *evdwl += v::acc_reduce_add(vsevdwl); - } - if (vflag == 1) { - *ov0 += v::acc_reduce_add(vsv0); - *ov1 += v::acc_reduce_add(vsv1); - *ov2 += v::acc_reduce_add(vsv2); - *ov3 += v::acc_reduce_add(vsv3); - *ov4 += v::acc_reduce_add(vsv4); - *ov5 += v::acc_reduce_add(vsv5); - } + if (EFLAG) { + *evdwl += v::acc_reduce_add(vsevdwl); } } diff --git a/src/USER-INTEL/pair_tersoff_intel.h b/src/USER-INTEL/pair_tersoff_intel.h index c9604f2797..c725487ae7 100644 --- a/src/USER-INTEL/pair_tersoff_intel.h +++ b/src/USER-INTEL/pair_tersoff_intel.h @@ -79,7 +79,7 @@ class PairTersoffIntel : public PairTersoff { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> + template <int EFLAG, class flt_t, class acc_t> 
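(The kernel() driver above gathers surviving neighbor indices into short is/js buffers and flushes a full SIMD batch to kernel_step or kernel_step_const_i whenever v::VL entries have accumulated, with one masked call for the remainder. The pattern in miniature; keep_sketch and flush_sketch are hypothetical stand-ins:)

// Compress-and-flush batching (sketch of the loop structure in kernel()).
int idx = 0;
int buf[VL];                                  // VL = SIMD vector length
for (int j = 0; j < jnum; j++) {
  if (keep_sketch(j)) buf[idx++] = j;         // compress survivors
  if (idx == VL) { flush_sketch(buf, VL); idx = 0; }   // full batch
}
if (idx > 0) flush_sketch(buf, idx);          // masked remainder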
void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp new file mode 100644 index 0000000000..110649f8ee --- /dev/null +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -0,0 +1,3034 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include <mpi.h> +#include <stdlib.h> +#include <math.h> +#include "pppm_disp_intel.h" +#include "atom.h" +#include "error.h" +#include "fft3d_wrap.h" +#include "gridcomm.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" +#include "suffix.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; +enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE}; +enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM, + FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G, + FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A, + FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, + FORWARD_AD_PERATOM_NONE}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : + PPPMDisp(lmp, narg, arg) +{ + suffix_flag |= Suffix::INTEL; + + order = 7; + order_6 = 7; //sets default stencil sizes to 7 + + perthread_density = NULL; + particle_ekx = particle_eky = particle_ekz = NULL; + particle_ekx0 = particle_eky0 = particle_ekz0 = NULL; + particle_ekx1 = particle_eky1 = particle_ekz1 = NULL; + particle_ekx2 = particle_eky2 = particle_ekz2 = NULL; + particle_ekx3 = particle_eky3 = particle_ekz3 = NULL; + particle_ekx4 = particle_eky4 = particle_ekz4 = NULL; + particle_ekx5 = particle_eky5 = particle_ekz5 = NULL; + particle_ekx6 = particle_eky6 = particle_ekz6 = NULL; + + rho_lookup = drho_lookup = NULL; + rho6_lookup = drho6_lookup = NULL; + rho_points = 0; + + _use_table = _use_packing = _use_lrt = 0; +} + +PPPMDispIntel::~PPPMDispIntel() +{ + memory->destroy(perthread_density); + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + + memory->destroy(rho_lookup); + memory->destroy(drho_lookup); + memory->destroy(rho6_lookup); + memory->destroy(drho6_lookup); +} + + + +/* ---------------------------------------------------------------------- + called once before run 
+------------------------------------------------------------------------- */ + + +void PPPMDispIntel::init() +{ + + PPPMDisp::init(); + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast<FixIntel *>(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->kspace_init_check(); + + _use_lrt = fix->lrt(); + if (_use_lrt) + error->all(FLERR, + "LRT mode is currently not supported for pppm/disp/intel"); + + + // For vectorization, we need some padding in the end + // The first thread computes on the global density + if ((comm->nthreads > 1) && !_use_lrt) { + memory->destroy(perthread_density); + memory->create(perthread_density, comm->nthreads-1, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:perthread_density"); + } + + _use_table = fix->pppm_table(); + if (_use_table) { + rho_points = 5000; + memory->destroy(rho_lookup); + memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho_lookup"); + memory->destroy(rho6_lookup); + memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho6_lookup"); + + if(differentiation_flag == 1) { + memory->destroy(drho_lookup); + memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho_lookup"); + memory->destroy(drho6_lookup); + memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho6_lookup"); + } + precompute_rho(); + } + if (order > INTEL_P3M_MAXORDER) + error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); +} + +/* ---------------------------------------------------------------------- + compute the PPPMDispIntel long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMDispIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + PPPMDisp::compute(eflag, vflag); + return; + } + #endif + int i; + // convert atoms from box to lamda coords + + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = evflag_atom = eflag_global = vflag_global = + eflag_atom = vflag_atom = 0; + + if (evflag_atom && !peratom_allocate_flag) { + allocate_peratom(); + if (function[0]) { + cg_peratom->ghost_notify(); + cg_peratom->setup(); + } + if (function[1] + function[2] + function[3]) { + cg_peratom_6->ghost_notify(); + cg_peratom_6->setup(); + } + peratom_allocate_flag = 1; + } + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + + if (function[0]) memory->destroy(part2grid); + if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6); + if (differentiation_flag == 1) { + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + if (function[2] == 1){ + memory->destroy(particle_ekx0); + memory->destroy(particle_eky0); + memory->destroy(particle_ekz0); + memory->destroy(particle_ekx1); + memory->destroy(particle_eky1); + memory->destroy(particle_ekz1); + memory->destroy(particle_ekx2); + memory->destroy(particle_eky2); + memory->destroy(particle_ekz2); + memory->destroy(particle_ekx3); + memory->destroy(particle_eky3); + memory->destroy(particle_ekz3); + memory->destroy(particle_ekx4); + memory->destroy(particle_eky4); + 
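(A note on the rho_lookup/rho6_lookup tables allocated in init() above when fix->pppm_table() is enabled: they replace the per-atom Horner evaluation of the B-spline stencil weights with a table fetch, trading memory and a quantization of the fractional offset onto one of rho_points rows for fewer FLOPs and vector-friendly loads. A sketch of the two paths, using the names that appear in the spreading routines later in this file:)

// Table off: order-many Horner steps per atom and dimension.
FFT_SCALAR r1 = ZEROF;
for (int l = order-1; l >= 0; l--) r1 = rho_coeff[l][k] + r1*dx;

// Table on: quantize dx, then load a whole precomputed weight row.
int idx = dx*half_rho_scale + half_rho_scale_plus;
for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++)
  rho[0][k] = rho_lookup[idx][k];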
memory->destroy(particle_ekz4); + memory->destroy(particle_ekx5); + memory->destroy(particle_eky5); + memory->destroy(particle_ekz5); + memory->destroy(particle_ekx6); + memory->destroy(particle_eky6); + memory->destroy(particle_ekz6); + } + + } + nmax = atom->nmax; + if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid"); + if (function[1] + function[2] + function[3]) + memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6"); + if (differentiation_flag == 1) { + memory->create(particle_ekx, nmax, "pppmdispintel:pekx"); + memory->create(particle_eky, nmax, "pppmdispintel:peky"); + memory->create(particle_ekz, nmax, "pppmdispintel:pekz"); + if (function[2] == 1){ + memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0"); + memory->create(particle_eky0, nmax, "pppmdispintel:peky0"); + memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0"); + memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1"); + memory->create(particle_eky1, nmax, "pppmdispintel:peky1"); + memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1"); + memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2"); + memory->create(particle_eky2, nmax, "pppmdispintel:peky2"); + memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2"); + memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3"); + memory->create(particle_eky3, nmax, "pppmdispintel:peky3"); + memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3"); + memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4"); + memory->create(particle_eky4, nmax, "pppmdispintel:peky4"); + memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4"); + memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5"); + memory->create(particle_eky5, nmax, "pppmdispintel:peky5"); + memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5"); + memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6"); + memory->create(particle_eky6, nmax, "pppmdispintel:peky6"); + memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6"); + } + } + } + energy = 0.0; + energy_1 = 0.0; + energy_6 = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0; + + // find grid points for all my particles + // distribute particles' charges/dispersion coefficients on the grid + // communication between processors and remapping to the FFT decomposition + // solution of Poisson's equation in k-space and back-transformation + // communication between processors + // calculation of forces + + if (function[0]) { + + //perform calculations for coulomb interactions only + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_mixed_buffers()); + make_rho_c<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, + nzlo_out, nxhi_out, nyhi_out, nzhi_out, + fix->get_double_buffers()); + make_rho_c<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_single_buffers()); + make_rho_c<float,float>(fix->get_single_buffers()); + } + + cg->reverse_comm(this,REVERSE_RHO); + + brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + density_brick, density_fft, work1,remap); + + if (differentiation_flag == 1) {
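(compute() repeats one dispatch idiom for every kernel above and below: the runtime precision mode of the intel fix picks one of three template instantiations. Reduced to its shape, with worker_sketch as a hypothetical stand-in:)

// Precision dispatch used throughout compute() (sketch).
template <class flt_t, class acc_t>
void worker_sketch(IntelBuffers<flt_t,acc_t> *buffers);

void dispatch_precision_sketch(FixIntel *fix) {
  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
    worker_sketch<float,double>(fix->get_mixed_buffers());
  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    worker_sketch<double,double>(fix->get_double_buffers());
  else
    worker_sketch<float,float>(fix->get_single_buffers());
}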
+ poisson_ad(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, + v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); + + cg->forward_comm(this,FORWARD_AD); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_c_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); + + } else { + poisson_ik(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2, + vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2, + u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, + v5_brick); + + cg->forward_comm(this, FORWARD_IK); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_c_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); + } + if (evflag_atom) fieldforce_c_peratom(); + } + + if (function[1]) { + //perform calculations for geometric mixing + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_g<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_g<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_g<float,float>(fix->get_single_buffers()); + } + + + cg_6->reverse_comm(this, REVERSE_RHO_G); + + brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, + density_brick_g, density_fft_g, work1_6,remap_6); + + if (differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, + nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, + nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, + nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, + virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, + v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_AD_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ad<double,double>(fix->get_double_buffers()); + } else { +
fieldforce_g_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); + + } else { + poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6, + fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, + vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, + v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_IK_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_g_ik<float,float>(fix->get_single_buffers()); + } + + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); + } + if (evflag_atom) fieldforce_g_peratom(); + } + + if (function[2]) { + //perform calculations for arithmetic mixing + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_a<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_a<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_a<float,float>(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_A); + + brick2fft_a(); + + if ( differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, + u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, + v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, + v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, + v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, + v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, + v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, + v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, + v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, + v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, + v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, + v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_AD_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + 
fieldforce_a_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_a_ad<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); + + } else { + + poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, + fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, + virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, + v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, + vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, + vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, + v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, + u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, + v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, + vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, + vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, + v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, + u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, + v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, + vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, + vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, + v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, + u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, + v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_IK_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_a_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); + } + if (evflag_atom) fieldforce_a_peratom(); + } + + if (function[3]) { + //perform calculations if no mixing rule applies + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_none<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_none<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_none<float,float>(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_NONE); + + brick2fft_none(); + + if (differentiation_flag == 1) { + + int n = 0; + for (int k = 0; k<nsplit_alloc/2; k++) { + poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1], + u_brick_none[n],u_brick_none[n+1], + v0_brick_none, v1_brick_none, v2_brick_none, + v3_brick_none, v4_brick_none, v5_brick_none); + n += 2; + } + 
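(Here and in the arithmetic-mixing branch above, the split dispersion densities are handed to the Poisson solvers two at a time (n, n+1). Presumably the poisson_2s*/poisson_none* routines inherited from PPPMDisp use the standard two-for-one trick for real transforms; the idea in sketch form, not the actual solver code:)

// Two real grids a and b are packed into one complex array,
//   c[j] = a[j] + i*b[j],
// a single complex FFT is taken, and the two spectra are separated
// afterwards using Hermitian symmetry:
//   A(k) = ( C(k) + conj(C(N-k)) ) / 2
//   B(k) = ( C(k) - conj(C(N-k)) ) / (2i)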
+ cg_6->forward_comm(this,FORWARD_AD_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_none_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); + + } else { + int n = 0; + for (int k = 0; k<nsplit_alloc/2; k++) { + + poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1], + vdx_brick_none[n], vdy_brick_none[n], + vdz_brick_none[n], vdx_brick_none[n+1], + vdy_brick_none[n+1], vdz_brick_none[n+1], + u_brick_none, v0_brick_none, v1_brick_none, + v2_brick_none, v3_brick_none, v4_brick_none, + v5_brick_none); + n += 2; + } + + cg_6->forward_comm(this,FORWARD_IK_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_none_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) + cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); + } + if (evflag_atom) fieldforce_none_peratom(); + } + + // update qsum and qsqsum, if atom count has changed and energy needed + + if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // sum energy across procs and add in volume-dependent term + + const double qscale = force->qqrd2e * scale; + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_1 = energy_all; + MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_6 = energy_all; + + energy_1 *= 0.5*volume; + energy_6 *= 0.5*volume; + + energy_1 -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij + + 1.0/12.0*pow(g_ewald_6,6)*csum; + energy_1 *= qscale; + } + + // sum virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i]; + if (function[1]+function[2]+function[3]){ + double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij; + virial[0] -= a; + virial[1] -= a; + virial[2] -= a; + } + } + + if (eflag_atom) { + if (function[0]) { + double *q = atom->q; + for (i = 0; i < atom->nlocal; i++) { + eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]* + qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction + } + } + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] + + 1.0/12.0*pow(g_ewald_6,6)*cii[tmp]; + } + } + } + + if (vflag_atom) { + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + //dispersion self virial correction + for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)* + pow(g_ewald_6,3)*csumi[tmp]; + } + } + } + + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + if (function[0]) energy += energy_1; + if 
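(For reference, the global energy assembly above applies the usual Ewald-style self and neutralizing corrections after the k-space sums. In the notation of the code, with qsum = sum of q_i, qsqsum = sum of q_i^2, MY_PIS = sqrt(pi), and MY_PI2 = pi/2, the Coulomb part computes

E_1 \leftarrow E_1 - \frac{g_{\mathrm{ewald}}}{\sqrt{\pi}} \sum_i q_i^2 - \frac{\pi}{2\, g_{\mathrm{ewald}}^2\, V} \Big(\sum_i q_i\Big)^2

while the dispersion part adds the analogous g_ewald_6 terms weighted by the csum/csumij coefficient sums.)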
(function[1] + function[2] + function[3]) energy += energy_6; + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + + +/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t> +void PPPMDispIntel::particle_map(double delx, double dely, double delz, + double sft, int** p2g, int nup, int nlow, + int nxlo, int nylo, int nzlo, + int nxhi, int nyhi, int nzhi, + IntelBuffers<flt_t,acc_t> *buffers) +{ + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + int flag = 0; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\ + nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt) + #endif + { + double **x = atom->x; + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delx; + const flt_t yi = dely; + const flt_t zi = delz; + const flt_t fshift = sft; + + + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:flag) + #endif + for (int i = iifrom; i < iito; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET; + int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET; + int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET; + + p2g[i][0] = nx; + p2g[i][1] = ny; + p2g[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlow < nxlo || nx+nup > nxhi || + ny+nlow < nylo || ny+nup > nyhi || + nz+nlow < nzlo || nz+nup > nzhi) + flag = 1; + } + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick[nzlo_out][nylo_out][nxlo_out]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + //double *q = atom->q; + //double **x = atom->x; + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, 
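(particle_map() above leans on the OFFSET constant, 16384 at the top of this file, because C++ float-to-int conversion truncates toward zero rather than rounding down. Two lines make the point:)

int wrong = static_cast<int>(-0.75);                      // yields 0
int right = static_cast<int>(-0.75 + 16384.0) - 16384;    // yields -1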
global_density) if(!_use_lrt) + #endif + { + double *q = atom->q; + double **x = atom->x; + + const int nix = nxhi_out - nxlo_out + 1; + const int niy = nyhi_out - nylo_out + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + const flt_t fshiftone = shiftone; + const flt_t fdelvolinv = delvolinv; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nysum = nlower + ny - nylo_out; + int nxsum = nlower + nx - nxlo_out; + int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + } + } + + FFT_SCALAR z0 = fdelvolinv * q[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- geometric mixing +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> 
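(make_rho_c() above avoids atomic updates by privatizing the grid: thread 0 scatters into the global density brick, every other thread into its own perthread_density copy, and a second parallel region folds the copies back in, each thread reducing its own slice. The shape of the pattern, using the names from the code:)

// Privatize-then-reduce (sketch of make_rho_c above).
FFT_SCALAR *mine = (tid == 0) ? global_density
                              : perthread_density[tid - 1];
memset(mine, 0, ngrid * sizeof(FFT_SCALAR));
// ... this thread scatters its atoms into mine ...
// second parallel region: reduce a slice of the grid per thread
for (int i = ifrom; i < ito; i++)
  for (int j = 1; j < nthr; j++)
    global_density[i] += perthread_density[j-1][i];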
*buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv * B[type]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + 
for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- arithmetic mixing +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) +{ + // clear 3d density array + + memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + for (int i = 0; i < nlocal; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + const int type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + FFT_SCALAR y0 = 
z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR w = x0*rho[0][l]; + density_brick_a0[mz][my][mx] += w*B[7*type]; + density_brick_a1[mz][my][mx] += w*B[7*type+1]; + density_brick_a2[mz][my][mx] += w*B[7*type+2]; + density_brick_a3[mz][my][mx] += w*B[7*type+3]; + density_brick_a4[mz][my][mx] += w*B[7*type+4]; + density_brick_a5[mz][my][mx] += w*B[7*type+5]; + density_brick_a6[mz][my][mx] += w*B[7*type+6]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- case when mixing rules don't apply +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) +{ + + FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? 
global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + FFT_SCALAR w0 = x0*rho[0][l]; + for(int k = 0; k < nsplit; k++) + my_density[mzyx + k*ngrid_6] += w0; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *q = atom->q; + double
**x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho_lookup[idx][k]; + rho1[k] = rho_lookup[idy][k]; + rho2[k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1 = rho_coeff[order-1][k]; + FFT_SCALAR r2 = rho_coeff[order-1][k]; + FFT_SCALAR r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + + rho0[k-nlower] = r1; + rho1[k-nlower] = r2; + rho2[k-nlower] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left"
grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm/xprd; + const flt_t hy_inv = ny_pppm/yprd; + const flt_t hz_inv = nz_pppm/zprd; + + const flt_t fsf_coeff0 = sf_coeff[0]; + const flt_t fsf_coeff1 = sf_coeff[1]; + const flt_t fsf_coeff2 = sf_coeff[2]; + const flt_t fsf_coeff3 = sf_coeff[3]; + const flt_t fsf_coeff4 = sf_coeff[4]; + const flt_t fsf_coeff5 = sf_coeff[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + drho[0][k] = drho_lookup[idx][k]; + drho[1][k] = drho_lookup[idy][k]; + drho[2][k] = drho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff[order-1][k]; + r2 = rho_coeff[order-1][k]; + r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1 * dx; + r2 = rho_coeff[l][k] + r2 * dy; + r3 = rho_coeff[l][k] + r3 * dz; + dr1 = drho_coeff[l][k] + dr1 * dx; + dr2 = drho_coeff[l][k] + dr2 * dy; + dr3 = drho_coeff[l][k] + dr3 * dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + drho[0][k-nlower] = dr1; + drho[1][k-nlower] = dr2; + drho[2][k-nlower] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = 
particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoqsq; + f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoqsq; + f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoqsq; + + if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for geometric mixing rule +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + 
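+        // dx (and dy, dz below) is the fractional offset of atom i from
+        // its stencil origin; when use_table is set, rescaling by
+        // half_rho_scale and truncating to int rounds to the nearest
+        // precomputed row of rho6_lookup instead of re-evaluating the
+        // spline polynomials per atom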
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick_g[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + type = atom->type[i]; + lj = B[type]; + f[i][0] += lj*ekx; + f[i][1] += lj*eky; + if (slabflag != 2) f[i][2] += lj*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for geometric mixing rule for ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + 
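+      // grid quantities with a _6 suffix (delxinv_6, shiftone_6, order_6,
+      // nx_pppm_6, sf_coeff_6, ...) belong to the separate dispersion
+      // (1/r^6) mesh, which may differ in spacing and stencil order from
+      // the Coulomb mesh used by the _c_ kernels above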
const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < 
INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const int type = atom->type[i]; + const flt_t lj = B[type]; + const flt_t twoljsq = 2.*lj*lj; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoljsq; + f[i][0] += lj * particle_ekx[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoljsq; + f[i][1] += lj * particle_eky[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoljsq; + + if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + 
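+            // Horner's rule: the k-th weight w_k(d) = sum_l rho_coeff_6[l][k]*d^l
+            // is accumulated from the highest power down, one multiply-add
+            // per coefficient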
r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx]; + eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx]; + ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx]; + ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx]; + eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx]; + ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx]; + ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx]; + eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx]; + ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx]; + ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx]; + eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx]; + ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx]; + ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx]; + eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx]; + ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx]; + ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx]; + eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx]; + ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx]; + ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx]; + eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx]; + ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx]; + } + } + } + + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + + for (int l = 0; l < 
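+        // arithmetic mixing splits each pair dispersion coefficient into
+        // seven terms, B[7*type] .. B[7*type+6], each with its own brick
+        // set (a0 .. a6); the 21 lane-wise accumulators above are summed
+        // here before the per-term force conversion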
INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx0 += ekx0_arr[l]; + eky0 += eky0_arr[l]; + ekz0 += ekz0_arr[l]; + ekx1 += ekx1_arr[l]; + eky1 += eky1_arr[l]; + ekz1 += ekz1_arr[l]; + ekx2 += ekx2_arr[l]; + eky2 += eky2_arr[l]; + ekz2 += ekz2_arr[l]; + ekx3 += ekx3_arr[l]; + eky3 += eky3_arr[l]; + ekz3 += ekz3_arr[l]; + ekx4 += ekx4_arr[l]; + eky4 += eky4_arr[l]; + ekz4 += ekz4_arr[l]; + ekx5 += ekx5_arr[l]; + eky5 += eky5_arr[l]; + ekz5 += ekz5_arr[l]; + ekx6 += ekx6_arr[l]; + eky6 += eky6_arr[l]; + ekz6 += ekz6_arr[l]; + } + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + + lj4*ekx4 + lj5*ekx5 + lj6*ekx6; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + + lj4*eky4 + lj5*eky5 + lj6*eky6; + if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0; + FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0; + FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0; + FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1; + FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1; + FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1; + FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2; + FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2; + FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2; + FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3; + FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3; + FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3; + FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4; + FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4; + FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4; + FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5; + FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5; + FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5; + FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6; + FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6; + FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double 
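+    // one per-particle field array per mixing term (21 in all, hoisted
+    // above the parallel region) lets the force-conversion pass below run
+    // as a separate vectorized loop over atoms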
**x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = 
{0}; + _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF; + particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; + particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF; + particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF; + particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF; + particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF; + particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR x0 = drho[0][l] * ekx_p; + FFT_SCALAR y0 = rho[0][l] * eky_p; + FFT_SCALAR z0 = rho[0][l] * ekz_p; + + ekx0[l] += x0 * u_brick_a0[mz][my][mx]; + eky0[l] += y0 * u_brick_a0[mz][my][mx]; + ekz0[l] += z0 * u_brick_a0[mz][my][mx]; + ekx1[l] += x0 * u_brick_a1[mz][my][mx]; + eky1[l] += y0 * u_brick_a1[mz][my][mx]; + ekz1[l] += z0 * u_brick_a1[mz][my][mx]; + ekx2[l] += x0 * u_brick_a2[mz][my][mx]; + eky2[l] += y0 * u_brick_a2[mz][my][mx]; + ekz2[l] += z0 * u_brick_a2[mz][my][mx]; + ekx3[l] += x0 * u_brick_a3[mz][my][mx]; + eky3[l] += y0 * u_brick_a3[mz][my][mx]; + ekz3[l] += z0 * u_brick_a3[mz][my][mx]; + ekx4[l] += x0 * u_brick_a4[mz][my][mx]; + eky4[l] += y0 * u_brick_a4[mz][my][mx]; + ekz4[l] += z0 * u_brick_a4[mz][my][mx]; + ekx5[l] += x0 * u_brick_a5[mz][my][mx]; + eky5[l] += y0 * u_brick_a5[mz][my][mx]; + ekz5[l] += z0 * u_brick_a5[mz][my][mx]; + ekx6[l] += x0 * u_brick_a6[mz][my][mx]; + eky6[l] += y0 * u_brick_a6[mz][my][mx]; + ekz6[l] += z0 * u_brick_a6[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx0[i] += ekx0[l]; + particle_eky0[i] += eky0[l]; + particle_ekz0[i] += ekz0[l]; + particle_ekx1[i] += ekx1[l]; + particle_eky1[i] += eky1[l]; + particle_ekz1[i] += ekz1[l]; + particle_ekx2[i] += ekx2[l]; + particle_eky2[i] += eky2[l]; + particle_ekz2[i] += ekz2[l]; + particle_ekx3[i] += ekx3[l]; + particle_eky3[i] += eky3[l]; + particle_ekz3[i] += ekz3[l]; + particle_ekx4[i] += ekx4[l]; + particle_eky4[i] += eky4[l]; + particle_ekz4[i] += ekz4[l]; + particle_ekx5[i] += ekx5[l]; + particle_eky5[i] += eky5[l]; + particle_ekz5[i] += ekz5[l]; + particle_ekx6[i] += ekx6[l]; + particle_eky6[i] += eky6[l]; + particle_ekz6[i] += ekz6[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx0[i] *= hx_inv; + particle_eky0[i] *= hy_inv; + particle_ekz0[i] *= hz_inv; + particle_ekx1[i] *= hx_inv; + particle_eky1[i] *= 
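+        // the spline derivatives are taken with respect to the grid
+        // coordinate, so each component is rescaled by h*_inv =
+        // n_pppm_6/prd to convert the gradient to simulation units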
hy_inv; + particle_ekz1[i] *= hz_inv; + particle_ekx2[i] *= hx_inv; + particle_eky2[i] *= hy_inv; + particle_ekz2[i] *= hz_inv; + particle_ekx3[i] *= hx_inv; + particle_eky3[i] *= hy_inv; + particle_ekz3[i] *= hz_inv; + particle_ekx4[i] *= hx_inv; + particle_eky4[i] *= hy_inv; + particle_ekz4[i] *= hz_inv; + particle_ekx5[i] *= hx_inv; + particle_eky5[i] *= hy_inv; + particle_ekz5[i] *= hz_inv; + particle_ekx6[i] *= hx_inv; + particle_eky6[i] *= hy_inv; + particle_ekz6[i] *= hz_inv; + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] + + lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] + + lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] + + lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] + + lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + if (slabflag != 2) + f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] + + lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] + + lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + 
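+        // mixing rule "none": each atom type carries nsplit independent
+        // dispersion coefficients B[nsplit*type + k], deposited on and
+        // read back from its own brick set (vd*_brick_none[k] below)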
nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + + _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdx_brick_none[k][mz][my][mx]; + eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdy_brick_none[k][mz][my][mx]; + ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdz_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx[nsplit], 64); + _alignvar(FFT_SCALAR eky[nsplit], 64); + _alignvar(FFT_SCALAR ekz[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx[k] = eky[k] = ekz[k] = ZEROF; + } + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int k = 0; k < nsplit; k++) { + ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + } + } + + // convert E-field to force + + type = atom->type[i]; + for (int k = 0; k < nsplit; k++) { + lj = B[nsplit*type + k]; + f[i][0] += lj*ekx[k]; + f[i][1] += lj*eky[k]; + if (slabflag != 2) f[i][2] += lj*ekz[k]; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving 
stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx[k]=eky[k]=ekz[k]=ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if 
defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * + u_brick_none[k][mz][my][mx]; + eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * eky_p * + u_brick_none[k][mz][my][mx]; + ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * ekz_p * + u_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx_tot[nsplit], 64); + _alignvar(FFT_SCALAR eky_tot[nsplit], 64); + _alignvar(FFT_SCALAR ekz_tot[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + } + } + + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] *= hx_inv; + eky_tot[k] *= hy_inv; + ekz_tot[k] *= hz_inv; + } + // convert D-field to force + + const int type = atom->type[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1); + sf1 += fsf_coeff1 * sin(ffour_pi * s1); + + flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2); + sf2 += fsf_coeff3 * sin(ffour_pi * s2); + + flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3); + sf3 += fsf_coeff5 * sin(ffour_pi * s3); + for (int k = 0; k < nsplit; k++) { + const flt_t lj = B[nsplit*type + k]; + const flt_t twoljsq = lj*lj * B[k] * 2; + flt_t sf = sf1*twoljsq; + f[i][0] += lj * ekx_tot[k] - sf; + sf = sf2*twoljsq; + f[i][1] += lj * eky_tot[k] - sf; + sf = sf3*twoljsq; + if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + precompute rho coefficients as a lookup table to save time in make_rho + and fieldforce. Instead of doing this polynomial for every atom 6 times + per time step, precompute it for some number of points. +------------------------------------------------------------------------- */ + +void PPPMDispIntel::precompute_rho() +{ + + half_rho_scale = (rho_points - 1.)/2.; + half_rho_scale_plus = half_rho_scale + 0.5; + + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; + } + rho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; + } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; + } + } + } + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. 
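+    // the dispersion tables are sampled the same way: rho_points equally
+    // spaced offsets dx in [-1,1]; since half_rho_scale = (rho_points-1)/2,
+    // the fieldforce kernels recover the nearest sample with
+    // int(dx*half_rho_scale + half_rho_scale_plus)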
+ 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-1; l>=0; l--){ + r1 = rho_coeff_6[l][k] + r1*dx; + } + rho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho6_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-2; l>=0; l--){ + r1 = drho_coeff_6[l][k] + r1*dx; + } + drho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho6_lookup[i][k] = 0; + } + } + } +} + +/* ---------------------------------------------------------------------- + Returns 0 if Intel optimizations for PPPM ignored due to offload +------------------------------------------------------------------------- */ + +#ifdef _LMP_INTEL_OFFLOAD +int PPPMDispIntel::use_base() { + return _use_base; +} +#endif diff --git a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h new file mode 100644 index 0000000000..166152004e --- /dev/null +++ b/src/USER-INTEL/pppm_disp_intel.h @@ -0,0 +1,238 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/disp/intel,PPPMDispIntel) + +#else + +#ifndef LMP_PPPMINTEL_DISP_H +#define LMP_PPPMINTEL_DISP_H + +#include "pppm_disp.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + + class PPPMDispIntel : public PPPMDisp { + public: + PPPMDispIntel(class LAMMPS *, int, char **); + virtual ~PPPMDispIntel(); + virtual void init(); + virtual void compute(int, int); + + #ifdef _LMP_INTEL_OFFLOAD + int use_base(); + #endif + + protected: + FixIntel *fix; + + int _use_lrt; + FFT_SCALAR **perthread_density; + FFT_SCALAR *particle_ekx; + FFT_SCALAR *particle_eky; + FFT_SCALAR *particle_ekz; + FFT_SCALAR *particle_ekx0; + FFT_SCALAR *particle_eky0; + FFT_SCALAR *particle_ekz0; + FFT_SCALAR *particle_ekx1; + FFT_SCALAR *particle_eky1; + FFT_SCALAR *particle_ekz1; + FFT_SCALAR *particle_ekx2; + FFT_SCALAR *particle_eky2; + FFT_SCALAR *particle_ekz2; + FFT_SCALAR *particle_ekx3; + FFT_SCALAR *particle_eky3; + FFT_SCALAR *particle_ekz3; + FFT_SCALAR *particle_ekx4; + FFT_SCALAR *particle_eky4; + FFT_SCALAR *particle_ekz4; + FFT_SCALAR *particle_ekx5; + FFT_SCALAR *particle_eky5; + FFT_SCALAR *particle_ekz5; + FFT_SCALAR *particle_ekx6; + FFT_SCALAR *particle_eky6; + FFT_SCALAR *particle_ekz6; + + + + int _use_table; + int rho_points; + FFT_SCALAR **rho_lookup; + FFT_SCALAR **rho6_lookup; + FFT_SCALAR **drho_lookup; + FFT_SCALAR **drho6_lookup; + FFT_SCALAR half_rho_scale, half_rho_scale_plus; + + int _use_packing; + + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template<class flt_t, class acc_t> + void particle_map(double, double, double, + double, int **, int, int, + int, int, int, + int, int, int, + IntelBuffers<flt_t,acc_t> *buffers); + + template<class flt_t, class acc_t, int use_table> + void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_c<flt_t,acc_t,1>(buffers); + } else { + make_rho_c<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_g<flt_t,acc_t,1>(buffers); + } else { + make_rho_g<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_a<flt_t,acc_t,1>(buffers); + } else { + make_rho_a<flt_t,acc_t,0>(buffers); + } + } + + + template<class flt_t, class acc_t, int use_table> + void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_none<flt_t,acc_t,1>(buffers); + } else { + make_rho_none<flt_t,acc_t,0>(buffers); + } + } + + + template<class flt_t, class acc_t, int use_table> + void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + 
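+      // runtime flag -> compile-time constant: each two-parameter wrapper
+      // forwards to a specialization with use_table baked in as a template
+      // int, so the hot loops compile with the branch removed instead of
+      // testing it per atom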
fieldforce_c_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_c_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_c_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_c_ad<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_g_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_g_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_g_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_g_ad<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_a_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_a_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_a_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_a_ad<flt_t,acc_t,0>(buffers); + } + } + template<class flt_t, class acc_t, int use_table> + void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_none_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_none_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_none_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_none_ad<flt_t,acc_t,0>(buffers); + } + } + + void precompute_rho(); + + }; + +} +#endif +#endif + + diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index c420a23bf4..42bdec46ee 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -12,7 +12,9 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Rodrigo Canales (RWTH Aachen University) + Contributing authors: William McDoniel (RWTH Aachen University) + Rodrigo Canales (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -22,6 +24,7 @@ #include "pppm_intel.h" #include "atom.h" #include "error.h" +#include "fft3d_wrap.h" #include "gridcomm.h" #include "math_const.h" #include "math_special.h" @@ -54,10 +57,37 @@ enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) { suffix_flag |= Suffix::INTEL; + + order = 7; //sets default stencil size to 7 + + perthread_density = NULL; + particle_ekx = particle_eky = particle_ekz = NULL; + + rho_lookup = drho_lookup = NULL; + rho_points = 0; + + vdxy_brick = vdz0_brick = NULL; + work3 = NULL; + cg_pack = NULL; + + _use_table = _use_packing = _use_lrt = 0; } PPPMIntel::~PPPMIntel() { + memory->destroy(perthread_density); + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + + memory->destroy(rho_lookup); + memory->destroy(drho_lookup); + + memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->destroy(work3); + + delete cg_pack; } /* ---------------------------------------------------------------------- @@ -83,17 +113,64 @@ void PPPMIntel::init() fix->kspace_init_check(); + _use_lrt = fix->lrt(); + + // For vectorization, we need some padding in the end + // The first thread computes on the global density + if ((comm->nthreads > 1) && !_use_lrt) { + memory->destroy(perthread_density); + memory->create(perthread_density, comm->nthreads-1, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:perthread_density"); + } + + _use_table = fix->pppm_table(); + if (_use_table) { + rho_points = 5000; + memory->destroy(rho_lookup); + memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:rho_lookup"); + if(differentiation_flag == 1) { + memory->destroy(drho_lookup); + memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmintel:drho_lookup"); + } + precompute_rho(); + } + if (order > INTEL_P3M_MAXORDER) error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); - /* - if (fix->precision() == FixIntel::PREC_MODE_MIXED) - pack_force_const(force_const_single, fix->get_mixed_buffers()); - else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) - pack_force_const(force_const_double, fix->get_double_buffers()); - else - pack_force_const(force_const_single, fix->get_single_buffers()); - */ + _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) + && (sizeof(FFT_SCALAR) == sizeof(float)) + && (differentiation_flag == 0); + if (_use_packing) { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdxy_brick"); + memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); + memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdz0_brick"); + memory->destroy(work3); + memory->create(work3, 2*nfft_both, "pppmintel:work3"); + + // new communicator for the double-size bricks + delete cg_pack; + int (*procneigh)[2] = comm->procneigh; + cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in, 
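+    // _use_packing (order 7, single precision, 16-lane vectors, ik
+    // differentiation) packs the x and y field components into the single
+    // vdxy_brick, hence the doubled x extents here sizing cg_pack for the
+    // two-wide bricks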
+ nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1, + nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); + + cg_pack->ghost_notify(); + cg_pack->setup(); + } } /* ---------------------------------------------------------------------- @@ -154,8 +231,18 @@ void PPPMIntel::compute_first(int eflag, int vflag) if (atom->nmax > nmax) { memory->destroy(part2grid); + if (differentiation_flag == 1) { + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + } nmax = atom->nmax; memory->create(part2grid,nmax,3,"pppm:part2grid"); + if (differentiation_flag == 1) { + memory->create(particle_ekx, nmax, "pppmintel:pekx"); + memory->create(particle_eky, nmax, "pppmintel:peky"); + memory->create(particle_ekz, nmax, "pppmintel:pekz"); + } } // find grid points for all my particles @@ -184,13 +271,19 @@ void PPPMIntel::compute_first(int eflag, int vflag) // return gradients (electric fields) in 3d brick decomposition // also performs per-atom calculations via poisson_peratom() - poisson(); + if (differentiation_flag == 1) poisson_ad(); + else poisson_ik_intel(); // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); - else cg->forward_comm(this,FORWARD_IK); + else { + if (_use_packing) + cg_pack->forward_comm(this,FORWARD_IK); + else + cg->forward_comm(this,FORWARD_IK); + } // extra per-atom energy/virial communication @@ -297,48 +390,60 @@ void PPPMIntel::compute_second(int eflag, int vflag) template<class flt_t, class acc_t> void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers) { - int nx,ny,nz; - ATOM_T * _noalias const x = buffers->get_x(0); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; int flag = 0; if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshift = shift; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) reduction(+:flag) if(!_use_lrt) #endif - for (int i = 0; i < nlocal; i++) { - - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // current particle coord can be outside global and local box - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET; - ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET; - nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET; - - part2grid[i][0] = nx; - part2grid[i][1] = ny; - part2grid[i][2] = nz; - - // check that entire stencil around nx,ny,nz will fit in my 3d brick - - if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || - ny+nlower < nylo_out || ny+nupper > nyhi_out || - nz+nlower < nzlo_out || nz+nupper > nzhi_out) - flag = 1; + { + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); + + #if defined(LMP_SIMD_COMPILER) + #pragma 
vector aligned + #pragma simd reduction(+:flag) + #endif + for (int i = iifrom; i < iito; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + int nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET; + int ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET; + int nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET; + + part2grid[i][0] = nx; + part2grid[i][1] = ny; + part2grid[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; + } } if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); @@ -352,13 +457,11 @@ void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers) in global grid ------------------------------------------------------------------------- */ -template<class flt_t, class acc_t> +template<class flt_t, class acc_t, int use_table> void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) { - // clear 3d density array - - memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, - ngrid*sizeof(FFT_SCALAR)); + FFT_SCALAR * _noalias global_density = + &(density_brick[nzlo_out][nylo_out][nxlo_out]); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -368,52 +471,129 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) ATOM_T * _noalias const x = buffers->get_x(0); flt_t * _noalias const q = buffers->get_q(0); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshift = shift; - const flt_t fshiftone = shiftone; - const flt_t fdelvolinv = delvolinv; - - for (int i = 0; i < nlocal; i++) { - - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; - - flt_t rho[3][INTEL_P3M_MAXORDER]; - - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + const int nix = nxhi_out - nxlo_out + 1; + const int niy = nyhi_out - nylo_out + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + const flt_t fshiftone = shiftone; + const flt_t fdelvolinv = delvolinv; - for (int l = order-1; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? 
+ global_density : perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nysum = nlower + ny - nylo_out; + int nxsum = nlower + nx - nxlo_out; + int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + } + } + + FFT_SCALAR z0 = fdelvolinv * q[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; } + } - FFT_SCALAR z0 = fdelvolinv * q[i]; - for (int n = nlower; n <= nupper; n++) { - int mz = n+nz; - FFT_SCALAR y0 = z0*rho[2][n-nlower]; - for (int m = nlower; m <= nupper; m++) { - int my = m+ny; - FFT_SCALAR x0 = y0*rho[1][m-nlower]; - for (int l = nlower; l <= nupper; l++) { - int mx = l+nx; - density_brick[mz][my][mx] += x0*rho[0][l-nlower]; + // reduce all the perthread_densities into global_density + if (nthr > 1) { + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; } } } @@ -424,7 +604,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -template<class flt_t, class acc_t> +template<class flt_t, class acc_t, int use_table, int use_packing> void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) { // loop over my charges, interpolate electric field from nearby grid points @@ -437,68 +617,151 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; 
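+  // select the thread count for this kernel: in LRT mode the long-range
+  // calculation already runs on its own dedicated thread, so only one
+  // thread is used here; otherwise the atom loop below is divided across
+  // all OpenMP threads via IP_PRE_omp_range_id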
+ int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = qqrd2e * scale; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nontemporal - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) #endif - for (int i = 0; i < nlocal; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; - - flt_t rho[3][INTEL_P3M_MAXORDER]; - - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1 = rho_coeff[order-1][k]; - FFT_SCALAR r2 = rho_coeff[order-1][k]; - FFT_SCALAR r3 = rho_coeff[order-1][k]; - for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; + { + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nxsum = (use_packing ? 
2 : 1) * (nx + nlower);
+      int nysum = ny + nlower;
+      int nzsum = nz + nlower;
+
+      FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
+      FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
+      FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;
+
+      if (use_table) {
+        dx = dx*half_rho_scale + half_rho_scale_plus;
+        int idx = dx;
+        dy = dy*half_rho_scale + half_rho_scale_plus;
+        int idy = dy;
+        dz = dz*half_rho_scale + half_rho_scale_plus;
+        int idz = dz;
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) {
+          if (use_packing) {
+            rho0[2 * k] = rho_lookup[idx][k];
+            rho0[2 * k + 1] = rho_lookup[idx][k];
+          } else {
+            rho0[k] = rho_lookup[idx][k];
+          }
+          rho1[k] = rho_lookup[idy][k];
+          rho2[k] = rho_lookup[idz][k];
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma simd
+        #endif
+        for (int k = nlower; k <= nupper; k++) {
+          FFT_SCALAR r1 = rho_coeff[order-1][k];
+          FFT_SCALAR r2 = rho_coeff[order-1][k];
+          FFT_SCALAR r3 = rho_coeff[order-1][k];
+          for (int l = order-2; l >= 0; l--) {
+            r1 = rho_coeff[l][k] + r1*dx;
+            r2 = rho_coeff[l][k] + r2*dy;
+            r3 = rho_coeff[l][k] + r3*dz;
+          }
+          if (use_packing) {
+            rho0[2 * (k-nlower)] = r1;
+            rho0[2 * (k-nlower) + 1] = r1;
+          } else {
+            rho0[k-nlower] = r1;
+          }
+          rho1[k-nlower] = r2;
+          rho2[k-nlower] = r3;
+        }
+      }
-      rho[0][k-nlower] = r1;
-      rho[1][k-nlower] = r2;
-      rho[2][k-nlower] = r3;
-    }
-    FFT_SCALAR ekx, eky, ekz;
-    ekx = eky = ekz = ZEROF;
-    for (int n = nlower; n <= nupper; n++) {
-      int mz = n+nz;
-      FFT_SCALAR z0 = rho[2][n-nlower];
-      for (int m = nlower; m <= nupper; m++) {
-        int my = m+ny;
-        FFT_SCALAR y0 = z0*rho[1][m-nlower];
-        for (int l = nlower; l <= nupper; l++) {
-          int mx = l+nx;
-          FFT_SCALAR x0 = y0*rho[0][l-nlower];
-          ekx -= x0*vdx_brick[mz][my][mx];
-          eky -= x0*vdy_brick[mz][my][mx];
-          ekz -= x0*vdz_brick[mz][my][mx];
+      _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekxy_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+      _alignvar(FFT_SCALAR ekz0_arr[2 * INTEL_P3M_ALIGNED_MAXORDER], 64) = {0};
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma loop_count=7
+      #endif
+      for (int n = 0; n < order; n++) {
+        int mz = n+nzsum;
+        FFT_SCALAR z0 = rho2[n];
+        #if defined(LMP_SIMD_COMPILER)
+        #pragma loop_count=7
+        #endif
+        for (int m = 0; m < order; m++) {
+          int my = m+nysum;
+          FFT_SCALAR y0 = z0*rho1[m];
+          #if defined(LMP_SIMD_COMPILER)
+          #pragma simd
+          #endif
+          for (int l = 0; l < (use_packing ?
2 : 1) * + INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + if (use_packing) { + ekxy_arr[l] -= x0*vdxy_brick[mz][my][mx]; + ekz0_arr[l] -= x0*vdz0_brick[mz][my][mx]; + } else { + ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; + } + } } } - } - // convert E-field to force + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + if (use_packing) { + for (int l = 0; l < 2*INTEL_P3M_ALIGNED_MAXORDER; l += 2) { + ekx += ekxy_arr[l]; + eky += ekxy_arr[l+1]; + ekz += ekz0_arr[l]; + } + } else { + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + } - const flt_t qfactor = fqqrd2es * q[i]; - f[i].x += qfactor*ekx; - f[i].y += qfactor*eky; - if (slabflag != 2) f[i].z += qfactor*ekz; + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + f[i].x += qfactor*ekx; + f[i].y += qfactor*eky; + if (slabflag != 2) f[i].z += qfactor*ekz; + } } } @@ -506,7 +769,7 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) interpolate from grid to get electric field & force on my particles for ad ------------------------------------------------------------------------- */ -template<class flt_t, class acc_t> +template<class flt_t, class acc_t, int use_table> void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) { // loop over my charges, interpolate electric field from nearby grid points @@ -519,118 +782,434 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) const flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; + int nthr; + if (_use_lrt) + nthr = 1; + else + nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = qqrd2e * scale; - - const double *prd = domain->prd; - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]; - - const flt_t hx_inv = nx_pppm/xprd; - const flt_t hy_inv = ny_pppm/yprd; - const flt_t hz_inv = nz_pppm/zprd; - - const flt_t fsf_coeff0 = sf_coeff[0]; - const flt_t fsf_coeff1 = sf_coeff[1]; - const flt_t fsf_coeff2 = sf_coeff[2]; - const flt_t fsf_coeff3 = sf_coeff[3]; - const flt_t fsf_coeff4 = sf_coeff[4]; - const flt_t fsf_coeff5 = sf_coeff[5]; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nontemporal - #pragma simd + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) #endif - for (int i = 0; i < nlocal; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; - - flt_t rho[3][INTEL_P3M_MAXORDER]; - flt_t drho[3][INTEL_P3M_MAXORDER]; - - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; - - r1 = rho_coeff[order-1][k]; - r2 = rho_coeff[order-1][k]; - r3 = rho_coeff[order-1][k]; - 
for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1 * dx; - r2 = rho_coeff[l][k] + r2 * dy; - r3 = rho_coeff[l][k] + r3 * dz; - dr1 = drho_coeff[l][k] + dr1 * dx; - dr2 = drho_coeff[l][k] + dr2 * dy; - dr3 = drho_coeff[l][k] + dr3 * dz; + { + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + const double *prd = domain->prd; + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + + const flt_t hx_inv = nx_pppm/xprd; + const flt_t hy_inv = ny_pppm/yprd; + const flt_t hz_inv = nz_pppm/zprd; + + const flt_t fsf_coeff0 = sf_coeff[0]; + const flt_t fsf_coeff1 = sf_coeff[1]; + const flt_t fsf_coeff2 = sf_coeff[2]; + const flt_t fsf_coeff3 = sf_coeff[3]; + const flt_t fsf_coeff4 = sf_coeff[4]; + const flt_t fsf_coeff5 = sf_coeff[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + drho[0][k] = drho_lookup[idx][k]; + drho[1][k] = drho_lookup[idy][k]; + drho[2][k] = drho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff[order-1][k]; + r2 = rho_coeff[order-1][k]; + r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1 * dx; + r2 = rho_coeff[l][k] + r2 * dy; + r3 = rho_coeff[l][k] + r3 * dz; + dr1 = drho_coeff[l][k] + dr1 * dx; + dr2 = drho_coeff[l][k] + dr2 * dy; + dr3 = drho_coeff[l][k] + dr3 * dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + drho[0][k-nlower] = dr1; + drho[1][k-nlower] = dr2; + drho[2][k-nlower] = dr3; + } + } + + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma 
simd
+          #endif
+          for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+            int mx = l + nxsum;
+            ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx];
+            eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx];
+            ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx];
+          }
+        }
+      }
+
+      #if defined(LMP_SIMD_COMPILER)
+      #pragma simd
+      #endif
+      for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) {
+        particle_ekx[i] += ekx[l];
+        particle_eky[i] += eky[l];
+        particle_ekz[i] += ekz[l];
+      }
+    }
+
+    #if defined(LMP_SIMD_COMPILER)
+    #pragma simd
+    #endif
+    for (int i = ifrom; i < ito; i++) {
+      particle_ekx[i] *= hx_inv;
+      particle_eky[i] *= hy_inv;
+      particle_ekz[i] *= hz_inv;
+
+      // convert E-field to force
+
+      const flt_t qfactor = fqqrd2es * q[i];
+      const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];
+
+      const flt_t s1 = x[i].x * hx_inv;
+      const flt_t s2 = x[i].y * hy_inv;
+      const flt_t s3 = x[i].z * hz_inv;
+      flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
+      sf += fsf_coeff1 * sin(ffour_pi * s1);
+      sf *= twoqsq;
+      f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf;
+
+      sf = fsf_coeff2 * sin(ftwo_pi * s2);
+      sf += fsf_coeff3 * sin(ffour_pi * s2);
+      sf *= twoqsq;
+      f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf;
+
+      sf = fsf_coeff4 * sin(ftwo_pi * s3);
+      sf += fsf_coeff5 * sin(ffour_pi * s3);
+      sf *= twoqsq;
+
+      if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik
+   in packing mode, interleaves the x/y and z/zero gradients into the
+   double-width vdxy/vdz0 bricks to avoid repeated copies
+------------------------------------------------------------------------- */
+
+void PPPMIntel::poisson_ik_intel()
+{
+  if (_use_packing == 0) {
+    poisson_ik();
+    return;
+  }
+
+  int i,j,k,n;
+  double eng;
+
+  // transform charge density (r -> k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n++] = density_fft[i];
+    work1[n++] = ZEROF;
+  }
+
+  fft1->compute(work1,work1,1);
+
+  // global energy and virial contribution
+
+  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  double s2 = scaleinv*scaleinv;
+
+  if (eflag_global || vflag_global) {
+    if (vflag_global) {
+      n = 0;
+      for (i = 0; i < nfft; i++) {
+        eng = s2 * greensfn[i] * (work1[n]*work1[n] +
+                                  work1[n+1]*work1[n+1]);
+        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
+        if (eflag_global) energy += eng;
+        n += 2;
+      }
+    } else {
+      n = 0;
+      for (i = 0; i < nfft; i++) {
+        energy +=
+          s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
+        n += 2;
+      }
+    }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n++] *= scaleinv * greensfn[i];
+    work1[n++] *= scaleinv * greensfn[i];
+  }
+
+  // extra FFTs for per-atom energy/virial
+
+  if (evflag_atom) poisson_peratom();
+
+  // triclinic system
+
+  if (triclinic) {
+    poisson_ik_triclinic();
+    return;
+  }
+
+  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdxy,vdz0 arrays
+
+  // x direction gradient
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work2[n] = fkx[i]*work1[n+1];
+        work2[n+1] = -fkx[i]*work1[n];
+        n += 2;
+      }
+
+  fft2->compute(work2,work2,-1);
+
+  // y direction gradient
+
+  n = 0;
+  for (k =
nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work3[n] = fky[j]*work1[n+1]; + work3[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work3,work3,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdxy_brick[k][j][2*i] = work2[n]; + vdxy_brick[k][j][2*i+1] = work3[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } - FFT_SCALAR ekx, eky, ekz; - ekx = eky = ekz = ZEROF; - for (int n = nlower; n <= nupper; n++) { - int mz = n+nz; - for (int m = nlower; m <= nupper; m++) { - int my = m+ny; - FFT_SCALAR ekx_p = rho[1][m-nlower] * rho[2][n-nlower]; - FFT_SCALAR eky_p = drho[1][m-nlower] * rho[2][n-nlower]; - FFT_SCALAR ekz_p = rho[1][m-nlower] * drho[2][n-nlower]; - for (int l = nlower; l <= nupper; l++) { - int mx = l+nx; - ekx += drho[0][l-nlower] * ekx_p * u_brick[mz][my][mx]; - eky += rho[0][l-nlower] * eky_p * u_brick[mz][my][mx]; - ekz += rho[0][l-nlower] * ekz_p * u_brick[mz][my][mx]; + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz0_brick[k][j][2*i] = work2[n]; + vdz0_brick[k][j][2*i+1] = 0.; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + precompute rho coefficients as a lookup table to save time in make_rho + and fieldforce. Instead of doing this polynomial for every atom 6 times + per time step, precompute it for some number of points. +------------------------------------------------------------------------- */ + +void PPPMIntel::precompute_rho() +{ + + half_rho_scale = (rho_points - 1.)/2.; + half_rho_scale_plus = half_rho_scale + 0.5; + + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; + } + rho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; } } - ekx *= hx_inv; - eky *= hy_inv; - ekz *= hz_inv; + } +} - // convert E-field to force +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ - const flt_t qfactor = fqqrd2es * q[i]; - const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; +void PPPMIntel::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +{ + int n = 0; + + if ((flag == FORWARD_IK) && _use_packing) { + FFT_SCALAR *xsrc = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out]; + FFT_SCALAR *zsrc = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = xsrc[list[i]]; + buf[n++] = zsrc[list[i]]; + } + } else { + PPPM::pack_forward(flag, buf, nlist, list); + } +} - const flt_t s1 = x[i].x * hx_inv; - const flt_t s2 = x[i].y * hy_inv; - const flt_t s3 = x[i].z * hz_inv; - flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); - sf += fsf_coeff1 * sin(ffour_pi * s1); - sf *= twoqsq; - f[i].x += qfactor * ekx - fqqrd2es * sf; +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ - sf = fsf_coeff2 * sin(ftwo_pi * s2); - sf += fsf_coeff3 * sin(ffour_pi * s2); - sf *= twoqsq; - f[i].y += qfactor * eky - fqqrd2es * sf; +void PPPMIntel::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list) +{ + int n = 0; + + if ((flag == FORWARD_IK) && _use_packing) { + FFT_SCALAR *xdest = &vdxy_brick[nzlo_out][nylo_out][2*nxlo_out]; + FFT_SCALAR *zdest = &vdz0_brick[nzlo_out][nylo_out][2*nxlo_out]; + for (int i = 0; i < nlist; i++) { + xdest[list[i]] = buf[n++]; + zdest[list[i]] = buf[n++]; + } + } else { + PPPM::unpack_forward(flag, buf, nlist, list); + } +} - sf = fsf_coeff4 * sin(ftwo_pi * s3); - sf += fsf_coeff5 * sin(ffour_pi * s3); - sf *= twoqsq; +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ - if (slabflag != 2) f[i].z += qfactor * ekz - fqqrd2es * sf; +double PPPMIntel::memory_usage() +{ + double bytes = PPPM::memory_usage(); + if ((comm->nthreads > 1) && !_use_lrt) { + bytes += (comm->nthreads - 1) * (ngrid + INTEL_P3M_ALIGNED_MAXORDER) * + sizeof(FFT_SCALAR); + } + if (differentiation_flag == 1) { + bytes += 3 * nmax * sizeof(FFT_SCALAR); + } + if (_use_table) { + bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR); + if (differentiation_flag == 1) { + bytes += rho_points * INTEL_P3M_ALIGNED_MAXORDER * sizeof(FFT_SCALAR); + } + } + if (_use_packing) { + bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + * (2 * nxhi_out + 
1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR); + bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1) + * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR); + bytes += 2 * nfft_both * sizeof(FFT_SCALAR); + bytes += cg_pack->memory_usage(); } + return bytes; } /* ---------------------------------------------------------------------- @@ -640,13 +1219,16 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) void PPPMIntel::pack_buffers() { fix->start_watch(TIME_PACK); + int packthreads; + if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads; + else packthreads = 1; #if defined(_OPENMP) - #pragma omp parallel default(none) + #pragma omp parallel if(packthreads > 1) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, - comm->nthreads, + packthreads, sizeof(IntelBuffers<float,double>::atom_t)); if (fix->precision() == FixIntel::PREC_MODE_MIXED) fix->get_mixed_buffers()->thr_pack(ifrom,ito,1); diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h index 40669a5561..89bc3998e0 100644 --- a/src/USER-INTEL/pppm_intel.h +++ b/src/USER-INTEL/pppm_intel.h @@ -1,4 +1,4 @@ -/* -*- c++ -*- ---------------------------------------------------------- +/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov @@ -12,7 +12,9 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Rodrigo Canales (RWTH Aachen University) + Contributing authors: William McDoniel (RWTH Aachen University) + Rodrigo Canales (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -36,6 +38,9 @@ class PPPMIntel : public PPPM { virtual ~PPPMIntel(); virtual void init(); virtual void compute(int, int); + virtual void pack_forward(int, FFT_SCALAR *, int, int *); + virtual void unpack_forward(int, FFT_SCALAR *, int, int *); + virtual double memory_usage(); void compute_first(int, int); void compute_second(int, int); void pack_buffers(); @@ -47,18 +52,74 @@ class PPPMIntel : public PPPM { protected: FixIntel *fix; + int _use_lrt; + FFT_SCALAR **perthread_density; + FFT_SCALAR *particle_ekx; + FFT_SCALAR *particle_eky; + FFT_SCALAR *particle_ekz; + + int _use_table; + int rho_points; + FFT_SCALAR **rho_lookup; + FFT_SCALAR **drho_lookup; + FFT_SCALAR half_rho_scale, half_rho_scale_plus; + + int _use_packing; + FFT_SCALAR ***vdxy_brick; + FFT_SCALAR ***vdz0_brick; + FFT_SCALAR *work3; + class GridComm *cg_pack; + #ifdef _LMP_INTEL_OFFLOAD int _use_base; #endif + template<class flt_t, class acc_t> + void test_function(IntelBuffers<flt_t,acc_t> *buffers); + + + void precompute_rho(); template<class flt_t, class acc_t> void particle_map(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> + template<class flt_t, class acc_t, int use_table> void make_rho(IntelBuffers<flt_t,acc_t> *buffers); template<class flt_t, class acc_t> + void make_rho(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho<flt_t,acc_t,1>(buffers); + } else { + make_rho<flt_t,acc_t,0>(buffers); + } + } + void poisson_ik_intel(); + template<class flt_t, class acc_t, int use_table, int use_packing> void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers); template<class flt_t, class acc_t> + void fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + if (_use_packing == 1) { + fieldforce_ik<flt_t, acc_t, 1, 1>(buffers); + } else { + fieldforce_ik<flt_t, acc_t, 1, 0>(buffers); + } + } else { + if (_use_packing == 1) { + fieldforce_ik<flt_t, acc_t, 0, 1>(buffers); + } else { + fieldforce_ik<flt_t, acc_t, 0, 0>(buffers); + } + } + } + template<class flt_t, class acc_t, int use_table> void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_ad<flt_t,acc_t,0>(buffers); + } + } }; } diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index afb7852f98..b44870e9b0 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -78,17 +78,17 @@ void VerletLRTIntel::init() setup before run ------------------------------------------------------------------------- */ -void VerletLRTIntel::setup() +void VerletLRTIntel::setup(int flag) { if (_intel_kspace == 0) { - Verlet::setup(); + Verlet::setup(flag); return; } #ifdef _LMP_INTEL_OFFLOAD if (_intel_kspace->use_base()) { _intel_kspace = 0; - Verlet::setup(); + Verlet::setup(flag); return; } #endif diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index a699c20796..0521b161c7 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -42,7 +42,7 @@ class VerletLRTIntel : public Verlet { VerletLRTIntel(class LAMMPS *, int, char **); virtual ~VerletLRTIntel(); virtual void init(); - virtual void setup(); + virtual void setup(int flag = 1); virtual void run(int); protected: diff --git a/src/atom.cpp 
b/src/atom.cpp index 6fa1cd8ef8..df4db0a842 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -40,6 +40,10 @@ #include "memory.h" #include "error.h" +#ifdef LMP_USER_INTEL +#include "neigh_request.h" +#endif + using namespace LAMMPS_NS; using namespace MathConst; @@ -1882,6 +1886,53 @@ void Atom::setup_sort_bins() bininvy = nbiny / (bboxhi[1]-bboxlo[1]); bininvz = nbinz / (bboxhi[2]-bboxlo[2]); + #ifdef LMP_USER_INTEL + int intel_neigh = 0; + if (neighbor->nrequest) { + if (neighbor->requests[0]->intel) intel_neigh = 1; + } else if (neighbor->old_nrequest) + if (neighbor->old_requests[0]->intel) intel_neigh = 1; + if (intel_neigh && userbinsize == 0.0) { + if (neighbor->binsizeflag) bininv = 1.0/neighbor->binsize_user; + + double nx_low = neighbor->bboxlo[0]; + double ny_low = neighbor->bboxlo[1]; + double nz_low = neighbor->bboxlo[2]; + double nxbbox = neighbor->bboxhi[0] - nx_low; + double nybbox = neighbor->bboxhi[1] - ny_low; + double nzbbox = neighbor->bboxhi[2] - nz_low; + int nnbinx = static_cast<int> (nxbbox * bininv); + int nnbiny = static_cast<int> (nybbox * bininv); + int nnbinz = static_cast<int> (nzbbox * bininv); + if (domain->dimension == 2) nnbinz = 1; + + if (nnbinx == 0) nnbinx = 1; + if (nnbiny == 0) nnbiny = 1; + if (nnbinz == 0) nnbinz = 1; + + double binsizex = nxbbox/nnbinx; + double binsizey = nybbox/nnbiny; + double binsizez = nzbbox/nnbinz; + + bininvx = 1.0 / binsizex; + bininvy = 1.0 / binsizey; + bininvz = 1.0 / binsizez; + + int lxo = (bboxlo[0] - nx_low) * bininvx; + int lyo = (bboxlo[1] - ny_low) * bininvy; + int lzo = (bboxlo[2] - nz_low) * bininvz; + bboxlo[0] = nx_low + static_cast<double>(lxo) / bininvx; + bboxlo[1] = ny_low + static_cast<double>(lyo) / bininvy; + bboxlo[2] = nz_low + static_cast<double>(lzo) / bininvz; + nbinx = static_cast<int>((bboxhi[0] - bboxlo[0]) * bininvx) + 1; + nbiny = static_cast<int>((bboxhi[1] - bboxlo[1]) * bininvy) + 1; + nbinz = static_cast<int>((bboxhi[2] - bboxlo[2]) * bininvz) + 1; + bboxhi[0] = bboxlo[0] + static_cast<double>(nbinx) / bininvx; + bboxhi[1] = bboxlo[1] + static_cast<double>(nbiny) / bininvy; + bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz; + } + #endif + if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); -- GitLab
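
Editor's addendum (not part of the patch): the lookup-table optimization that precompute_rho() introduces above is easier to see in isolation. The sketch below is a minimal, self-contained C++ illustration of the same idea under simplified assumptions: the names (ORDER, POINTS, weights_direct, weights_table) are invented for this example, the polynomial coefficients are dummy values (PPPM derives the real ones from B-spline recursions), and the table spans dx in [-1,1] with rounding to the nearest sample, as in the patch.

#include <cstdio>

static const int ORDER  = 7;     // stencil order (the patch's default)
static const int POINTS = 5000;  // tabulated dx samples (cf. rho_points)

// dummy stand-ins for PPPM's rho_coeff[l][k] polynomial coefficients
static double coeff[ORDER][ORDER];

// direct branch: Horner evaluation of a degree ORDER-1 polynomial for
// each of the ORDER stencil points, repeated per atom and per dimension
static void weights_direct(double dx, double *w) {
  for (int k = 0; k < ORDER; k++) {
    double r = 0.0;
    for (int l = ORDER - 1; l >= 0; l--)
      r = coeff[l][k] + r * dx;
    w[k] = r;
  }
}

static double half_scale, half_scale_plus;
static double table[POINTS][ORDER];

// one-time tabulation: sample dx uniformly on [-1,1] and store a full row
// of ORDER weights per sample, so a lookup is a unit-stride row copy
static void build_table() {
  half_scale = (POINTS - 1) / 2.0;
  half_scale_plus = half_scale + 0.5;   // the +0.5 rounds to nearest sample
  for (int i = 0; i < POINTS; i++)
    weights_direct(-1.0 + i / half_scale, table[i]);
}

// table branch: one multiply-add plus a truncating cast replaces all the
// Horner loops, trading a quantized dx for speed
static void weights_table(double dx, double *w) {
  int idx = (int)(dx * half_scale + half_scale_plus);
  for (int k = 0; k < ORDER; k++) w[k] = table[idx][k];
}

int main() {
  for (int l = 0; l < ORDER; l++)         // arbitrary demo coefficients
    for (int k = 0; k < ORDER; k++)
      coeff[l][k] = 0.01 * (l + 1) * (k - ORDER / 2);
  build_table();

  double wd[ORDER], wt[ORDER];
  const double dx = 0.3125;
  weights_direct(dx, wd);
  weights_table(dx, wt);
  for (int k = 0; k < ORDER; k++)
    printf("k=%d  direct=%+.6f  table=%+.6f\n", k, wd[k], wt[k]);
  return 0;
}

Padding each table row to a SIMD-aligned width, as the patch does with INTEL_P3M_ALIGNED_MAXORDER, additionally lets make_rho() and the fieldforce kernels run fixed-trip-count, aligned inner loops over the whole row instead of a variable-length loop from nlower to nupper.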