From 3c329d170791699f01f500213c48b57cb7cff38d Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer <akohlmey@gmail.com> Date: Mon, 19 Jun 2017 13:23:01 -0400 Subject: [PATCH] massive whitespace cleanup in USER-INTEL removed are: - DOS/Windows text format carriage return characters (^M) - tabs replaced with spaces (tabs are evil!!) - trailing whitespace --- src/USER-INTEL/angle_charmm_intel.cpp | 64 +- src/USER-INTEL/angle_charmm_intel.h | 4 +- src/USER-INTEL/angle_harmonic_intel.cpp | 64 +- src/USER-INTEL/angle_harmonic_intel.h | 4 +- src/USER-INTEL/bond_fene_intel.cpp | 64 +- src/USER-INTEL/bond_fene_intel.h | 4 +- src/USER-INTEL/bond_harmonic_intel.cpp | 54 +- src/USER-INTEL/bond_harmonic_intel.h | 4 +- src/USER-INTEL/dihedral_charmm_intel.cpp | 382 +- src/USER-INTEL/dihedral_charmm_intel.h | 6 +- src/USER-INTEL/dihedral_harmonic_intel.cpp | 134 +- src/USER-INTEL/dihedral_harmonic_intel.h | 6 +- src/USER-INTEL/dihedral_opls_intel.cpp | 138 +- src/USER-INTEL/dihedral_opls_intel.h | 4 +- src/USER-INTEL/fix_intel.cpp | 194 +- src/USER-INTEL/fix_intel.h | 46 +- src/USER-INTEL/fix_nh_intel.cpp | 74 +- src/USER-INTEL/fix_nh_intel.h | 2 +- src/USER-INTEL/fix_nve_asphere_intel.cpp | 40 +- src/USER-INTEL/fix_nve_intel.cpp | 46 +- src/USER-INTEL/improper_cvff_intel.cpp | 116 +- src/USER-INTEL/improper_cvff_intel.h | 4 +- src/USER-INTEL/improper_harmonic_intel.cpp | 74 +- src/USER-INTEL/improper_harmonic_intel.h | 4 +- src/USER-INTEL/intel_buffers.cpp | 48 +- src/USER-INTEL/intel_buffers.h | 46 +- src/USER-INTEL/intel_intrinsics.h | 166 +- src/USER-INTEL/intel_preprocess.h | 1194 ++-- src/USER-INTEL/intel_simd.h | 994 +-- src/USER-INTEL/math_extra_intel.h | 698 +- src/USER-INTEL/nbin_intel.cpp | 20 +- src/USER-INTEL/npair_full_bin_intel.cpp | 36 +- src/USER-INTEL/npair_full_bin_intel.h | 2 +- .../npair_half_bin_newton_intel.cpp | 12 +- .../npair_half_bin_newton_tri_intel.cpp | 12 +- src/USER-INTEL/npair_intel.cpp | 684 +- src/USER-INTEL/npair_intel.h | 4 +- src/USER-INTEL/pair_buck_coul_cut_intel.cpp | 138 +- src/USER-INTEL/pair_buck_coul_cut_intel.h | 6 +- src/USER-INTEL/pair_buck_coul_long_intel.cpp | 230 +- src/USER-INTEL/pair_buck_coul_long_intel.h | 6 +- src/USER-INTEL/pair_buck_intel.cpp | 136 +- src/USER-INTEL/pair_buck_intel.h | 8 +- src/USER-INTEL/pair_eam_intel.cpp | 510 +- src/USER-INTEL/pair_eam_intel.h | 8 +- src/USER-INTEL/pair_gayberne_intel.cpp | 470 +- .../pair_lj_charmm_coul_long_intel.cpp | 280 +- .../pair_lj_charmm_coul_long_intel.h | 6 +- .../pair_lj_cut_coul_long_intel.cpp | 246 +- src/USER-INTEL/pair_lj_cut_coul_long_intel.h | 6 +- src/USER-INTEL/pair_lj_cut_intel.cpp | 198 +- .../pair_lj_long_coul_long_intel.cpp | 100 +- src/USER-INTEL/pair_lj_long_coul_long_intel.h | 78 +- src/USER-INTEL/pair_sw_intel.cpp | 1078 +-- src/USER-INTEL/pair_sw_intel.h | 2 +- src/USER-INTEL/pair_tersoff_intel.cpp | 208 +- src/USER-INTEL/pair_tersoff_intel.h | 6 +- src/USER-INTEL/pppm_disp_intel.cpp | 6068 ++++++++--------- src/USER-INTEL/pppm_disp_intel.h | 476 +- src/USER-INTEL/pppm_intel.cpp | 126 +- src/USER-INTEL/pppm_intel.h | 4 +- src/USER-INTEL/verlet_lrt_intel.cpp | 36 +- 62 files changed, 7939 insertions(+), 7939 deletions(-) diff --git a/src/USER-INTEL/angle_charmm_intel.cpp b/src/USER-INTEL/angle_charmm_intel.cpp index 0c493646e3..d55afd4742 100644 --- a/src/USER-INTEL/angle_charmm_intel.cpp +++ b/src/USER-INTEL/angle_charmm_intel.cpp @@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t; /* ---------------------------------------------------------------------- */ 
-AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) +AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) { suffix_flag |= Suffix::INTEL; } @@ -74,8 +74,8 @@ void AngleCharmmIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void AngleCharmmIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -83,14 +83,14 @@ void AngleCharmmIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -103,9 +103,9 @@ void AngleCharmmIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void AngleCharmmIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void AngleCharmmIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nanglelist; @@ -133,7 +133,7 @@ void AngleCharmmIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -148,7 +148,7 @@ void AngleCharmmIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int4_t * _noalias const anglelist = + const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -246,35 +246,35 @@ void AngleCharmmIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; } if (NEWTON_BOND || i3 < nlocal) { f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].y += f3y; + f[i3].z += f3z; } } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, - i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, - dely1, delz1, delx2, dely2, delz2, seangle, - f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, seangle, + f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); - #else - IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, - i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, - dely1, delz1, delx2, dely2, delz2, oeangle, - f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + #else + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, + i3, f1x, f1y, f1z, f3x, f3y, f3z, delx1, + dely1, delz1, delx2, dely2, delz2, oeangle, + f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); #endif } @@ -282,8 +282,8 @@ void AngleCharmmIntel::eval(const int vflag, #ifdef LMP_INTEL_USE_SIMDOFF if (EFLAG) oeangle += seangle; if 
(VFLAG && vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; - ov3 += sv3; ov4 += sv4; ov5 += sv5; + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } #endif } // omp parallel @@ -291,7 +291,7 @@ void AngleCharmmIntel::eval(const int vflag, if (EFLAG) energy += oeangle; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -348,11 +348,11 @@ void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes, - Memory *memory) { + Memory *memory) { if (nangletypes != _nangletypes) { if (_nangletypes > 0) _memory->destroy(fc); - + if (nangletypes > 0) _memory->create(fc,nangletypes,"anglecharmmintel.fc"); } diff --git a/src/USER-INTEL/angle_charmm_intel.h b/src/USER-INTEL/angle_charmm_intel.h index a98007b3ef..342af31b8c 100644 --- a/src/USER-INTEL/angle_charmm_intel.h +++ b/src/USER-INTEL/angle_charmm_intel.h @@ -45,8 +45,8 @@ class AngleCharmmIntel : public AngleCharmm { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp index 198431d552..47e0add690 100644 --- a/src/USER-INTEL/angle_harmonic_intel.cpp +++ b/src/USER-INTEL/angle_harmonic_intel.cpp @@ -37,7 +37,7 @@ typedef struct { int a,b,c,t; } int4_t; /* ---------------------------------------------------------------------- */ -AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) +AngleHarmonicIntel::AngleHarmonicIntel(LAMMPS *lmp) : AngleHarmonic(lmp) { suffix_flag |= Suffix::INTEL; } @@ -74,8 +74,8 @@ void AngleHarmonicIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void AngleHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -83,14 +83,14 @@ void AngleHarmonicIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -103,9 +103,9 @@ void AngleHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void AngleHarmonicIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void AngleHarmonicIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nanglelist; 
@@ -133,7 +133,7 @@ void AngleHarmonicIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -148,7 +148,7 @@ void AngleHarmonicIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int4_t * _noalias const anglelist = + const int4_t * _noalias const anglelist = (int4_t *) neighbor->anglelist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -228,35 +228,35 @@ void AngleHarmonicIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].y += f1y; + f[i1].z += f1z; } - if (NEWTON_BOND || i2 < nlocal) { + if (NEWTON_BOND || i2 < nlocal) { f[i2].x -= f1x + f3x; - f[i2].y -= f1y + f3y; - f[i2].z -= f1z + f3z; + f[i2].y -= f1y + f3y; + f[i2].z -= f1z + f3z; } if (NEWTON_BOND || i3 < nlocal) { f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].y += f3y; + f[i3].z += f3z; } } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, - f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, - delz1, delx2, dely2, delz2, seangle, f, - NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, seangle, f, + NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); #else - IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, - f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, - delz1, delx2, dely2, delz2, oeangle, f, - NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, + IP_PRE_ev_tally_angle(EFLAG, VFLAG, eatom, vflag, eangle, i1, i2, i3, + f1x, f1y, f1z, f3x, f3y, f3z, delx1, dely1, + delz1, delx2, dely2, delz2, oeangle, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); #endif } @@ -264,8 +264,8 @@ void AngleHarmonicIntel::eval(const int vflag, #ifdef LMP_INTEL_USE_SIMDOFF if (EFLAG) oeangle += seangle; if (VFLAG && vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; - ov3 += sv3; ov4 += sv4; ov5 += sv5; + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } #endif } // omp parallel @@ -273,7 +273,7 @@ void AngleHarmonicIntel::eval(const int vflag, if (EFLAG) energy += oeangle; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -328,11 +328,11 @@ void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void AngleHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes, - Memory *memory) { + Memory *memory) { if (nangletypes != _nangletypes) { if (_nangletypes > 0) _memory->destroy(fc); - + if (nangletypes > 0) _memory->create(fc,nangletypes,"anglecharmmintel.fc"); } diff --git a/src/USER-INTEL/angle_harmonic_intel.h b/src/USER-INTEL/angle_harmonic_intel.h index 340ea4b974..301fc7cc06 100644 --- a/src/USER-INTEL/angle_harmonic_intel.h +++ b/src/USER-INTEL/angle_harmonic_intel.h @@ -45,8 +45,8 @@ class AngleHarmonicIntel : public AngleHarmonic { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void 
eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp index 430142a72a..bb96135b2d 100644 --- a/src/USER-INTEL/bond_fene_intel.cpp +++ b/src/USER-INTEL/bond_fene_intel.cpp @@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t; /* ---------------------------------------------------------------------- */ -BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) +BondFENEIntel::BondFENEIntel(LAMMPS *lmp) : BondFENE(lmp) { suffix_flag |= Suffix::INTEL; } @@ -70,8 +70,8 @@ void BondFENEIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void BondFENEIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -79,14 +79,14 @@ void BondFENEIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -97,9 +97,9 @@ void BondFENEIntel::compute(int eflag, int vflag, } template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void BondFENEIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void BondFENEIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nbondlist; if (inum == 0) return; @@ -126,7 +126,7 @@ void BondFENEIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -141,7 +141,7 @@ void BondFENEIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int3_t * _noalias const bondlist = + const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -176,7 +176,7 @@ void BondFENEIntel::eval(const int vflag, // if r -> r0, then rlogarg < 0.0 which is an error // issue a warning and reset rlogarg = epsilon // if r > 2*r0 something serious is wrong, abort - + if (rlogarg < (flt_t)0.1) { char str[128]; sprintf(str,"FENE bond too long: " BIGINT_FORMAT " " @@ -186,18 +186,18 @@ void BondFENEIntel::eval(const int vflag, if (rlogarg <= (flt_t)-3.0) error->one(FLERR,"Bad FENE bond"); rlogarg = (flt_t)0.1; } - + flt_t fbond = -k/rlogarg; - + // force from LJ term - + flt_t sr2,sr6; if (rsq < (flt_t)TWO_1_3*sigmasq) { - sr2 = sigmasq * irsq; + sr2 = sigmasq * irsq; sr6 = sr2 * sr2 * sr2; fbond += (flt_t)48.0 * epsilon * sr6 * (sr6 - (flt_t)0.5) * irsq; } - + // energy flt_t ebond; @@ -215,27 +215,27 @@ void BondFENEIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; } if (NEWTON_BOND || i2 < nlocal) { f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= 
delz*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; } - } + } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, - delx, dely, delz, sebond, f, NEWTON_BOND, + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, sebond, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); - #else - IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, - delx, dely, delz, oebond, f, NEWTON_BOND, + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, fbond, + delx, dely, delz, oebond, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); - #endif + #endif } } // for n #ifdef LMP_INTEL_USE_SIMDOFF @@ -250,7 +250,7 @@ void BondFENEIntel::eval(const int vflag, if (EFLAG) energy += oebond; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -307,11 +307,11 @@ void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void BondFENEIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(fc); - + if (nbondtypes > 0) _memory->create(fc,nbondtypes,"bondfeneintel.fc"); } diff --git a/src/USER-INTEL/bond_fene_intel.h b/src/USER-INTEL/bond_fene_intel.h index d64f1e7254..89c3033096 100644 --- a/src/USER-INTEL/bond_fene_intel.h +++ b/src/USER-INTEL/bond_fene_intel.h @@ -45,8 +45,8 @@ class BondFENEIntel : public BondFENE { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/bond_harmonic_intel.cpp b/src/USER-INTEL/bond_harmonic_intel.cpp index 1cccf5fe54..beb0ebcdda 100644 --- a/src/USER-INTEL/bond_harmonic_intel.cpp +++ b/src/USER-INTEL/bond_harmonic_intel.cpp @@ -33,7 +33,7 @@ typedef struct { int a,b,t; } int3_t; /* ---------------------------------------------------------------------- */ -BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) +BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) { suffix_flag |= Suffix::INTEL; } @@ -70,8 +70,8 @@ void BondHarmonicIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void BondHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -79,14 +79,14 @@ void BondHarmonicIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ 
-97,9 +97,9 @@ void BondHarmonicIntel::compute(int eflag, int vflag, } template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void BondHarmonicIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void BondHarmonicIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nbondlist; if (inum == 0) return; @@ -126,7 +126,7 @@ void BondHarmonicIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -141,7 +141,7 @@ void BondHarmonicIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int3_t * _noalias const bondlist = + const int3_t * _noalias const bondlist = (int3_t *) neighbor->bondlist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -184,29 +184,29 @@ void BondHarmonicIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += delx*fbond; - f[i1].y += dely*fbond; - f[i1].z += delz*fbond; + f[i1].y += dely*fbond; + f[i1].z += delz*fbond; } if (NEWTON_BOND || i2 < nlocal) { f[i2].x -= delx*fbond; - f[i2].y -= dely*fbond; - f[i2].z -= delz*fbond; + f[i2].y -= dely*fbond; + f[i2].z -= delz*fbond; } } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, - fbond, delx, dely, delz, sebond, f, - NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, sebond, f, + NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); - #else - IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, - fbond, delx, dely, delz, oebond, f, - NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, + #else + IP_PRE_ev_tally_bond(EFLAG, VFLAG, eatom, vflag, ebond, i1, i2, + fbond, delx, dely, delz, oebond, f, + NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); - #endif + #endif } } // for n #ifdef LMP_INTEL_USE_SIMDOFF @@ -221,7 +221,7 @@ void BondHarmonicIntel::eval(const int vflag, if (EFLAG) energy += oebond; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -276,11 +276,11 @@ void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(fc); - + if (nbondtypes > 0) _memory->create(fc,nbondtypes,"bondharmonicintel.fc"); } diff --git a/src/USER-INTEL/bond_harmonic_intel.h b/src/USER-INTEL/bond_harmonic_intel.h index 0de844cddf..8fc04f432a 100644 --- a/src/USER-INTEL/bond_harmonic_intel.h +++ b/src/USER-INTEL/bond_harmonic_intel.h @@ -45,8 +45,8 @@ class BondHarmonicIntel : public BondHarmonic { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); 
diff --git a/src/USER-INTEL/dihedral_charmm_intel.cpp b/src/USER-INTEL/dihedral_charmm_intel.cpp index df8834c283..715cef4d37 100644 --- a/src/USER-INTEL/dihedral_charmm_intel.cpp +++ b/src/USER-INTEL/dihedral_charmm_intel.cpp @@ -80,8 +80,8 @@ void DihedralCharmmIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void DihedralCharmmIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -95,14 +95,14 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -115,9 +115,9 @@ void DihedralCharmmIntel::compute(int eflag, int vflag, #ifndef LMP_USE_AVXCD_DHC template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void DihedralCharmmIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void DihedralCharmmIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->ndihedrallist; @@ -148,9 +148,9 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ - opv0,opv1,opv2,opv3,opv4,opv5) + opv0,opv1,opv2,opv3,opv4,opv5) #endif { #if defined(LMP_SIMD_COMPILER_TEST) @@ -165,7 +165,7 @@ void DihedralCharmmIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; const flt_t qqrd2e = force->qqrd2e; @@ -180,7 +180,7 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(LMP_SIMD_COMPILER_TEST) #pragma vector aligned #pragma simd reduction(+:sedihedral, sevdwl, secoul, sv0, sv1, sv2, \ - sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) + sv3, sv4, sv5, spv0, spv1, spv2, spv3, spv4, spv5) for (int n = nfrom; n < nto; n++) { #endif for (int n = nfrom; n < nto; n += npl) { @@ -204,7 +204,7 @@ void DihedralCharmmIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; @@ -244,25 +244,25 @@ void DihedralCharmmIntel::eval(const int vflag, // error check #ifndef LMP_SIMD_COMPILER_TEST if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; - - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g 
%g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } #endif @@ -279,19 +279,19 @@ void DihedralCharmmIntel::eval(const int vflag, ddf1 = df1 = (flt_t)0.0; for (int i = 0; i < m; i++) { - ddf1 = p*c - df1*s; - df1 = p*s + df1*c; - p = ddf1; + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; } p = p*tcos_shift + df1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift; df1 *= -m; p += (flt_t)1.0; - + if (m == 0) { - p = (flt_t)1.0 + tcos_shift; - df1 = (flt_t)0.0; + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; } const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; @@ -334,12 +334,12 @@ void DihedralCharmmIntel::eval(const int vflag, const flt_t f3z = -sz2 - f4z; if (EFLAG || VFLAG) { - flt_t deng; - if (EFLAG) deng = tk * p; - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, - i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + flt_t deng; + if (EFLAG) deng = tk * p; + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, - vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, + vb3x, vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); } @@ -349,15 +349,15 @@ void DihedralCharmmIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } } @@ -372,54 +372,54 @@ void DihedralCharmmIntel::eval(const int vflag, flt_t forcecoul; if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv; else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv); - const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - - fc.ljp[itype][jtype].lj2); + const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - + fc.ljp[itype][jtype].lj2); const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv; if (NEWTON_BOND || i1 < nlocal) { - f1x += delx*fpair; - f1y += dely*fpair; - f1z += delz*fpair; + f1x += delx*fpair; + f1y += dely*fpair; + f1z += delz*fpair; } if (NEWTON_BOND || i4 < nlocal) { - f4x -= delx*fpair; - f4y -= dely*fpair; - f4z -= delz*fpair; + f4x -= delx*fpair; + f4y -= dely*fpair; + f4z -= delz*fpair; } if (EFLAG || VFLAG) { - flt_t ev_pre = (flt_t)0; - if (NEWTON_BOND || i1 < nlocal) - ev_pre += (flt_t)0.5; - if (NEWTON_BOND || i4 < nlocal) - ev_pre += (flt_t)0.5; - - if (EFLAG) { - flt_t ecoul, evdwl; - ecoul = tweight * forcecoul; - evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - - fc.ljp[itype][jtype].lj4); - secoul += ev_pre * ecoul; - sevdwl += ev_pre * evdwl; - if (eatom) { - evdwl *= (flt_t)0.5; - evdwl += (flt_t)0.5 * ecoul; - if (NEWTON_BOND || i1 < nlocal) - f[i1].w += evdwl; - if (NEWTON_BOND || i4 < nlocal) - f[i4].w += evdwl; - } - } - // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, - 
// delx, dely, delz); - if (VFLAG && vflag) { - spv0 += ev_pre * delx * delx * fpair; - spv1 += ev_pre * dely * dely * fpair; - spv2 += ev_pre * delz * delz * fpair; - spv3 += ev_pre * delx * dely * fpair; - spv4 += ev_pre * delx * delz * fpair; - spv5 += ev_pre * dely * delz * fpair; - } + flt_t ev_pre = (flt_t)0; + if (NEWTON_BOND || i1 < nlocal) + ev_pre += (flt_t)0.5; + if (NEWTON_BOND || i4 < nlocal) + ev_pre += (flt_t)0.5; + + if (EFLAG) { + flt_t ecoul, evdwl; + ecoul = tweight * forcecoul; + evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - + fc.ljp[itype][jtype].lj4); + secoul += ev_pre * ecoul; + sevdwl += ev_pre * evdwl; + if (eatom) { + evdwl *= (flt_t)0.5; + evdwl += (flt_t)0.5 * ecoul; + if (NEWTON_BOND || i1 < nlocal) + f[i1].w += evdwl; + if (NEWTON_BOND || i4 < nlocal) + f[i4].w += evdwl; + } + } + // IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, + // delx, dely, delz); + if (VFLAG && vflag) { + spv0 += ev_pre * delx * delx * fpair; + spv1 += ev_pre * dely * dely * fpair; + spv2 += ev_pre * delz * delz * fpair; + spv3 += ev_pre * delx * dely * fpair; + spv4 += ev_pre * delx * delz * fpair; + spv5 += ev_pre * dely * delz * fpair; + } } // apply force to each of 4 atoms @@ -428,15 +428,15 @@ void DihedralCharmmIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n @@ -447,7 +447,7 @@ void DihedralCharmmIntel::eval(const int vflag, } if (VFLAG && vflag) { ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5; - opv0 += spv0; opv1 += spv1; opv2 += spv2; + opv0 += spv0; opv1 += spv1; opv2 += spv2; opv3 += spv3; opv4 += spv4; opv5 += spv5; } } // omp parallel @@ -485,9 +485,9 @@ authors for more details. 
------------------------------------------------------------------------- */ template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void DihedralCharmmIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void DihedralCharmmIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t; @@ -522,20 +522,20 @@ void DihedralCharmmIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \ - opv0,opv1,opv2,opv3,opv4,opv5) + opv0,opv1,opv2,opv3,opv4,opv5) #endif { int nfrom, npl, nto, tid; IP_PRE_omp_stride_id_vec(nfrom, npl, nto, tid, inum, nthreads, - swidth); + swidth); FORCE_T * _noalias const f = f_start + (tid * f_stride); if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int * _noalias const dihedrallist = + const int * _noalias const dihedrallist = (int *) neighbor->dihedrallist[0]; const flt_t * _noalias const weight = &(fc.weight[0]); const flt_t * _noalias const x_f = &(x[0].x); @@ -574,7 +574,7 @@ void DihedralCharmmIntel::eval(const int vflag, } SIMD_int n_offset = SIMD_set(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, - 55, 60, 65, 70, 75) + (nfrom * 5); + 55, 60, 65, 70, 75) + (nfrom * 5); const int nto5 = nto * 5; const int nlocals4 = nlocal << 4; const SIMD_int simd_nlocals4 = SIMD_set(nlocals4); @@ -618,7 +618,7 @@ void DihedralCharmmIntel::eval(const int vflag, const SIMD_flt_t vb2zm = z2 - z3; // 3rd bond - + SIMD_flt_t x4, y4, z4; SIMD_int jtype; @@ -664,7 +664,7 @@ void DihedralCharmmIntel::eval(const int vflag, const SIMD_flt_t ptol = SIMD_set(PTOLERANCE); const SIMD_flt_t ntol = SIMD_set(MTOLERANCE); if (c > ptol || c < ntol) - if (screen) + if (screen) error->warning(FLERR,"Dihedral problem."); c = SIMD_set(c, c > one, one); @@ -678,14 +678,14 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_flt_t p(one); SIMD_flt_t ddf1(szero); SIMD_flt_t df1(szero); - + const int m_max = SIMD_max(m); for (int i = 0; i < m_max; i++) { - const SIMD_mask my_m = i < m; - ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s); - df1 = SIMD_set(df1, my_m, p*s + df1*c); - p = SIMD_set(p, my_m, ddf1); + const SIMD_mask my_m = i < m; + ddf1 = SIMD_set(ddf1, my_m, p*c - df1*s); + df1 = SIMD_set(df1, my_m, p*s + df1*c); + p = SIMD_set(p, my_m, ddf1); } SIMD_flt_t multf; @@ -694,7 +694,7 @@ void DihedralCharmmIntel::eval(const int vflag, df1 = df1*tcos_shift - ddf1*tsin_shift; df1 = df1 * multf; p = p + one; - + SIMD_mask mzero = (m == SIMD_set((int)0)); p = SIMD_set(p, mzero, one + tcos_shift); df1 = SIMD_set(df1, mzero, szero); @@ -740,40 +740,40 @@ void DihedralCharmmIntel::eval(const int vflag, SIMD_flt_t qdeng; if (EFLAG || VFLAG) { - SIMD_flt_t ev_pre; - if (NEWTON_BOND) ev_pre = one; - else { - ev_pre = szero; - const SIMD_flt_t quarter = SIMD_set((flt_t)0.25); - ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter); - ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter); - } - SIMD_zero_masked(nmask, ev_pre); - if (EFLAG) { - const SIMD_flt_t deng = tk * p; - sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng); - if (eatom) { - qdeng = deng * SIMD_set((flt_t)0.25); - SIMD_mask newton_mask; - if (NEWTON_BOND) newton_mask 
= nmask; - if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4); - SIMD_flt_t ieng = qdeng; - SIMD_jeng_update(newton_mask, featom, i2, ieng); - ieng = qdeng; - if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4); - SIMD_jeng_update(newton_mask, featom, i3, ieng); - } - } - if (VFLAG && vflag) { + SIMD_flt_t ev_pre; + if (NEWTON_BOND) ev_pre = one; + else { + ev_pre = szero; + const SIMD_flt_t quarter = SIMD_set((flt_t)0.25); + ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i2 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i3 < simd_nlocals4, ev_pre, quarter); + ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4, ev_pre, quarter); + } + SIMD_zero_masked(nmask, ev_pre); + if (EFLAG) { + const SIMD_flt_t deng = tk * p; + sedihedral = SIMD_ev_add(sedihedral, ev_pre * deng); + if (eatom) { + qdeng = deng * SIMD_set((flt_t)0.25); + SIMD_mask newton_mask; + if (NEWTON_BOND) newton_mask = nmask; + if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i2, simd_nlocals4); + SIMD_flt_t ieng = qdeng; + SIMD_jeng_update(newton_mask, featom, i2, ieng); + ieng = qdeng; + if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i3, simd_nlocals4); + SIMD_jeng_update(newton_mask, featom, i3, ieng); + } + } + if (VFLAG && vflag) { sv0 = SIMD_ev_add(sv0, ev_pre*(vb1x*f1x-vb2xm*f3x+(vb3x-vb2xm)*f4x)); - sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); - sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); - sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y)); - sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z)); - sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z)); - } + sv1 = SIMD_ev_add(sv1, ev_pre*(vb1y*f1y-vb2ym*f3y+(vb3y-vb2ym)*f4y)); + sv2 = SIMD_ev_add(sv2, ev_pre*(vb1z*f1z-vb2zm*f3z+(vb3z-vb2zm)*f4z)); + sv3 = SIMD_ev_add(sv3, ev_pre*(vb1x*f1y-vb2xm*f3y+(vb3x-vb2xm)*f4y)); + sv4 = SIMD_ev_add(sv4, ev_pre*(vb1x*f1z-vb2xm*f3z+(vb3x-vb2xm)*f4z)); + sv5 = SIMD_ev_add(sv5, ev_pre*(vb1y*f1z-vb2ym*f3z+(vb3y-vb2ym)*f4z)); + } } SIMD_mask newton_mask; @@ -809,27 +809,27 @@ void DihedralCharmmIntel::eval(const int vflag, f4z = f4z - delz * fpair; if (EFLAG || VFLAG) { - SIMD_flt_t ev_pre; - if (NEWTON_BOND) ev_pre = one; - else { - ev_pre = szero; + SIMD_flt_t ev_pre; + if (NEWTON_BOND) ev_pre = one; + else { + ev_pre = szero; const SIMD_flt_t half = SIMD_set((flt_t)0.5); ev_pre = SIMD_add(ev_pre, i1 < simd_nlocals4,ev_pre,half); ev_pre = SIMD_add(ev_pre, i4 < simd_nlocals4,ev_pre,half); - } - SIMD_zero_masked(nmask, ev_pre); - - if (EFLAG) { - const SIMD_flt_t ecoul = tweight * forcecoul; - const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype); - const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype); - SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4); - secoul = SIMD_ev_add(secoul, ev_pre * ecoul); - sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl); - if (eatom) { - const SIMD_flt_t half = SIMD_set((flt_t)0.5); - evdwl = evdwl * half; - evdwl = evdwl + half * ecoul + qdeng; + } + SIMD_zero_masked(nmask, ev_pre); + + if (EFLAG) { + const SIMD_flt_t ecoul = tweight * forcecoul; + const SIMD_flt_t lj3 = SIMD_gather(nmask, plj3, ijtype); + const SIMD_flt_t lj4 = SIMD_gather(nmask, plj4, ijtype); + SIMD_flt_t evdwl = tweight * r6inv * (lj3 * r6inv - lj4); + secoul = SIMD_ev_add(secoul, ev_pre * ecoul); + sevdwl = SIMD_ev_add(sevdwl, ev_pre * evdwl); + if (eatom) { + const SIMD_flt_t half = SIMD_set((flt_t)0.5); + evdwl = evdwl * half; + evdwl = evdwl + 
half * ecoul + qdeng; if (NEWTON_BOND) newton_mask = nmask; if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i1, simd_nlocals4); @@ -838,16 +838,16 @@ void DihedralCharmmIntel::eval(const int vflag, ieng = evdwl; if (!NEWTON_BOND) newton_mask = SIMD_lt(nmask, i4, simd_nlocals4); SIMD_jeng_update(newton_mask, featom, i4, ieng); - } - } - if (VFLAG && vflag) { + } + } + if (VFLAG && vflag) { spv0 = SIMD_ev_add(spv0, ev_pre * delx * delx * fpair); - spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); - spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); - spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair); - spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair); - spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair); - } + spv1 = SIMD_ev_add(spv1, ev_pre * dely * dely * fpair); + spv2 = SIMD_ev_add(spv2, ev_pre * delz * delz * fpair); + spv3 = SIMD_ev_add(spv3, ev_pre * delx * dely * fpair); + spv4 = SIMD_ev_add(spv4, ev_pre * delx * delz * fpair); + spv5 = SIMD_ev_add(spv5, ev_pre * dely * delz * fpair); + } } if (NEWTON_BOND) newton_mask = nmask; @@ -863,17 +863,17 @@ void DihedralCharmmIntel::eval(const int vflag, oevdwl += SIMD_sum(sevdwl); } if (VFLAG && vflag) { - ov0 += SIMD_sum(sv0); - ov1 += SIMD_sum(sv1); - ov2 += SIMD_sum(sv2); - ov3 += SIMD_sum(sv3); - ov4 += SIMD_sum(sv4); + ov0 += SIMD_sum(sv0); + ov1 += SIMD_sum(sv1); + ov2 += SIMD_sum(sv2); + ov3 += SIMD_sum(sv3); + ov4 += SIMD_sum(sv4); ov5 += SIMD_sum(sv5); - opv0 += SIMD_sum(spv0); - opv1 += SIMD_sum(spv1); - opv2 += SIMD_sum(spv2); - opv3 += SIMD_sum(spv3); - opv4 += SIMD_sum(spv4); + opv0 += SIMD_sum(spv0); + opv1 += SIMD_sum(spv1); + opv2 += SIMD_sum(spv2); + opv3 += SIMD_sum(spv3); + opv4 += SIMD_sum(spv4); opv5 += SIMD_sum(spv5); } } // omp parallel @@ -933,7 +933,7 @@ void DihedralCharmmIntel::init_style() template <class flt_t, class acc_t> void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { const int tp1 = atom->ntypes + 1; @@ -944,10 +944,10 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, if (weightflag) { for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { - fc.ljp[i][j].lj1 = lj14_1[i][j]; - fc.ljp[i][j].lj2 = lj14_2[i][j]; - fc.ljp[i][j].lj3 = lj14_3[i][j]; - fc.ljp[i][j].lj4 = lj14_4[i][j]; + fc.ljp[i][j].lj1 = lj14_1[i][j]; + fc.ljp[i][j].lj2 = lj14_2[i][j]; + fc.ljp[i][j].lj3 = lj14_3[i][j]; + fc.ljp[i][j].lj4 = lj14_4[i][j]; } } } @@ -965,8 +965,8 @@ void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes, - const int nbondtypes, - Memory *memory) { + const int nbondtypes, + Memory *memory) { if (npairtypes != _npairtypes) { if (_npairtypes > 0) _memory->destroy(ljp); @@ -979,7 +979,7 @@ void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes, _memory->destroy(bp); _memory->destroy(weight); } - + if (nbondtypes > 0) { _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight"); diff --git a/src/USER-INTEL/dihedral_charmm_intel.h b/src/USER-INTEL/dihedral_charmm_intel.h index 292faea9f9..d80b32c8ac 100644 --- a/src/USER-INTEL/dihedral_charmm_intel.h +++ b/src/USER-INTEL/dihedral_charmm_intel.h @@ -44,8 +44,8 @@ class DihedralCharmmIntel : public DihedralCharmm { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, 
int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); @@ -58,7 +58,7 @@ class DihedralCharmmIntel : public DihedralCharmm { class ForceConst { public: typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1; - typedef struct { flt_t cos_shift, sin_shift, k; + typedef struct { flt_t cos_shift, sin_shift, k; int multiplicity; } fc_packed3; fc_packed1 **ljp; diff --git a/src/USER-INTEL/dihedral_harmonic_intel.cpp b/src/USER-INTEL/dihedral_harmonic_intel.cpp index 94130f4355..196b024fa6 100644 --- a/src/USER-INTEL/dihedral_harmonic_intel.cpp +++ b/src/USER-INTEL/dihedral_harmonic_intel.cpp @@ -69,8 +69,8 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void DihedralHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -79,14 +79,14 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -97,9 +97,9 @@ void DihedralHarmonicIntel::compute(int eflag, int vflag, } template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void DihedralHarmonicIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void DihedralHarmonicIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->ndihedrallist; @@ -127,7 +127,7 @@ void DihedralHarmonicIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -142,7 +142,7 @@ void DihedralHarmonicIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -175,7 +175,7 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; @@ -207,25 +207,25 @@ void DihedralHarmonicIntel::eval(const int vflag, // error check #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; - - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d 
%g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } #endif @@ -242,19 +242,19 @@ void DihedralHarmonicIntel::eval(const int vflag, ddf1 = df1 = (flt_t)0.0; for (int i = 0; i < m; i++) { - ddf1 = p*c - df1*s; - df1 = p*s + df1*c; - p = ddf1; + ddf1 = p*c - df1*s; + df1 = p*s + df1*c; + p = ddf1; } p = p*tcos_shift + df1*tsin_shift; df1 = df1*tcos_shift - ddf1*tsin_shift; df1 *= -m; p += (flt_t)1.0; - + if (m == 0) { - p = (flt_t)1.0 + tcos_shift; - df1 = (flt_t)0.0; + p = (flt_t)1.0 + tcos_shift; + df1 = (flt_t)0.0; } const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm; @@ -297,20 +297,20 @@ void DihedralHarmonicIntel::eval(const int vflag, const flt_t f3z = -sz2 - f4z; if (EFLAG || VFLAG) { - flt_t deng; - if (EFLAG) deng = tk * p; - #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); + flt_t deng; + if (EFLAG) deng = tk * p; + #ifdef LMP_INTEL_USE_SIMDOFF + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); #else - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, - f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4, + f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); #endif } @@ -319,35 +319,35 @@ void DihedralHarmonicIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n #ifdef LMP_INTEL_USE_SIMDOFF if (EFLAG) oedihedral += sedihedral; if (VFLAG && vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; - ov3 += sv3; ov4 += sv4; ov5 += sv5; + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; 
} #endif } // omp parallel @@ -395,7 +395,7 @@ void DihedralHarmonicIntel::init_style() template <class flt_t, class acc_t> void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { const int bp1 = atom->ndihedraltypes + 1; fc.set_ntypes(bp1,memory); @@ -412,11 +412,11 @@ void DihedralHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void DihedralHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(bp); - + if (nbondtypes > 0) _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); } diff --git a/src/USER-INTEL/dihedral_harmonic_intel.h b/src/USER-INTEL/dihedral_harmonic_intel.h index 41e3d20540..0a9cfaa042 100644 --- a/src/USER-INTEL/dihedral_harmonic_intel.h +++ b/src/USER-INTEL/dihedral_harmonic_intel.h @@ -44,8 +44,8 @@ class DihedralHarmonicIntel : public DihedralHarmonic { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); @@ -57,7 +57,7 @@ class DihedralHarmonicIntel : public DihedralHarmonic { template <class flt_t> class ForceConst { public: - typedef struct { flt_t cos_shift, sin_shift, k; + typedef struct { flt_t cos_shift, sin_shift, k; int multiplicity; } fc_packed1; fc_packed1 *bp; diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp index 3248a8bfc7..1abeba1d5e 100644 --- a/src/USER-INTEL/dihedral_opls_intel.cpp +++ b/src/USER-INTEL/dihedral_opls_intel.cpp @@ -73,8 +73,8 @@ void DihedralOPLSIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void DihedralOPLSIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -83,14 +83,14 @@ void DihedralOPLSIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -101,9 +101,9 @@ void DihedralOPLSIntel::compute(int eflag, int vflag, } template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void DihedralOPLSIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void DihedralOPLSIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->ndihedrallist; @@ -131,7 +131,7 @@ void DihedralOPLSIntel::eval(const int vflag, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(f_start,f_stride,fc) \ + shared(f_start,f_stride,fc) \ reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5) #endif { @@ -146,7 +146,7 @@ void 
DihedralOPLSIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const dihedrallist = + const int5_t * _noalias const dihedrallist = (int5_t *) neighbor->dihedrallist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -179,7 +179,7 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t vb2zm = x[i2].z - x[i3].z; // 3rd bond - + const flt_t vb3x = x[i4].x - x[i3].x; const flt_t vb3y = x[i4].y - x[i3].y; const flt_t vb3z = x[i4].z - x[i3].z; @@ -209,7 +209,7 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3; flt_t ctmp = -vb1x*vb2xm - vb1y*vb2ym - vb1z*vb2zm; - const flt_t r12c1 = rb1 * rb2; + const flt_t r12c1 = rb1 * rb2; const flt_t c1mag = ctmp * r12c1; ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z; @@ -240,25 +240,25 @@ void DihedralOPLSIntel::eval(const int vflag, // error check #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { - int me = comm->me; - - if (screen) { - char str[128]; - sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT " " - TAGINT_FORMAT " " TAGINT_FORMAT, - me,tid,update->ntimestep, - atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", - me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", - me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", - me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", - me,x[i4].x,x[i4].y,x[i4].z); - } + int me = comm->me; + + if (screen) { + char str[128]; + sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT " " + TAGINT_FORMAT " " TAGINT_FORMAT, + me,tid,update->ntimestep, + atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", + me,x[i1].x,x[i1].y,x[i1].z); + fprintf(screen," 2nd atom: %d %g %g %g\n", + me,x[i2].x,x[i2].y,x[i2].z); + fprintf(screen," 3rd atom: %d %g %g %g\n", + me,x[i3].x,x[i3].y,x[i3].z); + fprintf(screen," 4th atom: %d %g %g %g\n", + me,x[i4].x,x[i4].y,x[i4].z); + } } #endif @@ -283,14 +283,14 @@ void DihedralOPLSIntel::eval(const int vflag, const flt_t sin_4phim = (flt_t)2.0 * cos_2phi * sin_2phim; flt_t p, pd; - p = fc.bp[type].k1*((flt_t)1.0 + c) + - fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + - fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + - fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; - pd = fc.bp[type].k1 - - (flt_t)2.0 * fc.bp[type].k2 * sin_2phim + - (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - - (flt_t)4.0 * fc.bp[type].k4 * sin_4phim; + p = fc.bp[type].k1*((flt_t)1.0 + c) + + fc.bp[type].k2*((flt_t)1.0 - cos_2phi) + + fc.bp[type].k3*((flt_t)1.0 + cos_3phi) + + fc.bp[type].k4*((flt_t)1.0 - cos_4phi) ; + pd = fc.bp[type].k1 - + (flt_t)2.0 * fc.bp[type].k2 * sin_2phim + + (flt_t)3.0 * fc.bp[type].k3 * sin_3phim - + (flt_t)4.0 * fc.bp[type].k4 * sin_4phim; flt_t edihed; if (EFLAG) edihed = p; @@ -327,18 +327,18 @@ void DihedralOPLSIntel::eval(const int vflag, if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, - i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, - vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, - sv0, sv1, sv2, sv3, sv4, sv5); - #else - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, - i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
- vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, - vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, - ov0, ov1, ov2, ov3, ov4, ov5); - #endif + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal, + sv0, sv1, sv2, sv3, sv4, sv5); + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, edihed, i1, i2, i3, + i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, + vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, + vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal, + ov0, ov1, ov2, ov3, ov4, ov5); + #endif } #ifdef LMP_INTEL_USE_SIMDOFF @@ -346,35 +346,35 @@ void DihedralOPLSIntel::eval(const int vflag, #endif { if (NEWTON_BOND || i1 < nlocal) { - f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].x += f1x; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { - f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].x += f2x; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { - f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].x += f3x; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { - f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].x += f4x; + f[i4].y += f4y; + f[i4].z += f4z; } } } // for n #ifdef LMP_INTEL_USE_SIMDOFF if (EFLAG) oedihedral += sedihedral; if (VFLAG && vflag) { - ov0 += sv0; ov1 += sv1; ov2 += sv2; - ov3 += sv3; ov4 += sv4; ov5 += sv5; + ov0 += sv0; ov1 += sv1; ov2 += sv2; + ov3 += sv3; ov4 += sv4; ov5 += sv5; } #endif } // omp parallel @@ -422,7 +422,7 @@ void DihedralOPLSIntel::init_style() template <class flt_t, class acc_t> void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { const int bp1 = atom->ndihedraltypes + 1; fc.set_ntypes(bp1,memory); @@ -439,11 +439,11 @@ void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void DihedralOPLSIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes, - Memory *memory) { + Memory *memory) { if (nbondtypes != _nbondtypes) { if (_nbondtypes > 0) _memory->destroy(bp); - + if (nbondtypes > 0) _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp"); } diff --git a/src/USER-INTEL/dihedral_opls_intel.h b/src/USER-INTEL/dihedral_opls_intel.h index ea0930f4b8..1080bfa6c3 100644 --- a/src/USER-INTEL/dihedral_opls_intel.h +++ b/src/USER-INTEL/dihedral_opls_intel.h @@ -44,8 +44,8 @@ class DihedralOPLSIntel : public DihedralOPLS { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index e132947750..b06f76c90d 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -96,7 +96,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _allow_separate_buffers = 1; _offload_ghost = -1; _lrt = 0; - + int iarg = 4; while (iarg < narg) { if (strcmp(arg[iarg],"omp") == 0) { @@ -141,7 +141,7 @@ FixIntel::FixIntel(LAMMPS *lmp, 
int narg, char **arg) : Fix(lmp, narg, arg) else error->all(FLERR,"Illegal package intel command"); iarg += 2; } - + // undocumented options else if (strcmp(arg[iarg],"offload_affinity_balanced") == 0) { @@ -179,7 +179,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _real_space_comm = MPI_COMM_WORLD; if (no_affinity == 0) if (set_host_affinity(nomp) != 0) - error->all(FLERR,"Could not set host affinity for offload tasks"); + error->all(FLERR,"Could not set host affinity for offload tasks"); } int max_offload_threads = 0, offload_cores = 0; @@ -264,7 +264,7 @@ FixIntel::~FixIntel() double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) } @@ -320,11 +320,11 @@ void FixIntel::init() if (strstr(hybrid->keywords[i], "/intel") != NULL) nstyles++; else - force->pair->no_virial_fdotr_compute = 1; + force->pair->no_virial_fdotr_compute = 1; } if (nstyles > 1) error->all(FLERR, - "Currently, cannot use more than one intel style with hybrid."); + "Currently, cannot use more than one intel style with hybrid."); check_neighbor_intel(); int off_mode = 0; @@ -349,13 +349,13 @@ void FixIntel::setup(int vflag) { if (neighbor->style != BIN) error->all(FLERR, - "Currently, neighbor style BIN must be used with Intel package."); + "Currently, neighbor style BIN must be used with Intel package."); if (neighbor->exclude_setting() != 0) error->all(FLERR, - "Currently, cannot use neigh_modify exclude with Intel package."); + "Currently, cannot use neigh_modify exclude with Intel package."); if (vflag_atom) error->all(FLERR, - "Cannot currently get per-atom virials with Intel package."); + "Cannot currently get per-atom virials with Intel package."); #ifdef _LMP_INTEL_OFFLOAD post_force(vflag); #endif @@ -392,7 +392,7 @@ void FixIntel::pair_init_check(const bool cdmessage) double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + overflow != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ in(overflow:length(5) alloc_if(1) free_if(0)) @@ -407,7 +407,7 @@ void FixIntel::pair_init_check(const bool cdmessage) error->warning(FLERR, "Unknown Intel Compiler Version\n"); #else if (__INTEL_COMPILER_BUILD_DATE != 20131008 && - __INTEL_COMPILER_BUILD_DATE < 20141023) + __INTEL_COMPILER_BUILD_DATE < 20141023) error->warning(FLERR, "Unsupported Intel Compiler."); #endif #if !defined(__INTEL_COMPILER) @@ -438,24 +438,24 @@ void FixIntel::pair_init_check(const bool cdmessage) if (comm->me == 0) { if (screen) { fprintf(screen, - "----------------------------------------------------------\n"); + "----------------------------------------------------------\n"); if (_offload_balance != 0.0) { fprintf(screen,"Using Intel Coprocessor with %d threads per core, ", - _offload_tpc); + _offload_tpc); fprintf(screen,"%d threads per task\n",_offload_threads); } else { - fprintf(screen,"Using Intel Package without Coprocessor.\n"); + fprintf(screen,"Using Intel Package without Coprocessor.\n"); } fprintf(screen,"Precision: %s\n",kmode); if (cdmessage) { - #ifdef LMP_USE_AVXCD - fprintf(screen,"AVX512 CD Optimizations: Enabled\n"); - #else - fprintf(screen,"AVX512 CD Optimizations: Disabled\n"); - #endif + #ifdef 
LMP_USE_AVXCD + fprintf(screen,"AVX512 CD Optimizations: Enabled\n"); + #else + fprintf(screen,"AVX512 CD Optimizations: Disabled\n"); + #endif } fprintf(screen, - "----------------------------------------------------------\n"); + "----------------------------------------------------------\n"); } } } @@ -464,7 +464,7 @@ void FixIntel::pair_init_check(const bool cdmessage) void FixIntel::bond_init_check() { - if (_offload_balance != 0.0 && atom->molecular && + if (_offload_balance != 0.0 && atom->molecular && force->newton_pair != force->newton_bond) error->all(FLERR, "USER-INTEL package requires same setting for newton bond and non-bond."); @@ -573,7 +573,7 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) int o_range, f_stride; if (force->newton_pair) o_range = atom->nlocal + atom->nghost; - else + else o_range = atom->nlocal; IP_PRE_get_stride(f_stride, o_range, (sizeof(acc_t)*4), lmp->atom->torque); @@ -588,18 +588,18 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) _use_simd_pragma("vector aligned") _use_simd_pragma("simd") for (int n = 0; n < o_range; n++) - f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; } else if (_nthreads == 2) { _use_simd_pragma("vector aligned") _use_simd_pragma("simd") for (int n = 0; n < o_range; n++) - f_scalar[n] += f_scalar2[n]; + f_scalar[n] += f_scalar2[n]; } else { acc_t *f_scalar3 = f_scalar2 + f_stride4; _use_simd_pragma("vector aligned") _use_simd_pragma("simd") for (int n = 0; n < o_range; n++) - f_scalar[n] += f_scalar2[n] + f_scalar3[n]; + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; } } else { #if defined(_OPENMP) @@ -608,13 +608,13 @@ void FixIntel::reduce_results(acc_t * _noalias const f_scalar) { int iifrom, iito, tid; IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads, - sizeof(acc_t)); + sizeof(acc_t)); acc_t *f_scalar2 = f_scalar + f_stride4; for (int t = 1; t < _nthreads; t++) { - _use_simd_pragma("vector aligned") - _use_simd_pragma("simd") - for (int n = iifrom; n < iito; n++) + _use_simd_pragma("vector aligned") + _use_simd_pragma("simd") + for (int n = iifrom; n < iito; n++) f_scalar[n] += f_scalar2[n]; f_scalar2 += f_stride4; } @@ -648,33 +648,33 @@ template <class ft, class acc_t> void FixIntel::add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, const int eatom, const int vatom, - const int offload) { + const int offload) { start_watch(TIME_PACK); int f_length; #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { if (offload) { if (force->newton_pair) { - add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); - const acc_t * _noalias const enull = 0; - int offset = _offload_nlocal; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, vatom, - _offload_min_ghost, _offload_nghost); + add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal); + const acc_t * _noalias const enull = 0; + int offset = _offload_nlocal; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, vatom, + _offload_min_ghost, _offload_nghost); } else - add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair()); + add_oresults(f_in, ev_global, eatom, vatom, 0, offload_end_pair()); } else { if (force->newton_pair) { - add_oresults(f_in, ev_global, eatom, vatom, - _host_min_local, _host_used_local); - const acc_t * _noalias const enull = 0; - int offset = _host_used_local; - if (atom->torque) offset *= 2; - add_oresults(f_in + offset, enull, eatom, - 
vatom, _host_min_ghost, _host_used_ghost); + add_oresults(f_in, ev_global, eatom, vatom, + _host_min_local, _host_used_local); + const acc_t * _noalias const enull = 0; + int offset = _host_used_local; + if (atom->torque) offset *= 2; + add_oresults(f_in + offset, enull, eatom, + vatom, _host_min_ghost, _host_used_ghost); } else { - int start = host_start_pair(); - add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start); + int start = host_start_pair(); + add_oresults(f_in, ev_global, eatom, vatom, start, atom->nlocal-start); } } stop_watch(TIME_PACK); @@ -685,9 +685,9 @@ void FixIntel::add_results(const ft * _noalias const f_in, start = 0; if (force->newton_pair) { if (_offload_noghost == 0) - f_length = atom->nlocal + atom->nghost; + f_length = atom->nlocal + atom->nghost; else - f_length = atom->nlocal; + f_length = atom->nlocal; } else f_length = offload_end_pair(); } else { @@ -714,9 +714,9 @@ void FixIntel::add_results(const ft * _noalias const f_in, template <class ft, class acc_t> void FixIntel::add_oresults(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int out_offset, const int nall) { + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall) { lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset; if (atom->torque) { if (f_in[1].w) @@ -744,12 +744,12 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, if (atom->torque) { int ii = ifrom * 2; lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] + - out_offset; + out_offset; if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; f[i].y += f_in[ii].y; @@ -762,8 +762,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[ii].x; f[i].y += f_in[ii].y; @@ -776,10 +776,10 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { if (eatom) { - double * _noalias const lmp_eatom = force->pair->eatom + out_offset; + double * _noalias const lmp_eatom = force->pair->eatom + out_offset; #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; f[i].y += f_in[i].y; @@ -788,8 +788,8 @@ void FixIntel::add_oresults(const ft * _noalias const f_in, } } else { #if defined(LMP_SIMD_COMPILER) - #pragma novector - #endif + #pragma novector + #endif for (int i = ifrom; i < ito; i++) { f[i].x += f_in[i].x; f[i].y += f_in[i].y; @@ -931,7 +931,7 @@ void FixIntel::output_timing_data() { balance_out[0] = _balance_pair; balance_out[1] = _balance_neighbor; MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM, - 0, _real_space_comm); + 0, _real_space_comm); balance_in[0] /= size; balance_in[1] /= size; @@ -958,25 +958,25 @@ void FixIntel::output_timing_data() { balance_in[1]); fprintf(_tscreen, " Offload Pair Balance %f\n", balance_in[0]); - fprintf(_tscreen, " Offload Ghost Atoms "); - if (_offload_noghost) fprintf(_tscreen,"No\n"); - else fprintf(_tscreen,"Yes\n"); + fprintf(_tscreen, " Offload Ghost Atoms "); + if (_offload_noghost) fprintf(_tscreen,"No\n"); + else 
fprintf(_tscreen,"Yes\n"); #ifdef TIME_BALANCE fprintf(_tscreen, " Offload Imbalance Seconds %f\n", timers[TIME_IMBALANCE]); - fprintf(_tscreen, " Offload Min/Max Seconds "); - for (int i = 0; i < NUM_ITIMERS; i++) - fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); - fprintf(_tscreen, "\n"); + fprintf(_tscreen, " Offload Min/Max Seconds "); + for (int i = 0; i < NUM_ITIMERS; i++) + fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]); + fprintf(_tscreen, "\n"); #endif - double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] + - timers[TIME_OFFLOAD_WAIT]; - double ct = timers[TIME_OFFLOAD_NEIGHBOR] + - timers[TIME_OFFLOAD_PAIR]; - double tt = MAX(ht,ct); - if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) - error->warning(FLERR, - "Leaving a core free can improve performance for offload"); + double ht = timers[TIME_HOST_NEIGHBOR] + timers[TIME_HOST_PAIR] + + timers[TIME_OFFLOAD_WAIT]; + double ct = timers[TIME_OFFLOAD_NEIGHBOR] + + timers[TIME_OFFLOAD_PAIR]; + double tt = MAX(ht,ct); + if (timers[TIME_OFFLOAD_LATENCY] / tt > 0.07 && _separate_coi == 0) + error->warning(FLERR, + "Leaving a core free can improve performance for offload"); } fprintf(_tscreen, "------------------------------------------------\n"); } @@ -999,14 +999,14 @@ int FixIntel::get_ppn(int &node_rank) { node_name[name_length] = '\0'; char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs]; MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names, - MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); + MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm); int ppn = 0; node_rank = 0; for (int i = 0; i < nprocs; i++) { if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) { ppn++; if (i < rank) - node_rank++; + node_rank++; } } @@ -1068,19 +1068,19 @@ void FixIntel::set_offload_affinity() kmp_create_affinity_mask(&mask); int proc = offload_threads * node_rank + tnum; #ifdef __AVX512F__ - proc = (proc / offload_tpc) + (proc % offload_tpc) * - ((offload_cores) / 4); + proc = (proc / offload_tpc) + (proc % offload_tpc) * + ((offload_cores) / 4); proc += 68; #else if (offload_affinity_balanced) - proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; + proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1; else - proc += (proc / 4) * (4 - offload_tpc) + 1; + proc += (proc / 4) * (4 - offload_tpc) + 1; #endif kmp_set_affinity_mask_proc(proc, &mask); if (kmp_set_affinity(&mask) != 0) - printf("Could not set affinity on rank %d thread %d to %d\n", - node_rank, tnum, proc); + printf("Could not set affinity on rank %d thread %d to %d\n", + node_rank, tnum, proc); } } @@ -1110,7 +1110,7 @@ int FixIntel::set_host_affinity(const int nomp) char cmd[512]; char readbuf[INTEL_MAX_HOST_CORE_COUNT*5]; sprintf(cmd, "lscpu -p | grep -v '#' |" - "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"); + "sort -t, -k 3,3n -k 2,2n | awk -F, '{print $1}'"); p = popen(cmd, "r"); if (p == NULL) return -1; ncores = 0; @@ -1147,7 +1147,7 @@ int FixIntel::set_host_affinity(const int nomp) if (subscription > ncores) { if (rank == 0) error->warning(FLERR, - "More MPI tasks/OpenMP threads than available cores"); + "More MPI tasks/OpenMP threads than available cores"); return 0; } if (subscription == ncores) @@ -1173,10 +1173,10 @@ int FixIntel::set_host_affinity(const int nomp) int first = coi_cores + node_rank * mpi_cores; CPU_ZERO(&cpuset); for (int i = first; i < first + mpi_cores; i++) - CPU_SET(proc_list[i], &cpuset); + CPU_SET(proc_list[i], &cpuset); if (sched_setaffinity(lwp, 
sizeof(cpu_set_t), &cpuset)) { - fail = 1; - break; + fail = 1; + break; } plwp++; } @@ -1189,13 +1189,13 @@ int FixIntel::set_host_affinity(const int nomp) buf1 = (float*) malloc(sizeof(float)*pragma_size); #pragma offload target (mic:0) mandatory \ - in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ + in(buf1:length(pragma_size) alloc_if(1) free_if(0)) \ signal(&sig1) { buf1[0] = 0.0; } #pragma offload_wait target(mic:0) wait(&sig1) #pragma offload target (mic:0) mandatory \ - out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ + out(buf1:length(pragma_size) alloc_if(0) free_if(1)) \ signal(&sig2) { buf1[0] = 1.0; } #pragma offload_wait target(mic:0) wait(&sig2) @@ -1211,11 +1211,11 @@ int FixIntel::set_host_affinity(const int nomp) CPU_ZERO(&cpuset); for(int i=0; i<coi_cores; i++) - CPU_SET(proc_list[i], &cpuset); + CPU_SET(proc_list[i], &cpuset); if (sched_setaffinity(lwp, sizeof(cpu_set_t), &cpuset)) { - fail = 1; - break; + fail = 1; + break; } } pclose(p); @@ -1228,7 +1228,7 @@ int FixIntel::set_host_affinity(const int nomp) if (screen && rank == 0) { if (coi_cores) fprintf(screen,"Intel Package: Affinitizing %d Offload Threads to %d Cores\n", - mlwp, coi_cores); + mlwp, coi_cores); fprintf(screen,"Intel Package: Affinitizing MPI Tasks to %d Cores Each\n",mpi_cores); } if (fail) return -1; diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index 92d1311256..068e5ed890 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -72,7 +72,7 @@ class FixIntel : public Fix { inline void nbor_pack_width(const int w) { _nbor_pack_width = w; } inline int three_body_neighbor() { return _three_body_neighbor; } inline void three_body_neighbor(const int i) { _three_body_neighbor = 1; } - + inline int need_zero(const int tid) { if (_need_reduce == 0 && tid > 0) return 1; return 0; @@ -84,11 +84,11 @@ class FixIntel : public Fix { } inline int pppm_table() { if (force->kspace_match("pppm/intel", 0) || - force->kspace_match("pppm/disp/intel",0)) + force->kspace_match("pppm/disp/intel",0)) return INTEL_P3M_TABLE; else return 0; } - + protected: IntelBuffers<float,float> *_single_buffers; @@ -103,17 +103,17 @@ class FixIntel : public Fix { inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, float *ev_in, const int offload, const int eatom = 0, const int vatom = 0, - const int rflag = 0); + const int rflag = 0); inline void get_buffern(const int offload, int &nlocal, int &nall, - int &minlocal); + int &minlocal); #ifdef _LMP_INTEL_OFFLOAD void post_force(int vflag); @@ -213,13 +213,13 @@ class FixIntel : public Fix { inline void add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, const int eatom, const int vatom, - const int offload); + const int offload); template <class ft, class acc_t> inline void add_oresults(const ft * _noalias const f_in, - const acc_t * _noalias const ev_global, - const int eatom, const int vatom, - const int out_offset, const int nall); + const acc_t * _noalias const ev_global, + const int eatom, const int vatom, + const int out_offset, const int nall); int _offload_affinity_balanced, 
_offload_threads, _offload_tpc; #ifdef _LMP_INTEL_OFFLOAD @@ -235,16 +235,16 @@ class FixIntel : public Fix { /* ---------------------------------------------------------------------- */ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, - int &minlocal) { + int &minlocal) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { if (offload) { if (neighbor->ago != 0) { - nlocal = _offload_nlocal; - nall = _offload_nall; + nlocal = _offload_nlocal; + nall = _offload_nall; } else { - nlocal = atom->nlocal; - nall = nlocal + atom->nghost; + nlocal = atom->nlocal; + nall = nlocal + atom->nghost; } minlocal = 0; } else { @@ -253,7 +253,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, if (force->newton) minlocal = _host_min_local; else - minlocal = host_start_pair(); + minlocal = host_start_pair(); } return; } @@ -271,7 +271,7 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom, const int vatom, - const int rflag) { + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -299,7 +299,7 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, double *ev_in, const int offload, const int eatom, const int vatom, - const int rflag) { + const int rflag) { #ifdef _LMP_INTEL_OFFLOAD if (offload) { _off_results_eatom = eatom; @@ -361,12 +361,12 @@ int FixIntel::offload_end_neighbor() { if (atom->nlocal < 2) error->one(FLERR,"Too few atoms for load balancing offload"); double granularity = 1.0 / atom->nlocal; - if (_balance_neighbor < granularity) + if (_balance_neighbor < granularity) _balance_neighbor = granularity + 1e-10; - else if (_balance_neighbor > 1.0 - granularity) + else if (_balance_neighbor > 1.0 - granularity) _balance_neighbor = 1.0 - granularity + 1e-10; } - return _balance_neighbor * atom->nlocal; + return _balance_neighbor * atom->nlocal; } int FixIntel::offload_end_pair() { @@ -517,7 +517,7 @@ The newton setting must be the same for both pairwise and bonded forces. E: Intel styles for bond/angle/dihedral/improper require intel pair style." -You cannot use the USER-INTEL package for bond calculations without a +You cannot use the USER-INTEL package for bond calculations without a USER-INTEL supported pair style. E: Intel styles for kspace require intel pair style. 
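
For reference, the hunk above touches FixIntel::offload_end_neighbor(), whose clamping arithmetic decides how many local atoms go to the coprocessor. The following standalone sketch (not part of the patch; function and variable names here are hypothetical) restates that arithmetic: the balance fraction is clamped so at least one atom remains on each side before the split index is computed.

    // Illustrative sketch of the host/coprocessor split clamping seen in
    // offload_end_neighbor() above; names are hypothetical, not LAMMPS API.
    #include <cstdio>

    static int offload_split(double balance, int nlocal) {
      const double granularity = 1.0 / nlocal;   // smallest useful fraction
      if (balance < granularity)
        balance = granularity + 1e-10;           // keep >= 1 atom offloaded
      else if (balance > 1.0 - granularity)
        balance = 1.0 - granularity + 1e-10;     // keep >= 1 atom on the host
      return static_cast<int>(balance * nlocal); // atoms [0, end) are offloaded
    }

    int main() {
      printf("%d of 1000 atoms offloaded\n", offload_split(0.25, 1000));
      return 0;
    }
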
diff --git a/src/USER-INTEL/fix_nh_intel.cpp b/src/USER-INTEL/fix_nh_intel.cpp index 3f76e53c1f..6e44b38ef1 100644 --- a/src/USER-INTEL/fix_nh_intel.cpp +++ b/src/USER-INTEL/fix_nh_intel.cpp @@ -45,7 +45,7 @@ typedef struct { double x,y,z; } dbl3_t; NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion ---------------------------------------------------------------------- */ -FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : +FixNHIntel::FixNHIntel(LAMMPS *lmp, int narg, char **arg) : FixNH(lmp, narg, arg) { _dtfm = 0; @@ -118,12 +118,12 @@ void FixNHIntel::remap() #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { - const double d0 = x[i].x - b0; - const double d1 = x[i].y - b1; - const double d2 = x[i].z - b2; - x[i].x = hi0*d0 + hi5*d1 + hi4*d2; - x[i].y = hi1*d1 + hi3*d2; - x[i].z = hi2*d2; + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; } } } @@ -294,9 +294,9 @@ void FixNHIntel::remap() #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & dilate_group_bit) { - x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; - x[i].y = h1*x[i].y + h3*x[i].z + nb1; - x[i].z = h2*x[i].z + nb2; + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; } } } @@ -318,7 +318,7 @@ void FixNHIntel::reset_dt() dto = dthalf; // If using respa, then remap is performed in innermost level - + if (strstr(update->integrate_style,"respa")) dto = 0.5*step_respa[0]; @@ -329,7 +329,7 @@ void FixNHIntel::reset_dt() tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -345,9 +345,9 @@ void FixNHIntel::reset_dt() const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } } else { const double * const mass = atom->mass; @@ -364,29 +364,29 @@ void FixNHIntel::reset_dt() const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } else { const double * const mass = atom->mass; const int * const type = atom->type; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } } } @@ -431,9 +431,9 @@ void FixNHIntel::nh_v_press() #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - v[i].x *= f0; - v[i].y *= f1; - v[i].z *= f2; + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; } } } @@ -506,7 +506,7 @@ void FixNHIntel::nh_v_temp() #pragma simd #endif for (int i = 0; i < _nlocal3; i++) - v[i] *= factor_eta; + v[i] *= factor_eta; } else { #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -514,12 +514,12 @@ void FixNHIntel::nh_v_temp() #endif for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) - v[i] *= factor_eta; + v[i] *= factor_eta; } } } -double FixNHIntel::memory_usage() +double FixNHIntel::memory_usage() { return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); } diff --git a/src/USER-INTEL/fix_nh_intel.h b/src/USER-INTEL/fix_nh_intel.h index 32ed6c8534..cc6ba8c481 100644 --- a/src/USER-INTEL/fix_nh_intel.h +++ b/src/USER-INTEL/fix_nh_intel.h @@ -35,7 +35,7 @@ class FixNHIntel : public FixNH { int _nlocal3, _nlocal_max; virtual void remap(); - virtual void nve_x(); + virtual void nve_x(); virtual void nve_v(); virtual void nh_v_press(); virtual void nh_v_temp(); diff --git a/src/USER-INTEL/fix_nve_asphere_intel.cpp b/src/USER-INTEL/fix_nve_asphere_intel.cpp index 6563165454..8ad63f7326 100644 --- a/src/USER-INTEL/fix_nve_asphere_intel.cpp +++ b/src/USER-INTEL/fix_nve_asphere_intel.cpp @@ -36,7 +36,7 @@ using namespace FixConst; /* ---------------------------------------------------------------------- */ FixNVEAsphereIntel::FixNVEAsphereIntel(LAMMPS *lmp, int narg, char **arg) : - FixNVE(lmp, narg, arg) + FixNVE(lmp, narg, arg) { _dtfm = 0; _nlocal3 = 0; @@ -129,9 +129,9 @@ void FixNVEAsphereIntel::initial_integrate(int vflag) #endif for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { - double *quat = bonus[ellipsoid[i]].quat; - ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], - _inertia1[i], _inertia2[i]); + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], + 
_inertia1[i], _inertia2[i]); } } } @@ -168,7 +168,7 @@ void FixNVEAsphereIntel::reset_dt() { dtf = 0.5 * update->dt * force->ftm2v; const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -211,27 +211,27 @@ void FixNVEAsphereIntel::reset_dt() { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - double *shape = bonus[ellipsoid[i]].shape; - double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia0[i] = idot; - idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia1[i] = idot; - idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); - if (idot != 0.0) idot = 1.0 / idot; - _inertia2[i] = idot; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; } else { _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; } } } } -double FixNVEAsphereIntel::memory_usage() +double FixNVEAsphereIntel::memory_usage() { return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); } diff --git a/src/USER-INTEL/fix_nve_intel.cpp b/src/USER-INTEL/fix_nve_intel.cpp index 3fb290b3ab..c0f6da06ae 100644 --- a/src/USER-INTEL/fix_nve_intel.cpp +++ b/src/USER-INTEL/fix_nve_intel.cpp @@ -29,7 +29,7 @@ using namespace FixConst; /* ---------------------------------------------------------------------- */ FixNVEIntel::FixNVEIntel(LAMMPS *lmp, int narg, char **arg) : - FixNVE(lmp, narg, arg) + FixNVE(lmp, narg, arg) { _dtfm = 0; _nlocal3 = 0; @@ -91,7 +91,7 @@ void FixNVEIntel::initial_integrate(int vflag) for (int i = 0; i < _nlocal3; i++) { if (_dtfm[i] != 0.0) { v[i] += _dtfm[i] * f[i]; - x[i] += dtv * v[i]; + x[i] += dtv * v[i]; } } } @@ -130,7 +130,7 @@ void FixNVEIntel::reset_dt() { dtf = 0.5 * update->dt * force->ftm2v; const int * const mask = atom->mask; - const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : atom->nlocal; if (nlocal > _nlocal_max) { @@ -146,9 +146,9 @@ void FixNVEIntel::reset_dt() { const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } } else { const double * const mass = atom->mass; @@ -165,34 +165,34 @@ void FixNVEIntel::reset_dt() { const double * const rmass = atom->rmass; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; - _dtfm[n++] = dtf / rmass[i]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; + _dtfm[n++] = dtf / rmass[i]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } else { const double * const mass = atom->mass; const int * const type = atom->type; int n = 0; for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; - _dtfm[n++] = dtf / mass[type[i]]; + if (mask[i] & groupbit) { + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; + _dtfm[n++] = dtf / mass[type[i]]; } else { - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - _dtfm[n++] = 0.0; - } + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } } } } -double FixNVEIntel::memory_usage() +double FixNVEIntel::memory_usage() { return FixNVE::memory_usage() + _nlocal_max * 3 * sizeof(double); } diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp index df13cd5d66..dc9765d913 100644 --- a/src/USER-INTEL/improper_cvff_intel.cpp +++ b/src/USER-INTEL/improper_cvff_intel.cpp @@ -42,7 +42,7 @@ typedef struct { int a,b,c,d,t; } int5_t; /* ---------------------------------------------------------------------- */ -ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : +ImproperCvffIntel::ImproperCvffIntel(LAMMPS *lmp) : ImproperCvff(lmp) { suffix_flag |= Suffix::INTEL; @@ -80,8 +80,8 @@ void ImproperCvffIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void ImproperCvffIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -89,14 +89,14 @@ void ImproperCvffIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -109,9 +109,9 @@ void ImproperCvffIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void ImproperCvffIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void ImproperCvffIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nimproperlist; if (inum == 0) return; @@ -153,7 +153,7 @@ void 
ImproperCvffIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const improperlist = + const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; #ifdef LMP_INTEL_USE_SIMDOFF_FIX @@ -230,22 +230,22 @@ void ImproperCvffIntel::eval(const int vflag, #ifndef LMP_INTEL_USE_SIMDOFF_FIX if (c > PTOLERANCE || c < MTOLERANCE) { int me; - MPI_Comm_rank(world,&me); - if (screen) { + MPI_Comm_rank(world,&me); + if (screen) { char str[128]; - sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, me,update->ntimestep, atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", + fprintf(screen," 2nd atom: %d %g %g %g\n", me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", + fprintf(screen," 3rd atom: %d %g %g %g\n", me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", + fprintf(screen," 4th atom: %d %g %g %g\n", me,x[i4].x,x[i4].y,x[i4].z); } } @@ -268,35 +268,35 @@ void ImproperCvffIntel::eval(const int vflag, { if (m == 2) { p = (flt_t)2.0*c*c; - pd = (flt_t)2.0*c; + pd = (flt_t)2.0*c; } else if (m == 3) { - const flt_t rc2 = c*c; - p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; - pd = (flt_t)6.0*rc2 - (flt_t)1.5; + const flt_t rc2 = c*c; + p = ((flt_t)4.0*rc2-(flt_t)3.0)*c + (flt_t)1.0; + pd = (flt_t)6.0*rc2 - (flt_t)1.5; } else if (m == 4) { const flt_t rc2 = c*c; - p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; - pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; + p = (flt_t)8.0*(rc2-1)*rc2 + (flt_t)2.0; + pd = ((flt_t)16.0*rc2-(flt_t)8.0)*c; } else if (m == 6) { const flt_t rc2 = c*c; - p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; - pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; + p = (((flt_t)32.0*rc2-(flt_t)48.0)*rc2 + (flt_t)18.0)*rc2; + pd = ((flt_t)96.0*(rc2-(flt_t)1.0)*rc2 + (flt_t)18.0)*c; } else if (m == 1) { - p = c + (flt_t)1.0; - pd = (flt_t)0.5; + p = c + (flt_t)1.0; + pd = (flt_t)0.5; } else if (m == 5) { - const flt_t rc2 = c*c; - p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; - pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; + const flt_t rc2 = c*c; + p = (((flt_t)16.0*rc2-(flt_t)20.0)*rc2 + (flt_t)5.0)*c + (flt_t)1.0; + pd = ((flt_t)40.0*rc2-(flt_t)30.0)*rc2 + (flt_t)2.5; } else if (m == 0) { p = (flt_t)2.0; - pd = (flt_t)0.0; + pd = (flt_t)0.0; } } if (fc.fc[type].sign == -1) { - p = (flt_t)2.0 - p; - pd = -pd; + p = (flt_t)2.0 - p; + pd = -pd; } flt_t eimproper; @@ -340,43 +340,43 @@ void ImproperCvffIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].y += f1y; + f[i1].z += f1z; } if (NEWTON_BOND || i2 < nlocal) { f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].y += f2y; + f[i2].z += f2z; } - if (NEWTON_BOND || i3 < nlocal) { + if (NEWTON_BOND || i3 < nlocal) { f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].y += f4y; + f[i4].z += f4z; } } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF_FIX - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, 
eimproper, i1, i2, - i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, - f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, - vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); - #else - IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, - i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, - f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, - vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + #else + IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, + f4z, vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); - #endif + #endif } } // for n #ifdef LMP_INTEL_USE_SIMDOFF_FIX @@ -390,7 +390,7 @@ void ImproperCvffIntel::eval(const int vflag, if (EFLAG) energy += oeimproper; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style() template <class flt_t, class acc_t> void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { const int bp1 = atom->nimpropertypes + 1; fc.set_ntypes(bp1,memory); @@ -444,11 +444,11 @@ void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void ImproperCvffIntel::ForceConst<flt_t>::set_ntypes(const int nimproper, - Memory *memory) { + Memory *memory) { if (nimproper != _nimpropertypes) { if (_nimpropertypes > 0) _memory->destroy(fc); - + if (nimproper > 0) _memory->create(fc,nimproper,"improperharmonicintel.fc"); } diff --git a/src/USER-INTEL/improper_cvff_intel.h b/src/USER-INTEL/improper_cvff_intel.h index 95ccd8f9d2..cb5da25f99 100644 --- a/src/USER-INTEL/improper_cvff_intel.h +++ b/src/USER-INTEL/improper_cvff_intel.h @@ -45,8 +45,8 @@ class ImproperCvffIntel : public ImproperCvff { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/improper_harmonic_intel.cpp b/src/USER-INTEL/improper_harmonic_intel.cpp index cc854091f5..fe0efca5ec 100644 --- a/src/USER-INTEL/improper_harmonic_intel.cpp +++ b/src/USER-INTEL/improper_harmonic_intel.cpp @@ -43,7 +43,7 @@ typedef struct { int a,b,c,d,t; } int5_t; /* ---------------------------------------------------------------------- */ -ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : +ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : ImproperHarmonic(lmp) { suffix_flag |= Suffix::INTEL; @@ -81,8 +81,8 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void ImproperHarmonicIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + 
IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) ev_setup(eflag,vflag); else evflag = 0; @@ -90,14 +90,14 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag, if (evflag) { if (vflag && !eflag) { if (force->newton_bond) - eval<0,1,1>(vflag, buffers, fc); + eval<0,1,1>(vflag, buffers, fc); else - eval<0,1,0>(vflag, buffers, fc); + eval<0,1,0>(vflag, buffers, fc); } else { if (force->newton_bond) - eval<1,1,1>(vflag, buffers, fc); + eval<1,1,1>(vflag, buffers, fc); else - eval<1,1,0>(vflag, buffers, fc); + eval<1,1,0>(vflag, buffers, fc); } } else { if (force->newton_bond) @@ -110,9 +110,9 @@ void ImproperHarmonicIntel::compute(int eflag, int vflag, /* ---------------------------------------------------------------------- */ template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t> -void ImproperHarmonicIntel::eval(const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) +void ImproperHarmonicIntel::eval(const int vflag, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { const int inum = neighbor->nimproperlist; if (inum == 0) return; @@ -154,7 +154,7 @@ void ImproperHarmonicIntel::eval(const int vflag, if (fix->need_zero(tid)) memset(f, 0, f_stride * sizeof(FORCE_T)); - const int5_t * _noalias const improperlist = + const int5_t * _noalias const improperlist = (int5_t *) neighbor->improperlist[0]; #ifdef LMP_INTEL_USE_SIMDOFF @@ -221,22 +221,22 @@ void ImproperHarmonicIntel::eval(const int vflag, #ifndef LMP_INTEL_USE_SIMDOFF if (c > PTOLERANCE || c < MTOLERANCE) { int me; - MPI_Comm_rank(world,&me); - if (screen) { + MPI_Comm_rank(world,&me); + if (screen) { char str[128]; - sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " + sprintf(str,"Improper problem: %d " BIGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT " " TAGINT_FORMAT, me,update->ntimestep, atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]); - error->warning(FLERR,str,0); - fprintf(screen," 1st atom: %d %g %g %g\n", + error->warning(FLERR,str,0); + fprintf(screen," 1st atom: %d %g %g %g\n", me,x[i1].x,x[i1].y,x[i1].z); - fprintf(screen," 2nd atom: %d %g %g %g\n", + fprintf(screen," 2nd atom: %d %g %g %g\n", me,x[i2].x,x[i2].y,x[i2].z); - fprintf(screen," 3rd atom: %d %g %g %g\n", + fprintf(screen," 3rd atom: %d %g %g %g\n", me,x[i3].x,x[i3].y,x[i3].z); - fprintf(screen," 4th atom: %d %g %g %g\n", + fprintf(screen," 4th atom: %d %g %g %g\n", me,x[i4].x,x[i4].y,x[i4].z); } } @@ -296,43 +296,43 @@ void ImproperHarmonicIntel::eval(const int vflag, { if (NEWTON_BOND || i1 < nlocal) { f[i1].x += f1x; - f[i1].y += f1y; - f[i1].z += f1z; + f[i1].y += f1y; + f[i1].z += f1z; } - if (NEWTON_BOND || i2 < nlocal) { + if (NEWTON_BOND || i2 < nlocal) { f[i2].x += f2x; - f[i2].y += f2y; - f[i2].z += f2z; + f[i2].y += f2y; + f[i2].z += f2z; } if (NEWTON_BOND || i3 < nlocal) { f[i3].x += f3x; - f[i3].y += f3y; - f[i3].z += f3z; + f[i3].y += f3y; + f[i3].z += f3z; } if (NEWTON_BOND || i4 < nlocal) { f[i4].x += f4x; - f[i4].y += f4y; - f[i4].z += f4z; + f[i4].y += f4y; + f[i4].z += f4z; } } if (EFLAG || VFLAG) { #ifdef LMP_INTEL_USE_SIMDOFF IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, eimproper, i1, i2, - i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, - vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, + vb3x, vb3y, vb3z, seimproper, f, NEWTON_BOND, nlocal, sv0, sv1, sv2, sv3, sv4, sv5); #else IP_PRE_ev_tally_dihed(EFLAG, 
VFLAG, eatom, vflag, eimproper, i1, i2, - i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, - vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, + vb3x, vb3y, vb3z, oeimproper, f, NEWTON_BOND, nlocal, ov0, ov1, ov2, ov3, ov4, ov5); - #endif + #endif } } // for n #ifdef LMP_INTEL_USE_SIMDOFF @@ -346,7 +346,7 @@ void ImproperHarmonicIntel::eval(const int vflag, if (EFLAG) energy += oeimproper; if (VFLAG && vflag) { virial[0] += ov0; virial[1] += ov1; virial[2] += ov2; - virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; + virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; } fix->set_reduce_flag(); @@ -384,7 +384,7 @@ void ImproperHarmonicIntel::init_style() template <class flt_t, class acc_t> void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { const int bp1 = atom->nimpropertypes + 1; fc.set_ntypes(bp1,memory); @@ -399,11 +399,11 @@ void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper, - Memory *memory) { + Memory *memory) { if (nimproper != _nimpropertypes) { if (_nimpropertypes > 0) _memory->destroy(fc); - + if (nimproper > 0) _memory->create(fc,nimproper,"improperharmonicintel.fc"); } diff --git a/src/USER-INTEL/improper_harmonic_intel.h b/src/USER-INTEL/improper_harmonic_intel.h index 4e38383863..0b759b4e43 100644 --- a/src/USER-INTEL/improper_harmonic_intel.h +++ b/src/USER-INTEL/improper_harmonic_intel.h @@ -45,8 +45,8 @@ class ImproperHarmonicIntel : public ImproperHarmonic { void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t> - void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc); + void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index bacc8a8bad..3664bc248b 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -71,8 +71,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers() if (ev_global != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:alloc_if(0) free_if(1)) \ - nocopy(f_start:alloc_if(0) free_if(1)) \ - nocopy(ev_global:alloc_if(0) free_if(1)) + nocopy(f_start:alloc_if(0) free_if(1)) \ + nocopy(ev_global:alloc_if(0) free_if(1)) } if (q != 0) { @@ -105,8 +105,8 @@ void IntelBuffers<flt_t, acc_t>::free_buffers() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, - const int nthreads, - const int offload_end) + const int nthreads, + const int offload_end) { free_buffers(); _buf_size = static_cast<double>(nall) * 1.1 + 1; @@ -151,15 +151,15 @@ void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \ - nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ - nocopy(ev_global:length(8) alloc_if(1) free_if(0)) + nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ + nocopy(ev_global:length(8) 
alloc_if(1) free_if(0)) } } else { if (x != NULL && f_start != NULL && ev_global != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \ nocopy(f_start:length(f_stride*fm) alloc_if(1) free_if(0))\ - nocopy(ev_global:length(8) alloc_if(1) free_if(0)) + nocopy(ev_global:length(8) alloc_if(1) free_if(0)) } } if (lmp->atom->ellipsoid != NULL) { @@ -186,7 +186,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax() if (tag != 0 && special != 0 && nspecial !=0) { #pragma offload_transfer target(mic:_cop) \ nocopy(tag:alloc_if(0) free_if(1)) \ - nocopy(special,nspecial:alloc_if(0) free_if(1)) + nocopy(special,nspecial:alloc_if(0) free_if(1)) } _off_map_nmax = 0; _host_nmax = 0; @@ -261,7 +261,7 @@ void IntelBuffers<flt_t, acc_t>::free_list_local() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list, - const int offload_end) + const int offload_end) { free_list_local(); int size = list->get_maxlocal(); @@ -276,7 +276,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list, if (cnumneigh != 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ - nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ + nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) } _off_map_ilist = ilist; @@ -309,14 +309,14 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list, const int nlocal, - const int nthreads, - const int offload_end, - const int pack_width) + const int nthreads, + const int offload_end, + const int pack_width) { free_nbor_list(); _list_alloc_atoms = 1.10 * nlocal; int nt = MAX(nthreads, _off_threads); - int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * + int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * get_max_nbors(); lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc"); #ifdef _LMP_INTEL_OFFLOAD @@ -380,8 +380,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag, - const int nthreads, - const int width) + const int nthreads, + const int width) { #ifdef _LMP_INTEL_OFFLOAD if (_ccachex && off_flag && _off_ccache == 0) @@ -418,7 +418,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag, int *ccachej = _ccachej; if (ccachex != NULL && ccachey !=NULL && ccachez != NULL && - ccachew != NULL && ccachei != NULL && ccachej !=NULL) { + ccachew != NULL && ccachei != NULL && ccachej !=NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \ nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \ @@ -471,7 +471,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, - const int nthreads) + const int nthreads) { const int nsize = get_max_nbors() * 3; int esize = MIN(sizeof(int), sizeof(flt_t)); @@ -507,7 +507,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, int *ncachejtype = _ncachejtype; if (ncachex != NULL && ncachey !=NULL && ncachez != NULL && - ncachej != NULL && ncachejtype != NULL) { + ncachej != NULL && ncachejtype != NULL) { #pragma offload_transfer target(mic:_cop) \ nocopy(ncachex,ncachey:length(vsize) alloc_if(1) free_if(0)) \ 
nocopy(ncachez,ncachej:length(vsize) alloc_if(1) free_if(0)) \ @@ -522,9 +522,9 @@ void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, #ifndef _LMP_INTEL_OFFLOAD template <class flt_t, class acc_t> -void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, +void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, - acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) { IP_PRE_fdotr_acc_force_l5(lf, lt, 0, nthreads, _f, f_stride, _x, ov0, ov1, ov2, ov3, ov4, ov5); @@ -535,13 +535,13 @@ void IntelBuffers<flt_t, acc_t>::fdotr_reduce_l5(const int lf, const int lt, #ifndef _LMP_INTEL_OFFLOAD template <class flt_t, class acc_t> -void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, - const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, +void IntelBuffers<flt_t, acc_t>::fdotr_reduce(const int nall, + const int nthreads, const int f_stride, acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5) { int iifrom, iito, tid; IP_PRE_fdotr_acc_force(nall, 0, nthreads, _f, f_stride, _x, 0, 2, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); } #endif diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index 9b73a65f60..135309fe44 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -62,7 +62,7 @@ class IntelBuffers { void free_buffers(); void free_nmax(); - inline void set_bininfo(int *atombin, int *binpacked) + inline void set_bininfo(int *atombin, int *binpacked) { _atombin = atombin; _binpacked = binpacked; } inline void grow(const int nall, const int nlocal, const int nthreads, const int offload_end) { @@ -126,7 +126,7 @@ class IntelBuffers { inline void grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, const int offload_end, - const int pack_width) { + const int pack_width) { if (nlocal > _list_alloc_atoms) _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } @@ -165,7 +165,7 @@ class IntelBuffers { inline int get_off_threads() { return _off_threads; } #ifdef _LMP_INTEL_OFFLOAD inline void set_off_params(const int n, const int cop, - const int separate_buffers) + const int separate_buffers) { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; } inline vec3_acc_t * get_off_f() { return _off_f; } #endif @@ -191,17 +191,17 @@ class IntelBuffers { } #ifndef _LMP_INTEL_OFFLOAD - void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, - const int f_stride, acc_t &ov0, acc_t &ov1, - acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5); - void fdotr_reduce(const int nall, const int nthreads, const int f_stride, - acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, - acc_t &ov4, acc_t &ov5); + void fdotr_reduce_l5(const int lf, const int lt, const int nthreads, + const int f_stride, acc_t &ov0, acc_t &ov1, + acc_t &ov2, acc_t &ov3, acc_t &ov4, acc_t &ov5); + void fdotr_reduce(const int nall, const int nthreads, const int f_stride, + acc_t &ov0, acc_t &ov1, acc_t &ov2, acc_t &ov3, + acc_t &ov4, acc_t &ov5); #endif #ifdef _LMP_INTEL_OFFLOAD inline void thr_pack_cop(const int ifrom, const int ito, - const int offset, const bool dotype = false) { + const int offset, const bool dotype = false) { double ** x = lmp->atom->x + offset; if (dotype == false) { #pragma vector nontemporal @@ -214,16 +214,16 @@ class IntelBuffers { int *type = lmp->atom->type + offset; #pragma vector nontemporal for (int 
i = ifrom; i < ito; i++) { - _x[i].x = x[i][0]; - _x[i].y = x[i][1]; - _x[i].z = x[i][2]; - _x[i].w = type[i]; + _x[i].x = x[i][0]; + _x[i].y = x[i][1]; + _x[i].z = x[i][2]; + _x[i].w = type[i]; } } } inline void thr_pack_host(const int ifrom, const int ito, - const int offset) { + const int offset) { double ** x = lmp->atom->x + offset; for (int i = ifrom; i < ito; i++) { _host_x[i].x = x[i][0]; @@ -233,13 +233,13 @@ class IntelBuffers { } inline void pack_sep_from_single(const int host_min_local, - const int used_local, - const int host_min_ghost, - const int used_ghost) { + const int used_local, + const int host_min_ghost, + const int used_ghost) { memcpy(_host_x + host_min_local, _x + host_min_local, - used_local * sizeof(atom_t)); + used_local * sizeof(atom_t)); memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost, - used_ghost * sizeof(atom_t)); + used_ghost * sizeof(atom_t)); int nall = used_local + used_ghost + host_min_local; _host_x[nall].x = INTEL_BIGP; _host_x[nall].y = INTEL_BIGP; @@ -247,9 +247,9 @@ class IntelBuffers { _host_x[nall].w = 1; if (lmp->atom->q != NULL) { memcpy(_host_q + host_min_local, _q + host_min_local, - used_local * sizeof(flt_t)); + used_local * sizeof(flt_t)); memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost, - used_ghost * sizeof(flt_t)); + used_ghost * sizeof(flt_t)); } } @@ -310,7 +310,7 @@ class IntelBuffers { _alignvar(acc_t _ev_global_host[8],64); void _grow(const int nall, const int nlocal, const int nthreads, - const int offload_end); + const int offload_end); void _grow_nmax(const int offload_end); void _grow_list_local(NeighList *list, const int offload_end); void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, diff --git a/src/USER-INTEL/intel_intrinsics.h b/src/USER-INTEL/intel_intrinsics.h index 56b488aa20..069eb5bed5 100644 --- a/src/USER-INTEL/intel_intrinsics.h +++ b/src/USER-INTEL/intel_intrinsics.h @@ -46,23 +46,23 @@ struct lmp_intel_an_fvec { lmp_intel_an_fvec(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; } lmp_intel_an_fvec& operator =(const lmp_intel_an_fvec &a) { data[:] = a.data[:]; return *this; } const lmp_intel_an_fvec operator +(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] += b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] += b.data[:]; return ret; } const lmp_intel_an_fvec operator -(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] -= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] -= b.data[:]; return ret; } const lmp_intel_an_fvec operator *(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] *= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] *= b.data[:]; return ret; } const lmp_intel_an_fvec operator /(const lmp_intel_an_fvec &b) const { - lmp_intel_an_fvec ret = *this; - ret.data[:] /= b.data[:]; + lmp_intel_an_fvec ret = *this; + ret.data[:] /= b.data[:]; return ret; } lmp_intel_an_fvec& operator +=(const lmp_intel_an_fvec &b) { @@ -103,18 +103,18 @@ struct lmp_intel_an_ivec { explicit lmp_intel_an_ivec(int i) { data[:] = i; } explicit lmp_intel_an_ivec(const int * a) { data[:] = a[0:VL]; } const lmp_intel_an_ivec operator &(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] &= b.data[:]; + lmp_intel_an_ivec ret = *this; + ret.data[:] &= b.data[:]; return ret; } const lmp_intel_an_ivec operator |(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] |= b.data[:]; + lmp_intel_an_ivec 
ret = *this; + ret.data[:] |= b.data[:]; return ret; } const lmp_intel_an_ivec operator +(const lmp_intel_an_ivec &b) { - lmp_intel_an_ivec ret = *this; - ret.data[:] += b.data[:]; + lmp_intel_an_ivec ret = *this; + ret.data[:] += b.data[:]; return ret; } }; @@ -171,13 +171,13 @@ enum CalculationMode { KNC, AVX, AVX2, SSE, NONE, AN }; // This is used in the selection logic template<CalculationMode mode> -struct vector_traits { - static const bool support_integer_and_gather_ops = true; +struct vector_traits { + static const bool support_integer_and_gather_ops = true; }; template<> -struct vector_traits<AVX> { - static const bool support_integer_and_gather_ops = false; +struct vector_traits<AVX> { + static const bool support_integer_and_gather_ops = false; }; // This is the base template for all the different architectures @@ -198,10 +198,10 @@ struct ivec32x16 { } explicit ivec32x16(int i) { vec = _mm512_set1_epi32(i); } operator __m512i() const { return vec; } - friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { + friend ivec32x16 operator &(const ivec32x16 &a, const ivec32x16 &b) { return _mm512_and_epi32(a, b); } - friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { + friend ivec32x16 operator |(const ivec32x16 &a, const ivec32x16 &b) { return _mm512_or_epi32(a, b); } friend ivec32x16 operator +(const ivec32x16 &a, const ivec32x16 &b) { @@ -326,7 +326,7 @@ struct vector_ops<double, KNC> { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8); @@ -337,7 +337,7 @@ struct vector_ops<double, KNC> { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 48); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 56); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 8); @@ -464,7 +464,7 @@ struct vector_ops<float, KNC> { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); @@ -475,7 +475,7 @@ struct vector_ops<float, KNC> { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const 
char *>(base) + 4); @@ -519,10 +519,10 @@ struct ivec32x8 { } explicit ivec32x8(int i) { vec = _mm256_set1_epi32(i); } operator __m256i() const { return vec; } - friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { + friend ivec32x8 operator &(const ivec32x8 &a, const ivec32x8 &b) { return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } - friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { + friend ivec32x8 operator |(const ivec32x8 &a, const ivec32x8 &b) { return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } friend ivec32x8 operator +(const ivec32x8 &a, const ivec32x8 &b) { @@ -545,10 +545,10 @@ struct avx_bvec { operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator ivec32x8() const { return vec; } - friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { + friend avx_bvec operator &(const avx_bvec &a, const avx_bvec &b) { return _mm256_castpd_si256(_mm256_and_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } - friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { + friend avx_bvec operator |(const avx_bvec &a, const avx_bvec &b) { return _mm256_castpd_si256(_mm256_or_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))); } friend avx_bvec operator ~(const avx_bvec &a) { return _mm256_castpd_si256(_mm256_andnot_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(avx_bvec(0xFFFFFFFF)))); } @@ -582,8 +582,8 @@ struct vector_ops<double, AVX> { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_pd(reinterpret_cast<double*>(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) - ? *reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]) + result[i] = mask_test_at(mask, i) + ? 
*reinterpret_cast<const double*>(reinterpret_cast<const char*>(base) + scale * idxs[2*i]) : src[i]; } return _mm256_load_pd(reinterpret_cast<double*>(result)); @@ -605,18 +605,18 @@ struct vector_ops<double, AVX> { __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm256_castps_si256(_mm256_permute_ps(_mm256_castpd_ps(c3), 0xA0))); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { iarr i, m; _mm256_store_si256(reinterpret_cast<__m256i*>(i), idxs); @@ -642,10 +642,10 @@ struct vector_ops<double, AVX> { __m256d c1 = _mm256_permute2f128_pd(b1, b3, 0x20); __m256d c2 = _mm256_permute2f128_pd(b0, b2, 0x31); __m256d c3 = _mm256_permute2f128_pd(b1, b3, 0x31); - *r0 = blend(mask, *r0, c0); - *r1 = blend(mask, *r1, c1); - *r2 = blend(mask, *r2, c2); - *r3 = blend(mask, *r3, c3); + *r0 = blend(mask, *r0, c0); + *r1 = blend(mask, *r1, c1); + *r2 = blend(mask, *r2, c2); + *r3 = blend(mask, *r3, c3); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return (b & mask) | (a & ~ mask); @@ -809,8 +809,8 @@ struct vector_ops<float, AVX> { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_ps(reinterpret_cast<float*>(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) - ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) + result[i] = mask_test_at(mask, i) + ? 
*reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) : src[i]; } return _mm256_load_ps(reinterpret_cast<float*>(result)); @@ -842,18 +842,18 @@ struct vector_ops<float, AVX> { __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm256_castps_si256(c3)); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { iarr i, m; int_store(i, idxs); @@ -880,10 +880,10 @@ struct vector_ops<float, AVX> { __m256 c1 = _mm256_shuffle_ps(b0, b2, 0xEE); __m256 c2 = _mm256_shuffle_ps(b1, b3, 0x44); __m256 c3 = _mm256_shuffle_ps(b1, b3, 0xEE); - *r0 = blend(mask, *r0, c0); - *r1 = blend(mask, *r1, c1); - *r2 = blend(mask, *r2, c2); - *r3 = blend(mask, *r3, c3); + *r0 = blend(mask, *r0, c0); + *r1 = blend(mask, *r1, c1); + *r2 = blend(mask, *r2, c2); + *r3 = blend(mask, *r3, c3); } static fvec blend(const bvec &mask, const fvec &a, const fvec &b) { return (b & mask) | (a & ~ mask); @@ -961,8 +961,8 @@ struct vector_ops<float, AVX> { _mm256_store_si256(reinterpret_cast<__m256i*>(idxs), idx); _mm256_store_si256(reinterpret_cast<__m256i*>(src), from); for (int i = 0; i < VL; i++) { - result[i] = mask_test_at(mask, i) - ? *reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) + result[i] = mask_test_at(mask, i) + ? 
*reinterpret_cast<const int*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) : src[i]; } return _mm256_load_si256(reinterpret_cast<__m256i*>(result)); @@ -1038,10 +1038,10 @@ struct avx2_ivec32 { } explicit avx2_ivec32(int i) { vec = _mm256_set1_epi32(i); } operator __m256i() const { return vec; } - friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) { + friend avx2_ivec32 operator &(const avx2_ivec32 &a, const avx2_ivec32 &b) { return _mm256_and_si256(a, b); } - friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) { + friend avx2_ivec32 operator |(const avx2_ivec32 &a, const avx2_ivec32 &b) { return _mm256_or_si256(a, b); } friend avx2_ivec32 operator +(const avx2_ivec32 &a, const avx2_ivec32 &b) { @@ -1060,14 +1060,14 @@ struct avx2_bvec { operator F64vec4() const { return _mm256_castsi256_pd(vec); } operator F32vec8() const { return _mm256_castsi256_ps(vec); } operator avx2_ivec32() const { return vec; } - friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) { + friend avx2_bvec operator &(const avx2_bvec &a, const avx2_bvec &b) { return _mm256_and_si256(a, b); } - friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) { + friend avx2_bvec operator |(const avx2_bvec &a, const avx2_bvec &b) { return _mm256_or_si256(a, b); } friend avx2_bvec operator ~(const avx2_bvec &a) { - return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF)); + return _mm256_andnot_si256(a, avx2_bvec(0xFFFFFFFF)); } avx2_bvec& operator &=(const avx2_bvec &a) { return *this = _mm256_and_si256(vec,a); } }; @@ -1106,13 +1106,13 @@ struct vector_ops<double, AVX2> { *z = _mm256_mask_i32gather_pd(*z, &base->z, _mm256_castsi256_si128(idx1), mask, 1); *w = _mm256_mask_i32gather_epi32(*w, &base->w, idx, mask, 1); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idx, const bvec &mask, const void *base, + static void gather_4(const ivec &idx, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { ivec idx0 = _mm256_shuffle_epi32(idx, 0xD8); // 11011000 ->3120 ivec idx1 = _mm256_permute4x64_epi64(idx0, 0xD8); @@ -1253,7 +1253,7 @@ struct vector_ops<float, AVX2> { *z = _mm256_mask_i32gather_ps(*z, reinterpret_cast<const float*>(base) + 2, idx, mask, 1); *w = _mm256_mask_i32gather_epi32(*w, reinterpret_cast<const int*>(base) + 3, idx, mask, 1); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); @@ -1264,7 +1264,7 @@ struct vector_ops<float, AVX2> { *r6 = gather<4>(*r6, mask, idxs, reinterpret_cast<const char *>(base) + 24); *r7 = gather<4>(*r7, mask, idxs, reinterpret_cast<const char *>(base) + 28); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = 
gather<4>(*r0, mask, idxs, reinterpret_cast<const char *>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char *>(base) + 4); @@ -1401,10 +1401,10 @@ struct ivec32x4 { } explicit ivec32x4(int i) { vec = _mm_set1_epi32(i); } operator __m128i() const { return vec; } - friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) { + friend ivec32x4 operator &(const ivec32x4 &a, const ivec32x4 &b) { return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } - friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) { + friend ivec32x4 operator |(const ivec32x4 &a, const ivec32x4 &b) { return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } friend ivec32x4 operator +(const ivec32x4 &a, const ivec32x4 &b) { @@ -1420,10 +1420,10 @@ struct sse_bvecx4 { operator __m128i() const { return vec; } operator F64vec2() const { return _mm_castsi128_pd(vec); } operator ivec32x4() const { return vec; } - friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) { + friend sse_bvecx4 operator &(const sse_bvecx4 &a, const sse_bvecx4 &b) { return _mm_castpd_si128(_mm_and_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } - friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) { + friend sse_bvecx4 operator |(const sse_bvecx4 &a, const sse_bvecx4 &b) { return _mm_castpd_si128(_mm_or_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b))); } friend sse_bvecx4 operator ~(const sse_bvecx4 &a) { return _mm_castpd_si128(_mm_andnot_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(sse_bvecx4(0xFFFFFFFF)))); } @@ -1477,18 +1477,18 @@ struct vector_ops<double, SSE> { __m128d c1 = _mm_unpackhi_pd(a0lo, a1lo); __m128d c2 = _mm_unpacklo_pd(a0hi, a1hi); __m128d c3 = _mm_unpackhi_pd(a0hi, a1hi); - *x = blend(mask, *x, c0); - *y = blend(mask, *y, c1); - *z = blend(mask, *z, c2); + *x = blend(mask, *x, c0); + *y = blend(mask, *y, c1); + *z = blend(mask, *z, c2); *w = int_blend(mask, *w, _mm_shuffle_epi32(_mm_castpd_si128(c3), 0xA0)); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 32, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 8); @@ -1634,8 +1634,8 @@ struct vector_ops<float, SSE> { _mm_store_si128(reinterpret_cast<__m128i*>(idxs), idx); _mm_store_ps(reinterpret_cast<float*>(src), from); for (int i = 0; i < VL; i++) { - result[i] = m[i] - ? *reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) + result[i] = m[i] + ? 
*reinterpret_cast<const float*>(reinterpret_cast<const char*>(base) + scale * idxs[i]) : src[i]; } return _mm_load_ps(reinterpret_cast<float*>(result)); @@ -1647,13 +1647,13 @@ struct vector_ops<float, SSE> { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 16, r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 4); @@ -1816,13 +1816,13 @@ struct vector_ops<flt_t, NONE> { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal)); @@ -1946,13 +1946,13 @@ struct vector_ops<flt_t, AN> { *z = gather<1>(*z, mask, idxs, &base->z); *w = int_gather<1>(*w, mask, idxs, &base->w); } - static void gather_8(const ivec &idxs, const bvec &mask, const void *base, + static void gather_8(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3, fvec *r4, fvec *r5, fvec *r6, fvec *r7) { fvec a = zero(), b = zero(), c = zero(), d = zero(); gather_4(idxs, mask, base, r0, r1, r2, r3); gather_4(idxs, mask, reinterpret_cast<const char*>(base) + 4 * sizeof(fscal), r4, r5, r6, r7); } - static void gather_4(const ivec &idxs, const bvec &mask, const void *base, + static void gather_4(const ivec &idxs, const bvec &mask, const void *base, fvec *r0, fvec *r1, fvec *r2, fvec *r3) { *r0 = gather<4>(*r0, mask, idxs, reinterpret_cast<const char*>(base) + 0 * sizeof(fscal)); *r1 = gather<4>(*r1, mask, idxs, reinterpret_cast<const char*>(base) + 1 * sizeof(fscal)); @@ -2113,7 +2113,7 @@ struct AccumulatorTwiceMixin { typedef avec_t avec; typedef typename HIGH::fscal aarr[BASE::VL] __attribute__((aligned(BASE::ALIGN))); - + static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) { typename HIGH::fvec blo = BASE::cvtup_lo(b); typename HIGH::fvec bhi = BASE::cvtup_hi(b); @@ -2121,7 +2121,7 @@ struct AccumulatorTwiceMixin { BASE::mask_cvtup(m, &mlo, &mhi); return avec(HIGH::mask_add(src.lo, mlo, a.lo, blo), HIGH::mask_add(src.hi, mhi, a.hi, bhi)); } - + static typename HIGH::fscal 
acc_reduce_add(const avec &a) { return HIGH::reduce_add(a.lo + a.hi); } @@ -2143,13 +2143,13 @@ template<class BASE_flt_t, class HIGH_flt_t, CalculationMode mic> struct AccumulatorTwiceMixinNone { typedef vector_ops<BASE_flt_t, mic> BASE; typedef vector_ops<HIGH_flt_t, mic> HIGH; - + typedef typename HIGH::fvec avec; typedef typename HIGH::fscal aarr[BASE::VL]; - + static avec acc_mask_add(const avec &src, const typename BASE::bvec &m, const avec &a, const typename BASE::fvec &b) { return HIGH::mask_add(src, m, a, static_cast<typename HIGH::fvec>(b)); - } + } static typename HIGH::fscal acc_reduce_add(const avec &a) { return HIGH::reduce_add(a); } diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index 93787cd6c8..d5cf6f5be2 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -134,374 +134,374 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_HTHREADS 2 #endif -#define IP_PRE_get_stride(stride, n, datasize, torque) \ - { \ - int blength = n; \ - if (torque) blength *= 2; \ - const int bytes = blength * datasize; \ +#define IP_PRE_get_stride(stride, n, datasize, torque) \ + { \ + int blength = n; \ + if (torque) blength *= 2; \ + const int bytes = blength * datasize; \ stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN); \ - stride = blength + stride / datasize; \ + stride = blength + stride / datasize; \ } #if defined(_OPENMP) -#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ - { \ - int idelta = inum/nthreads; \ - const int imod = inum % nthreads; \ - ifrom = tid * idelta; \ - ito = ifrom + idelta; \ - if (tid < imod) { \ - ito+=tid+1; \ - ifrom+=tid; \ - } else { \ - ito+=imod; \ - ifrom+=imod; \ - } \ +#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ + { \ + int idelta = inum/nthreads; \ + const int imod = inum % nthreads; \ + ifrom = tid * idelta; \ + ito = ifrom + idelta; \ + if (tid < imod) { \ + ito+=tid+1; \ + ifrom+=tid; \ + } else { \ + ito+=imod; \ + ifrom+=imod; \ + } \ } -#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ - { \ - tid = omp_get_thread_num(); \ - IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \ +#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ + { \ + tid = omp_get_thread_num(); \ + IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \ } -#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \ - { \ - if (nthr <= INTEL_HTHREADS) { \ - ifrom = tid; \ - ito = inum; \ - ip = nthr; \ - } else if (nthr % INTEL_HTHREADS == 0) { \ - int nd = nthr / INTEL_HTHREADS; \ - int td = tid / INTEL_HTHREADS; \ - int tm = tid % INTEL_HTHREADS; \ - IP_PRE_omp_range(ifrom, ito, td, inum, nd); \ - ifrom += tm; \ - ip = INTEL_HTHREADS; \ - } else { \ - IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \ - ip = 1; \ - } \ +#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \ + { \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid; \ + ito = inum; \ + ip = nthr; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ + IP_PRE_omp_range(ifrom, ito, td, inum, nd); \ + ifrom += tm; \ + ip = INTEL_HTHREADS; \ + } else { \ + IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \ + ip = 1; \ + } \ } -#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ - { \ - tid = omp_get_thread_num(); \ - IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \ +#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + 
tid = omp_get_thread_num(); \ + IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \ } #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ int chunk_size = INTEL_DATA_ALIGN / datasize; \ - int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ - /chunk_size/nthreads)); \ - idelta *= chunk_size; \ + int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ + /chunk_size/nthreads)); \ + idelta *= chunk_size; \ ifrom = tid*idelta; \ ito = ifrom + idelta; \ if (ito > inum) ito = inum; \ } #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \ - nthreads, datasize) \ - { \ - tid = omp_get_thread_num(); \ + nthreads, datasize) \ + { \ + tid = omp_get_thread_num(); \ IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ - datasize); \ + datasize); \ } #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \ - nthreads, vecsize) \ - { \ - tid = omp_get_thread_num(); \ - int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ - /vecsize/nthreads)); \ - idelta *= vecsize; \ - ifrom = tid*idelta; \ - ito = ifrom + idelta; \ - if (ito > inum) ito = inum; \ + nthreads, vecsize) \ + { \ + tid = omp_get_thread_num(); \ + int idelta = static_cast<int>(ceil(static_cast<float>(inum) \ + /vecsize/nthreads)); \ + idelta *= vecsize; \ + ifrom = tid*idelta; \ + ito = ifrom + idelta; \ + if (ito > inum) ito = inum; \ } -#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ - nthr, vecsize) \ - { \ - tid = omp_get_thread_num(); \ - if (nthr <= INTEL_HTHREADS) { \ - ifrom = tid*vecsize; \ - ito = inum; \ - ip = nthr*vecsize; \ - } else if (nthr % INTEL_HTHREADS == 0) { \ - int nd = nthr / INTEL_HTHREADS; \ - int td = tid / INTEL_HTHREADS; \ - int tm = tid % INTEL_HTHREADS; \ +#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \ + nthr, vecsize) \ + { \ + tid = omp_get_thread_num(); \ + if (nthr <= INTEL_HTHREADS) { \ + ifrom = tid*vecsize; \ + ito = inum; \ + ip = nthr*vecsize; \ + } else if (nthr % INTEL_HTHREADS == 0) { \ + int nd = nthr / INTEL_HTHREADS; \ + int td = tid / INTEL_HTHREADS; \ + int tm = tid % INTEL_HTHREADS; \ IP_PRE_omp_range_id_vec(ifrom, ito, td, inum, nd, \ - vecsize); \ - ifrom += tm * vecsize; \ - ip = INTEL_HTHREADS * vecsize; \ - } else { \ + vecsize); \ + ifrom += tm * vecsize; \ + ip = INTEL_HTHREADS * vecsize; \ + } else { \ IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, nthr, \ - vecsize); \ - ip = vecsize; \ - } \ + vecsize); \ + ip = vecsize; \ + } \ } #else -#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ - { \ - ifrom = 0; \ - ito = inum; \ +#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ } -#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ - { \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ +#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ } -#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \ - { \ - ifrom = 0; \ - ito = inum; \ - ip = 1; \ +#define IP_PRE_omp_range(ifrom, ip, ito, tid, inum, nthreads) \ + { \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ } -#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ - { \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ - ip = 1; \ +#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = 1; \ } #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \ datasize) \ { \ - ifrom = 0; \ - ito = inum; \ + ifrom = 0; \ + ito = inum; \ } #define 
IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \ - nthreads, datasize) \ -{ \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ + nthreads, datasize) \ +{ \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ } #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \ - nthreads, vecsize) \ - { \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ } -#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \ - nthreads, vecsize) \ - { \ - tid = 0; \ - ifrom = 0; \ - ito = inum; \ - ip = vecsize; \ +#define IP_PRE_omp_range_id_vec(ifrom, ip, ito, tid, inum, \ + nthreads, vecsize) \ + { \ + tid = 0; \ + ifrom = 0; \ + ito = inum; \ + ip = vecsize; \ } #endif -#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ - f_stride, pos, ov0, ov1, ov2, \ - ov3, ov4, ov5) \ -{ \ - acc_t *f_scalar = &f_start[0].x; \ - flt_t *x_scalar = &pos[minlocal].x; \ - int f_stride4 = f_stride * 4; \ - _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \ - int vwidth; \ - if (sizeof(acc_t) == sizeof(double)) \ - vwidth = INTEL_COMPILE_WIDTH/2; \ - else \ - vwidth = INTEL_COMPILE_WIDTH; \ - if (vwidth < 4) vwidth = 4; \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ - int remainder = lt % vwidth; \ - if (lf > lt) remainder = 0; \ - const int v_range = lt - remainder; \ - if (nthreads == 2) { \ - acc_t *f_scalar2 = f_scalar + f_stride4; \ - for (int n = lf; n < v_range; n += vwidth) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int v = 0; v < vwidth; v++) { \ - f_scalar[n+v] += f_scalar2[n+v]; \ - ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ - } \ - ov3 += f_scalar[n+1] * x_scalar[n+0]; \ - ov4 += f_scalar[n+2] * x_scalar[n+0]; \ - ov5 += f_scalar[n+2] * x_scalar[n+1]; \ - if (vwidth > 4) { \ - ov3 += f_scalar[n+5] * x_scalar[n+4]; \ - ov4 += f_scalar[n+6] * x_scalar[n+4]; \ - ov5 += f_scalar[n+6] * x_scalar[n+5]; \ - } \ - if (vwidth > 8) { \ - ov3 += f_scalar[n+9] * x_scalar[n+8]; \ - ov3 += f_scalar[n+13] * x_scalar[n+12]; \ - ov4 += f_scalar[n+10] * x_scalar[n+8]; \ - ov4 += f_scalar[n+14] * x_scalar[n+12]; \ - ov5 += f_scalar[n+10] * x_scalar[n+9]; \ - ov5 += f_scalar[n+14] * x_scalar[n+13]; \ - } \ - } \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("ivdep") \ - _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ - for (int n = v_range; n < lt; n++) \ - f_scalar[n] += f_scalar2[n]; \ - } else if (nthreads==4) { \ - acc_t *f_scalar2 = f_scalar + f_stride4; \ - acc_t *f_scalar3 = f_scalar2 + f_stride4; \ - acc_t *f_scalar4 = f_scalar3 + f_stride4; \ - for (int n = lf; n < v_range; n += vwidth) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int v = 0; v < vwidth; v++) { \ - f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ - f_scalar4[n+v]; \ - ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ - } \ - ov3 += f_scalar[n+1] * x_scalar[n+0]; \ - ov4 += f_scalar[n+2] * x_scalar[n+0]; \ - ov5 += f_scalar[n+2] * x_scalar[n+1]; \ - if (vwidth > 4) { \ - ov3 += f_scalar[n+5] * x_scalar[n+4]; \ - ov4 += f_scalar[n+6] * x_scalar[n+4]; \ - ov5 += f_scalar[n+6] * x_scalar[n+5]; \ - } \ - if (vwidth > 8) { \ - ov3 += f_scalar[n+9] * x_scalar[n+8]; \ - ov3 += f_scalar[n+13] * x_scalar[n+12]; \ - ov4 += f_scalar[n+10] * x_scalar[n+8]; \ - ov4 += f_scalar[n+14] * x_scalar[n+12]; \ - ov5 += f_scalar[n+10] * x_scalar[n+9]; \ - ov5 += f_scalar[n+14] * x_scalar[n+13]; \ - } \ - } \ - 
_use_simd_pragma("vector aligned") \ - _use_simd_pragma("ivdep") \ - _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ - for (int n = v_range; n < lt; n++) \ - f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ - } else if (nthreads==1) { \ - for (int n = lf; n < v_range; n += vwidth) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int v = 0; v < vwidth; v++) \ - ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ - ov3 += f_scalar[n+1] * x_scalar[n+0]; \ - ov4 += f_scalar[n+2] * x_scalar[n+0]; \ - ov5 += f_scalar[n+2] * x_scalar[n+1]; \ - if (vwidth > 4) { \ - ov3 += f_scalar[n+5] * x_scalar[n+4]; \ - ov4 += f_scalar[n+6] * x_scalar[n+4]; \ - ov5 += f_scalar[n+6] * x_scalar[n+5]; \ - } \ - if (vwidth > 8) { \ - ov3 += f_scalar[n+9] * x_scalar[n+8]; \ - ov3 += f_scalar[n+13] * x_scalar[n+12]; \ - ov4 += f_scalar[n+10] * x_scalar[n+8]; \ - ov4 += f_scalar[n+14] * x_scalar[n+12]; \ - ov5 += f_scalar[n+10] * x_scalar[n+9]; \ - ov5 += f_scalar[n+14] * x_scalar[n+13]; \ - } \ - } \ - } else if (nthreads==3) { \ - acc_t *f_scalar2 = f_scalar + f_stride4; \ - acc_t *f_scalar3 = f_scalar2 + f_stride4; \ - for (int n = lf; n < v_range; n += vwidth) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int v = 0; v < vwidth; v++) { \ - f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ - ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ - } \ - ov3 += f_scalar[n+1] * x_scalar[n+0]; \ - ov4 += f_scalar[n+2] * x_scalar[n+0]; \ - ov5 += f_scalar[n+2] * x_scalar[n+1]; \ - if (vwidth > 4) { \ - ov3 += f_scalar[n+5] * x_scalar[n+4]; \ - ov4 += f_scalar[n+6] * x_scalar[n+4]; \ - ov5 += f_scalar[n+6] * x_scalar[n+5]; \ - } \ - if (vwidth > 8) { \ - ov3 += f_scalar[n+9] * x_scalar[n+8]; \ - ov3 += f_scalar[n+13] * x_scalar[n+12]; \ - ov4 += f_scalar[n+10] * x_scalar[n+8]; \ - ov4 += f_scalar[n+14] * x_scalar[n+12]; \ - ov5 += f_scalar[n+10] * x_scalar[n+9]; \ - ov5 += f_scalar[n+14] * x_scalar[n+13]; \ - } \ - } \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("ivdep") \ - _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ - for (int n = v_range; n < lt; n++) \ - f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ - } \ - for (int n = v_range; n < lt; n += 4) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("ivdep") \ - for (int v = 0; v < 4; v++) \ - ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ - ov3 += f_scalar[n+1] * x_scalar[n+0]; \ - ov4 += f_scalar[n+2] * x_scalar[n+0]; \ - ov5 += f_scalar[n+2] * x_scalar[n+1]; \ - } \ - ov0 += ovv[0]; \ - ov1 += ovv[1]; \ - ov2 += ovv[2]; \ - if (vwidth > 4) { \ - ov0 += ovv[4]; \ - ov1 += ovv[5]; \ - ov2 += ovv[6]; \ - } \ - if (vwidth > 8) { \ - ov0 += ovv[8] + ovv[12]; \ - ov1 += ovv[9] + ovv[13]; \ - ov2 += ovv[10] + ovv[14]; \ - } \ +#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \ + f_stride, pos, ov0, ov1, ov2, \ + ov3, ov4, ov5) \ +{ \ + acc_t *f_scalar = &f_start[0].x; \ + flt_t *x_scalar = &pos[minlocal].x; \ + int f_stride4 = f_stride * 4; \ + _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \ + int vwidth; \ + if (sizeof(acc_t) == sizeof(double)) \ + vwidth = INTEL_COMPILE_WIDTH/2; \ + else \ + vwidth = INTEL_COMPILE_WIDTH; \ + if (vwidth < 4) vwidth = 4; \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \ + int remainder = lt % vwidth; \ + if (lf > lt) remainder = 0; \ + const int v_range = lt - remainder; \ + if (nthreads == 2) { \ + acc_t *f_scalar2 = f_scalar + 
f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n]; \ + } else if (nthreads==4) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + acc_t *f_scalar4 = f_scalar3 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \ + f_scalar4[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \ + } else if (nthreads==1) { \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + } else if (nthreads==3) { \ + acc_t *f_scalar2 = f_scalar + f_stride4; \ + acc_t *f_scalar3 = f_scalar2 + f_stride4; \ + for (int n = lf; n < v_range; n += vwidth) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int v = 0; v < vwidth; v++) { \ + f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + } \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * 
x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + if (vwidth > 4) { \ + ov3 += f_scalar[n+5] * x_scalar[n+4]; \ + ov4 += f_scalar[n+6] * x_scalar[n+4]; \ + ov5 += f_scalar[n+6] * x_scalar[n+5]; \ + } \ + if (vwidth > 8) { \ + ov3 += f_scalar[n+9] * x_scalar[n+8]; \ + ov3 += f_scalar[n+13] * x_scalar[n+12]; \ + ov4 += f_scalar[n+10] * x_scalar[n+8]; \ + ov4 += f_scalar[n+14] * x_scalar[n+12]; \ + ov5 += f_scalar[n+10] * x_scalar[n+9]; \ + ov5 += f_scalar[n+14] * x_scalar[n+13]; \ + } \ + } \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \ + for (int n = v_range; n < lt; n++) \ + f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \ + } \ + for (int n = v_range; n < lt; n += 4) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("ivdep") \ + for (int v = 0; v < 4; v++) \ + ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \ + ov3 += f_scalar[n+1] * x_scalar[n+0]; \ + ov4 += f_scalar[n+2] * x_scalar[n+0]; \ + ov5 += f_scalar[n+2] * x_scalar[n+1]; \ + } \ + ov0 += ovv[0]; \ + ov1 += ovv[1]; \ + ov2 += ovv[2]; \ + if (vwidth > 4) { \ + ov0 += ovv[4]; \ + ov1 += ovv[5]; \ + ov2 += ovv[6]; \ + } \ + if (vwidth > 8) { \ + ov0 += ovv[8] + ovv[12]; \ + ov1 += ovv[9] + ovv[13]; \ + ov2 += ovv[10] + ovv[14]; \ + } \ } -#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ - f_stride, pos, offload, vflag, ov0, ov1, \ - ov2, ov3, ov4, ov5) \ -{ \ - int o_range = (nall - minlocal) * 4; \ - IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ - sizeof(acc_t)); \ - \ - acc_t *f_scalar = &f_start[0].x; \ - int f_stride4 = f_stride * 4; \ - int t; \ - if (vflag == 2) t = 4; else t = 1; \ - acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ - for ( ; t < nthreads; t++) { \ - _use_simd_pragma("vector aligned") \ - _use_simd_pragma("simd") \ - for (int n = iifrom; n < iito; n++) \ - f_scalar[n] += f_scalar2[n]; \ - f_scalar2 += f_stride4; \ - } \ - \ - if (vflag == 2) { \ - int nt_min = MIN(4,nthreads); \ - IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ - f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ - ov5); \ - } \ +#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, pos, offload, vflag, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ +{ \ + int o_range = (nall - minlocal) * 4; \ + IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \ + sizeof(acc_t)); \ + \ + acc_t *f_scalar = &f_start[0].x; \ + int f_stride4 = f_stride * 4; \ + int t; \ + if (vflag == 2) t = 4; else t = 1; \ + acc_t *f_scalar2 = f_scalar + f_stride4 * t; \ + for ( ; t < nthreads; t++) { \ + _use_simd_pragma("vector aligned") \ + _use_simd_pragma("simd") \ + for (int n = iifrom; n < iito; n++) \ + f_scalar[n] += f_scalar2[n]; \ + f_scalar2 += f_stride4; \ + } \ + \ + if (vflag == 2) { \ + int nt_min = MIN(4,nthreads); \ + IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \ + f_stride, pos, ov0, ov1, ov2, ov3, ov4, \ + ov5); \ + } \ } #ifdef _LMP_INTEL_OFFLOAD @@ -517,131 +517,131 @@ inline double MIC_Wtime() { return time; } -#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ - nlocal, nall) \ -{ \ - if (fix->separate_buffers() && ago != 0) { \ - fix->start_watch(TIME_PACK); \ - if (offload) { \ - int packthreads; \ +#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ + nlocal, nall) \ +{ \ + if (fix->separate_buffers() && ago != 0) { \ + fix->start_watch(TIME_PACK); \ + if (offload) { \ + int packthreads; \ if (comm->nthreads > 
INTEL_HTHREADS) packthreads = comm->nthreads;\ - else packthreads = 1; \ - _use_omp_pragma("omp parallel if(packthreads > 1)") \ - { \ - int ifrom, ito, tid; \ - IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \ - packthreads, sizeof(flt_t)); \ - buffers->thr_pack_cop(ifrom, ito, 0); \ - int nghost = nall - nlocal; \ - if (nghost) { \ - IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \ - packthreads, sizeof(flt_t)); \ - buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \ - fix->offload_min_ghost() - nlocal, \ - ago == 1); \ - } \ - } \ - } else { \ - buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \ - buffers->thr_pack_host(nlocal, nall, \ - fix->host_min_ghost()-nlocal); \ - } \ - fix->stop_watch(TIME_PACK); \ - } \ + else packthreads = 1; \ + _use_omp_pragma("omp parallel if(packthreads > 1)") \ + { \ + int ifrom, ito, tid; \ + IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \ + packthreads, sizeof(flt_t)); \ + buffers->thr_pack_cop(ifrom, ito, 0); \ + int nghost = nall - nlocal; \ + if (nghost) { \ + IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \ + packthreads, sizeof(flt_t)); \ + buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \ + fix->offload_min_ghost() - nlocal, \ + ago == 1); \ + } \ + } \ + } else { \ + buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \ + buffers->thr_pack_host(nlocal, nall, \ + fix->host_min_ghost()-nlocal); \ + } \ + fix->stop_watch(TIME_PACK); \ + } \ } -#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ - buffers, offload, fix, separate_flag, \ - x_size, q_size, ev_size, f_stride) \ -{ \ - separate_flag = 0; \ - if (ago == 0) { \ - x_size = 0; \ - q_size = nall; \ - if (offload) { \ - if (fix->separate_buffers()) { \ - if (lmp->atom->torque) \ - separate_flag = 2; \ - else \ - separate_flag = 1; \ - } else \ - separate_flag = 3; \ - } \ - } else { \ - x_size = nall; \ - q_size = 0; \ - } \ - ev_size = 0; \ - if (eflag) ev_size = 2; \ - if (vflag) ev_size = 8; \ - if (newton) \ - f_stride = buffers->get_stride(nall); \ - else \ - f_stride = buffers->get_stride(inum); \ +#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ + buffers, offload, fix, separate_flag, \ + x_size, q_size, ev_size, f_stride) \ +{ \ + separate_flag = 0; \ + if (ago == 0) { \ + x_size = 0; \ + q_size = nall; \ + if (offload) { \ + if (fix->separate_buffers()) { \ + if (lmp->atom->torque) \ + separate_flag = 2; \ + else \ + separate_flag = 1; \ + } else \ + separate_flag = 3; \ + } \ + } else { \ + x_size = nall; \ + q_size = 0; \ + } \ + ev_size = 0; \ + if (eflag) ev_size = 2; \ + if (vflag) ev_size = 8; \ + if (newton) \ + f_stride = buffers->get_stride(nall); \ + else \ + f_stride = buffers->get_stride(inum); \ } -#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ - ev_global) \ -{ \ - if (offload) { \ - tc = buffers->get_off_threads(); \ - f_start = buffers->get_off_f(); \ - ev_global = buffers->get_ev_global(); \ - } else { \ - tc = comm->nthreads; \ - f_start = buffers->get_f(); \ - fix->start_watch(TIME_HOST_PAIR); \ - ev_global = buffers->get_ev_global_host(); \ - } \ +#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ + ev_global) \ +{ \ + if (offload) { \ + tc = buffers->get_off_threads(); \ + f_start = buffers->get_off_f(); \ + ev_global = buffers->get_ev_global(); \ + } else { \ + tc = comm->nthreads; \ + f_start = buffers->get_f(); \ + fix->start_watch(TIME_HOST_PAIR); \ + ev_global = buffers->get_ev_global_host(); \ + } \ } -#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, 
nall, \ - f_stride, x, q) \ -{ \ - if (separate_flag) { \ - if (separate_flag < 3) { \ - int all_local = nlocal; \ - int ghost_min = overflow[LMP_GHOST_MIN]; \ - nlocal = overflow[LMP_LOCAL_MAX] + 1; \ - int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \ - if (nghost < 0) nghost = 0; \ - nall = nlocal + nghost; \ - separate_flag--; \ - int flength; \ - if (newton) flength = nall; \ - else flength = nlocal; \ - IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \ - separate_flag); \ - if (nghost) { \ - if (nlocal < all_local || ghost_min > all_local) { \ - memmove(x + nlocal, x + ghost_min, \ - (nall - nlocal) * sizeof(ATOM_T)); \ - if (q != 0) \ - memmove((void *)(q + nlocal), (void *)(q + ghost_min), \ - (nall - nlocal) * sizeof(flt_t)); \ - } \ - } \ - } \ - x[nall].x = INTEL_BIGP; \ - x[nall].y = INTEL_BIGP; \ - x[nall].z = INTEL_BIGP; \ - } \ +#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ + f_stride, x, q) \ +{ \ + if (separate_flag) { \ + if (separate_flag < 3) { \ + int all_local = nlocal; \ + int ghost_min = overflow[LMP_GHOST_MIN]; \ + nlocal = overflow[LMP_LOCAL_MAX] + 1; \ + int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \ + if (nghost < 0) nghost = 0; \ + nall = nlocal + nghost; \ + separate_flag--; \ + int flength; \ + if (newton) flength = nall; \ + else flength = nlocal; \ + IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \ + separate_flag); \ + if (nghost) { \ + if (nlocal < all_local || ghost_min > all_local) { \ + memmove(x + nlocal, x + ghost_min, \ + (nall - nlocal) * sizeof(ATOM_T)); \ + if (q != 0) \ + memmove((void *)(q + nlocal), (void *)(q + ghost_min), \ + (nall - nlocal) * sizeof(flt_t)); \ + } \ + } \ + } \ + x[nall].x = INTEL_BIGP; \ + x[nall].y = INTEL_BIGP; \ + x[nall].z = INTEL_BIGP; \ + } \ } -#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ - f_start, f_stride, x, offload, vflag, \ - ov0, ov1, ov2, ov3, ov4, ov5) \ -{ \ - if (newton) { \ - _use_omp_pragma("omp barrier"); \ - IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ - f_stride, x, offload, vflag, ov0, ov1, ov2, \ - ov3, ov4, ov5); \ - } \ +#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + _use_omp_pragma("omp barrier"); \ + IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \ + f_stride, x, offload, vflag, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ } -#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ - ov0, ov1, ov2, ov3, ov4, ov5) +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) #else @@ -649,164 +649,164 @@ inline double MIC_Wtime() { #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ nlocal, nall) -#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ - buffers, offload, fix, separate_flag, \ - x_size, q_size, ev_size, f_stride) \ +#define IP_PRE_get_transfern(ago, newton, eflag, vflag, \ + buffers, offload, fix, separate_flag, \ + x_size, q_size, ev_size, f_stride) \ { \ - separate_flag = 0; \ + separate_flag = 0; \ int f_length; \ if (newton) \ f_length = nall; \ else \ f_length = nlocal; \ - f_stride = buffers->get_stride(f_length); \ + f_stride = buffers->get_stride(f_length); \ } -#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ - ev_global) \ -{ \ - tc = comm->nthreads; \ - f_start = buffers->get_f(); \ - fix->start_watch(TIME_HOST_PAIR); \ - ev_global = 
buffers->get_ev_global_host(); \ +#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ + ev_global) \ +{ \ + tc = comm->nthreads; \ + f_start = buffers->get_f(); \ + fix->start_watch(TIME_HOST_PAIR); \ + ev_global = buffers->get_ev_global_host(); \ } -#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ - f_stride, x, q) - -#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ - f_start, f_stride, x, offload, vflag, \ - ov0, ov1, ov2, ov3, ov4, ov5) \ -{ \ - if (newton) { \ - if (vflag == 2 && nthreads > INTEL_HTHREADS) { \ - _use_omp_pragma("omp barrier"); \ - buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \ - ov3, ov4, ov5); \ - } \ - } \ +#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \ + f_stride, x, q) + +#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \ + f_start, f_stride, x, offload, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads > INTEL_HTHREADS) { \ + _use_omp_pragma("omp barrier"); \ + buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \ + ov3, ov4, ov5); \ + } \ + } \ } -#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ - ov0, ov1, ov2, ov3, ov4, ov5) \ -{ \ - if (newton) { \ - if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \ - int lt = nall * 4; \ - buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \ - ov2, ov3, ov4, ov5); \ - } \ - } \ +#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \ + ov0, ov1, ov2, ov3, ov4, ov5) \ +{ \ + if (newton) { \ + if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \ + int lt = nall * 4; \ + buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \ + ov2, ov3, ov4, ov5); \ + } \ + } \ } #endif -#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \ +#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \ { \ if (vflag == 1) { \ - sv0 += delx * delx * fpair; \ - sv1 += dely * dely * fpair; \ - sv2 += delz * delz * fpair; \ - sv3 += delx * dely * fpair; \ - sv4 += delx * delz * fpair; \ - sv5 += dely * delz * fpair; \ + sv0 += delx * delx * fpair; \ + sv1 += dely * dely * fpair; \ + sv2 += delz * delz * fpair; \ + sv3 += delx * dely * fpair; \ + sv4 += delx * delz * fpair; \ + sv5 += dely * delz * fpair; \ } \ } -#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \ +#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \ { \ if (vflag == 1) { \ - sv0 += dx * fpx; \ - sv1 += dy * fpy; \ - sv2 += dz * fpz; \ - sv3 += dx * fpy; \ - sv4 += dx * fpz; \ - sv5 += dy * fpz; \ + sv0 += dx * fpx; \ + sv1 += dy * fpy; \ + sv2 += dz * fpz; \ + sv3 += dx * fpy; \ + sv4 += dx * fpz; \ + sv5 += dy * fpz; \ } \ } -#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2) \ +#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2) \ { \ if (vflag == 1) { \ sv0 += delx * fj[0] + delr2[0] * fk[0]; \ - sv1 += dely * fj[1] + delr2[1] * fk[1]; \ - sv2 += delz * fj[2] + delr2[2] * fk[2]; \ - sv3 += delx * fj[1] + delr2[0] * fk[1]; \ - sv4 += delx * fj[2] + delr2[0] * fk[2]; \ - sv5 += dely * fj[2] + delr2[1] * fk[2]; \ + sv1 += dely * fj[1] + delr2[1] * fk[1]; \ + sv2 += delz * fj[2] + delr2[2] * fk[2]; \ + sv3 += delx * fj[1] + delr2[0] * fk[1]; \ + sv4 += delx * fj[2] + delr2[0] * fk[2]; \ + sv5 += dely * fj[2] + delr2[1] * fk[2]; \ } \ } #define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz) \ { \ if (vflag == 1) { \ - sv0 += delx * fj0; \ - sv1 += dely * fj1; \ - sv2 += 
delz * fj2; \ - sv3 += delx * fj1; \ - sv4 += delx * fj2; \ - sv5 += dely * fj2; \ + sv0 += delx * fj0; \ + sv1 += dely * fj1; \ + sv2 += delz * fj2; \ + sv3 += delx * fj1; \ + sv4 += delx * fj2; \ + sv5 += dely * fj2; \ } \ } #define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \ - fbond, delx, dely, delz, obond, force, \ - newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ - ov5) \ + fbond, delx, dely, delz, obond, force, \ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.5; \ - if (i2 < nlocal) ev_pre += (flt_t)0.5; \ - } \ - \ - if (eflag) { \ - obond += ev_pre * ebond; \ - if (eatom) { \ - flt_t halfeng = ebond * (flt_t)0.5; \ - if (newton || i1 < nlocal) f[i1].w += halfeng; \ - if (newton || i2 < nlocal) f[i2].w += halfeng; \ - } \ - } \ - \ - if (VFLAG && vflag) { \ - ov0 += ev_pre * (delx * delx * fbond); \ - ov1 += ev_pre * (dely * dely * fbond); \ - ov2 += ev_pre * (delz * delz * fbond); \ - ov3 += ev_pre * (delx * dely * fbond); \ - ov4 += ev_pre * (delx * delz * fbond); \ - ov5 += ev_pre * (dely * delz * fbond); \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.5; \ + if (i2 < nlocal) ev_pre += (flt_t)0.5; \ + } \ + \ + if (eflag) { \ + obond += ev_pre * ebond; \ + if (eatom) { \ + flt_t halfeng = ebond * (flt_t)0.5; \ + if (newton || i1 < nlocal) f[i1].w += halfeng; \ + if (newton || i2 < nlocal) f[i2].w += halfeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ + ov0 += ev_pre * (delx * delx * fbond); \ + ov1 += ev_pre * (dely * dely * fbond); \ + ov2 += ev_pre * (delz * delz * fbond); \ + ov3 += ev_pre * (delx * dely * fbond); \ + ov4 += ev_pre * (delx * delz * fbond); \ + ov5 += ev_pre * (dely * delz * fbond); \ } \ } #define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \ - i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \ - delx1, dely1, delz1, delx2, dely2, delz2, \ - oeangle, force, newton, nlocal, ov0, ov1, \ - ov2, ov3, ov4, ov5) \ + i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \ + delx1, dely1, delz1, delx2, dely2, delz2, \ + oeangle, force, newton, nlocal, ov0, ov1, \ + ov2, ov3, ov4, ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ - } \ - \ - if (eflag) { \ - oeangle += ev_pre * eangle; \ - if (eatom) { \ - flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \ - if (newton || i1 < nlocal) f[i1].w += thirdeng; \ - if (newton || i2 < nlocal) f[i2].w += thirdeng; \ - if (newton || i3 < nlocal) f[i3].w += thirdeng; \ - } \ - } \ - \ - if (VFLAG && vflag) { \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \ + } \ + \ + if (eflag) { \ + oeangle += ev_pre * eangle; \ + if (eatom) { \ + flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \ + if (newton || i1 < nlocal) f[i1].w += thirdeng; \ + if (newton || i2 < nlocal) f[i2].w += thirdeng; \ + if (newton || i3 < nlocal) f[i3].w += thirdeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \ ov1 += ev_pre * (dely1 * f1y + 
dely2 * f3y); \ ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \ @@ -817,74 +817,74 @@ inline double MIC_Wtime() { } #define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \ - i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\ - f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \ - vb2z, vb3x, vb3y, vb3z, oedihedral, force,\ - newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ - ov5) \ + i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\ + f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \ + vb2z, vb3x, vb3y, vb3z, oedihedral, force,\ + newton, nlocal, ov0, ov1, ov2, ov3, ov4, \ + ov5) \ { \ - flt_t ev_pre; \ - if (newton) ev_pre = (flt_t)1.0; \ - else { \ - ev_pre = (flt_t)0.0; \ - if (i1 < nlocal) ev_pre += (flt_t)0.25; \ - if (i2 < nlocal) ev_pre += (flt_t)0.25; \ - if (i3 < nlocal) ev_pre += (flt_t)0.25; \ - if (i4 < nlocal) ev_pre += (flt_t)0.25; \ - } \ - \ - if (eflag) { \ - oedihedral += ev_pre * deng; \ - if (eatom) { \ - flt_t qdeng = deng * (flt_t)0.25; \ - if (newton || i1 < nlocal) f[i1].w += qdeng; \ - if (newton || i2 < nlocal) f[i2].w += qdeng; \ - if (newton || i3 < nlocal) f[i3].w += qdeng; \ - if (newton || i4 < nlocal) f[i4].w += qdeng; \ - } \ - } \ - \ - if (VFLAG && vflag) { \ - ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ - ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ - ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ - ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ - ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ - ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ + flt_t ev_pre; \ + if (newton) ev_pre = (flt_t)1.0; \ + else { \ + ev_pre = (flt_t)0.0; \ + if (i1 < nlocal) ev_pre += (flt_t)0.25; \ + if (i2 < nlocal) ev_pre += (flt_t)0.25; \ + if (i3 < nlocal) ev_pre += (flt_t)0.25; \ + if (i4 < nlocal) ev_pre += (flt_t)0.25; \ + } \ + \ + if (eflag) { \ + oedihedral += ev_pre * deng; \ + if (eatom) { \ + flt_t qdeng = deng * (flt_t)0.25; \ + if (newton || i1 < nlocal) f[i1].w += qdeng; \ + if (newton || i2 < nlocal) f[i2].w += qdeng; \ + if (newton || i3 < nlocal) f[i3].w += qdeng; \ + if (newton || i4 < nlocal) f[i4].w += qdeng; \ + } \ + } \ + \ + if (VFLAG && vflag) { \ + ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \ + ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \ + ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \ + ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \ + ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \ + ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \ } \ } -#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \ -{ \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - } \ - if (newton == 0 && vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ +#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \ +{ \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ + } \ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ + } \ } -#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \ -{ \ - if (eflag) { \ - f[i].w += fwtmp; \ - oevdwl += sevdwl; \ - oecoul += secoul; \ - } \ - if (newton == 0 && vflag == 1) { \ - ov0 += sv0; \ - ov1 += sv1; \ - ov2 += sv2; \ - ov3 += sv3; \ - ov4 += sv4; \ - ov5 += sv5; \ - } \ +#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \ +{ \ + if (eflag) { \ + f[i].w += fwtmp; \ + oevdwl += sevdwl; \ + oecoul += secoul; \ + } 
\ + if (newton == 0 && vflag == 1) { \ + ov0 += sv0; \ + ov1 += sv1; \ + ov2 += sv2; \ + ov3 += sv3; \ + ov4 += sv4; \ + ov5 += sv5; \ + } \ } } diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h index aa03a6f136..4616f628e7 100644 --- a/src/USER-INTEL/intel_simd.h +++ b/src/USER-INTEL/intel_simd.h @@ -42,25 +42,25 @@ namespace ip_simd { struct SIMD_int { __m512i v; SIMD_int() {} - SIMD_int(const __m512i in) : v(in) {} + SIMD_int(const __m512i in) : v(in) {} operator __m512i() const { return v;} }; struct SIMD_float { __m512 v; SIMD_float() {} - SIMD_float(const __m512 in) : v(in) {} + SIMD_float(const __m512 in) : v(in) {} operator __m512() const { return v;} }; struct SIMD_double { __m512d v; SIMD_double() {} - SIMD_double(const __m512d in) : v(in) {} + SIMD_double(const __m512d in) : v(in) {} operator __m512d() const { return v;} }; - template<class flt_t> + template<class flt_t> class SIMD_type { }; @@ -92,20 +92,20 @@ namespace ip_simd { // ------- Set Operations - inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, - const int l3, const int l4, const int l5, - const int l6, const int l7, const int l8, - const int l9, const int l10, const int l11, - const int l12, const int l13, const int l14, - const int l15) { + inline SIMD_int SIMD_set(const int l0, const int l1, const int l2, + const int l3, const int l4, const int l5, + const int l6, const int l7, const int l8, + const int l9, const int l10, const int l11, + const int l12, const int l13, const int l14, + const int l15) { return _mm512_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7, - l8,l9,l10,l11,l12,l13,l14,l15); + l8,l9,l10,l11,l12,l13,l14,l15); } inline SIMD_int SIMD_set(const int l) { return _mm512_set1_epi32(l); } - + inline SIMD_float SIMD_set(const float l) { return _mm512_set1_ps(l); } @@ -113,28 +113,28 @@ namespace ip_simd { inline SIMD_double SIMD_set(const double l) { return _mm512_set1_pd(l); } - + inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) { return _mm512_maskz_mov_epi32(m, one); } - inline SIMD_float SIMD_zero_masked(const SIMD_mask &m, - const SIMD_float &one) { + inline SIMD_float SIMD_zero_masked(const SIMD_mask &m, + const SIMD_float &one) { return _mm512_maskz_mov_ps(m, one); } - inline SIMD_double SIMD_zero_masked(const SIMD_mask &m, - const SIMD_double &one) { + inline SIMD_double SIMD_zero_masked(const SIMD_mask &m, + const SIMD_double &one) { return _mm512_maskz_mov_pd(m, one); } - inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, - const SIMD_float &one) { + inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, + const SIMD_float &one) { return _mm512_mask_mov_ps(src,m,one); } - inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, - const SIMD_double &one) { + inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, + const SIMD_double &one) { return _mm512_mask_mov_pd(src,m,one); } @@ -147,11 +147,11 @@ namespace ip_simd { inline SIMD_float SIMD_load(const float *p) { return _mm512_load_ps(p); } - + inline SIMD_double SIMD_load(const double *p) { return _mm512_load_pd(p); } - + inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) { return _mm512_maskz_load_epi32(m, p); } @@ -159,7 +159,7 @@ namespace ip_simd { inline SIMD_float SIMD_loadz(const SIMD_mask &m, const float *p) { return _mm512_maskz_load_ps(m, p); } - + inline SIMD_double SIMD_loadz(const SIMD_mask &m, const double *p) { return _mm512_maskz_load_pd(m, p); } @@ -168,7 +168,7 @@ namespace ip_simd { return 
_mm512_i32gather_epi32(i, p, _MM_SCALE_4); } - inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) { + inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) { return _mm512_i32gather_ps(i, p, _MM_SCALE_4); } @@ -177,56 +177,56 @@ namespace ip_simd { } inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_float SIMD_gather(const SIMD_mask &m, const float *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } template <typename T> inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { } template <> inline SIMD_int SIMD_gatherz_offset<float>(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } template <> inline SIMD_int SIMD_gatherz_offset<double>(const SIMD_mask &m, const int *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p, - _MM_SCALE_4); + _MM_SCALE_4); } inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p, - const SIMD_int &i) { + const SIMD_int &i) { return _mm512_mask_i32logather_pd( _mm512_set1_pd(0.0), m, i, p, - _MM_SCALE_8); + _MM_SCALE_8); } // ------- Store Operations - + inline void SIMD_store(int *p, const SIMD_int &one) { return _mm512_store_epi32(p,one); } @@ -240,17 +240,17 @@ namespace ip_simd { } inline void SIMD_scatter(const SIMD_mask &m, int *p, - const SIMD_int &i, const SIMD_int &vec) { + const SIMD_int &i, const SIMD_int &vec) { _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4); } inline void SIMD_scatter(const SIMD_mask &m, float *p, - const SIMD_int &i, const SIMD_float &vec) { + const SIMD_int &i, const SIMD_float &vec) { _mm512_mask_i32scatter_ps(p, m, i, vec, _MM_SCALE_4); } inline void SIMD_scatter(const SIMD_mask &m, double *p, - const SIMD_int &i, const SIMD_double &vec) { + const SIMD_int &i, const SIMD_double &vec) { _mm512_mask_i32loscatter_pd(p, m, i, vec, _MM_SCALE_8); } @@ -263,76 +263,76 @@ namespace ip_simd { inline SIMD_float operator+(const SIMD_float &one, const SIMD_float &two) { return _mm512_add_ps(one,two); } - + inline SIMD_double operator+(const SIMD_double &one, const SIMD_double &two) { return _mm512_add_pd(one,two); } - + inline SIMD_int operator+(const SIMD_int &one, const int two) { return _mm512_add_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator+(const SIMD_float &one, const float two) { return _mm512_add_ps(one,SIMD_set(two)); } - + inline SIMD_double operator+(const SIMD_double &one, const double two) { return _mm512_add_pd(one,SIMD_set(two)); } inline SIMD_int SIMD_add(const SIMD_mask &m, - const SIMD_int &one, const int two) { + const SIMD_int &one, const int two) { return _mm512_mask_add_epi32(one,m,one,SIMD_set(two)); } inline SIMD_float 
SIMD_add(const SIMD_mask &m, - const SIMD_float &one, const float two) { + const SIMD_float &one, const float two) { return _mm512_mask_add_ps(one,m,one,SIMD_set(two)); } inline SIMD_double SIMD_add(const SIMD_mask &m, - const SIMD_double &one, const double two) { + const SIMD_double &one, const double two) { return _mm512_mask_add_pd(one,m,one,SIMD_set(two)); } inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m, - const SIMD_int &one, const SIMD_int &two) { + const SIMD_int &one, const SIMD_int &two) { return _mm512_mask_add_epi32(s,m,one,two); } inline SIMD_float SIMD_add(const SIMD_float &s, const SIMD_mask &m, - const SIMD_float &one, const SIMD_float &two) { + const SIMD_float &one, const SIMD_float &two) { return _mm512_mask_add_ps(s,m,one,two); } inline SIMD_double SIMD_add(const SIMD_double &s, const SIMD_mask &m, - const SIMD_double &one, const SIMD_double &two) { + const SIMD_double &one, const SIMD_double &two) { return _mm512_mask_add_pd(s,m,one,two); } inline SIMD_int SIMD_sub(const SIMD_int &s, const SIMD_mask &m, - const SIMD_int &one, const SIMD_int &two) { + const SIMD_int &one, const SIMD_int &two) { return _mm512_mask_sub_epi32(s,m,one,two); } inline SIMD_float SIMD_sub(const SIMD_float &s, const SIMD_mask &m, - const SIMD_float &one, const SIMD_float &two) { + const SIMD_float &one, const SIMD_float &two) { return _mm512_mask_sub_ps(s,m,one,two); } inline SIMD_double SIMD_sub(const SIMD_double &s, const SIMD_mask &m, - const SIMD_double &one, const SIMD_double &two) { + const SIMD_double &one, const SIMD_double &two) { return _mm512_mask_sub_pd(s,m,one,two); } inline SIMD_int operator-(const SIMD_int &one) { return _mm512_sub_epi32(SIMD_set((int)0),one); } - + inline SIMD_float operator-(const SIMD_float &one) { return _mm512_sub_ps(SIMD_set((float)0),one); } - + inline SIMD_double operator-(const SIMD_double &one) { return _mm512_sub_pd(SIMD_set((double)0),one); } @@ -340,80 +340,80 @@ namespace ip_simd { inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) { return _mm512_sub_epi32(one,two); } - + inline SIMD_float operator-(const SIMD_float &one, const SIMD_float &two) { return _mm512_sub_ps(one,two); } - + inline SIMD_double operator-(const SIMD_double &one, const SIMD_double &two) { return _mm512_sub_pd(one,two); } - + inline SIMD_int operator-(const SIMD_int &one, const int two) { return _mm512_sub_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator-(const SIMD_float &one, const float two) { return _mm512_sub_ps(one,SIMD_set(two)); } - + inline SIMD_double operator-(const SIMD_double &one, const double two) { return _mm512_sub_pd(one,SIMD_set(two)); } - + inline SIMD_int operator*(const SIMD_int &one, const SIMD_int &two) { return _mm512_mullo_epi32(one,two); } - + inline SIMD_float operator*(const SIMD_float &one, const SIMD_float &two) { return _mm512_mul_ps(one,two); } - + inline SIMD_double operator*(const SIMD_double &one, const SIMD_double &two) { return _mm512_mul_pd(one,two); } - + inline SIMD_int operator*(const SIMD_int &one, const int two) { return _mm512_mullo_epi32(one,SIMD_set(two)); } - + inline SIMD_float operator*(const SIMD_float &one, const float two) { return _mm512_mul_ps(one,SIMD_set(two)); } - + inline SIMD_double operator*(const SIMD_double &one, const double two) { return _mm512_mul_pd(one,SIMD_set(two)); } - + inline SIMD_float operator/(const SIMD_float &one, const SIMD_float &two) { return _mm512_div_ps(one,two); } - + inline SIMD_double operator/(const SIMD_double &one, const SIMD_double &two) { return 
_mm512_div_pd(one,two); } - + inline SIMD_float SIMD_fma(const SIMD_float &one, const SIMD_float &two, - const SIMD_float &three) { + const SIMD_float &three) { return _mm512_fmadd_ps(one,two,three); } inline SIMD_double SIMD_fma(const SIMD_double &one, const SIMD_double &two, - const SIMD_double &three) { + const SIMD_double &three) { return _mm512_fmadd_pd(one,two,three); } inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two, - const SIMD_float &three) { + const SIMD_float &three) { return _mm512_fmsub_ps(one,two,three); } inline SIMD_double SIMD_fms(const SIMD_double &one, const SIMD_double &two, - const SIMD_double &three) { + const SIMD_double &three) { return _mm512_fmsub_pd(one,two,three); } - - // ------- SVML operations + + // ------- SVML operations inline SIMD_float SIMD_rcp(const SIMD_float &one) { #ifdef __AVX512ER__ @@ -489,33 +489,33 @@ namespace ip_simd { // ------- Comparison operations - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, - const SIMD_int &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one, + const SIMD_int &two) { return _mm512_mask_cmplt_epi32_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one, - const SIMD_float &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one, + const SIMD_float &two) { return _mm512_mask_cmplt_ps_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one, - const SIMD_double &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one, + const SIMD_double &two) { return _mm512_mask_cmplt_pd_mask(m, one, two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const int one, - const SIMD_int &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const int one, + const SIMD_int &two) { return _mm512_mask_cmplt_epi32_mask(m, SIMD_set(one), two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const float one, - const SIMD_float &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const float one, + const SIMD_float &two) { return _mm512_mask_cmplt_ps_mask(m, SIMD_set(one), two); } - inline SIMD_mask SIMD_lt(SIMD_mask m, const double one, - const SIMD_double &two) { + inline SIMD_mask SIMD_lt(SIMD_mask m, const double one, + const SIMD_double &two) { return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two); } @@ -629,112 +629,112 @@ namespace ip_simd { // i indices should be positive inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, - SIMD_float &v1) { + SIMD_float &v1) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); - + _mm512_lzcnt_epi32(cd)); + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_float am_perm; - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v1); - v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_float am_perm; + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v1); + v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); + todo_mask = 
_mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i, - SIMD_double &v1) { + SIMD_double &v1) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); + _mm512_lzcnt_epi32(cd)); lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid)); - + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_double am_perm; - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v1); - v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_double am_perm; + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v1); + v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i, - SIMD_float &v1, SIMD_float &v2, - SIMD_float &v3) { + SIMD_float &v1, SIMD_float &v2, + SIMD_float &v3) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); - + _mm512_lzcnt_epi32(cd)); + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_float am_perm; - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v1); - v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v2); - v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm); - am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), - now_mask, lid, v3); - v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_float am_perm; + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v1); + v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm); + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v2); + v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm); + am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(), + now_mask, lid, v3); + v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } // i indices should be positive inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i, - SIMD_double &v1, SIMD_double &v2, - SIMD_double &v3) { + SIMD_double &v1, SIMD_double &v2, + SIMD_double &v3) { SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i); SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc); SIMD_mask todo_mask = 
_mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1)); if (todo_mask) { SIMD_int lz = _mm512_lzcnt_epi32(cd); SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), - _mm512_lzcnt_epi32(cd)); + _mm512_lzcnt_epi32(cd)); lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid)); - + while(todo_mask) { - SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); - SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, - todo_bcast); - SIMD_double am_perm; - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v1); - v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v2); - v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm); - am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), - now_mask, lid, v3); - v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm); - todo_mask = _mm512_kxor(todo_mask, now_mask); + SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask); + SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, + todo_bcast); + SIMD_double am_perm; + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v1); + v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm); + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v2); + v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm); + am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(), + now_mask, lid, v3); + v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm); + todo_mask = _mm512_kxor(todo_mask, now_mask); } } } @@ -744,7 +744,7 @@ namespace ip_simd { inline SIMD_int operator&(const SIMD_int &one, const SIMD_int &two) { return _mm512_and_epi32(one,two); } - + inline SIMD_int operator>>(const SIMD_int &one, const SIMD_int &two) { return _mm512_srlv_epi32(one,two); } @@ -752,21 +752,21 @@ namespace ip_simd { inline SIMD_int operator<<(const SIMD_int &one, const unsigned two) { return _mm512_slli_epi32(one,two); } - + // -------- I/O operations inline void SIMD_print(const __m512i &vec) { - for (int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) printf("%d ",(*((int*)&(vec) + (i)))); } inline void SIMD_print(const __m512 &vec) { - for (int i = 0; i < 16; i++) + for (int i = 0; i < 16; i++) printf("%f ",(*((float*)&(vec) + (i)))); } inline void SIMD_print(const __m512d &vec) { - for (int i = 0; i < 8; i++) + for (int i = 0; i < 8; i++) printf("%f ",(*((double*)&(vec) + (i)))); } @@ -801,280 +801,280 @@ namespace ip_simd { // ---------- LAMMPS operations #ifndef SW_GATHER_TEST inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z) { - x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, - _MM_SCALE_1); + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z) { + x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, + _MM_SCALE_1); y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1, - _MM_SCALE_1); + _MM_SCALE_1); z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2, - _MM_SCALE_1); + _MM_SCALE_1); } inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z, SIMD_int &type) { - x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, - _MM_SCALE_1); + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z, SIMD_int &type) { + x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom, + _MM_SCALE_1); y = 
_mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1, - _MM_SCALE_1); + _MM_SCALE_1); z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2, - _MM_SCALE_1); + _MM_SCALE_1); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, - _MM_SCALE_1); + _MM_SCALE_1); } #endif inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, - const SIMD_int &i, SIMD_double &x, - SIMD_double &y, SIMD_double &z) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); + const SIMD_int &i, SIMD_double &x, + SIMD_double &y, SIMD_double &z) { + x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, + _MM_SCALE_2); y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); + _MM_SCALE_2); z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + _MM_SCALE_2); } inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom, - const SIMD_int &i, SIMD_double &x, - SIMD_double &y, SIMD_double &z, SIMD_int &type) { - x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, - _MM_SCALE_2); + const SIMD_int &i, SIMD_double &x, + SIMD_double &y, SIMD_double &z, SIMD_int &type) { + x = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom, + _MM_SCALE_2); y = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+1, - _MM_SCALE_2); + _MM_SCALE_2); z = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, atom+2, - _MM_SCALE_2); + _MM_SCALE_2); type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3, - _MM_SCALE_2); + _MM_SCALE_2); } - inline SIMD_float SIMD_ev_add(const SIMD_float &one, - const SIMD_float &two) { + inline SIMD_float SIMD_ev_add(const SIMD_float &one, + const SIMD_float &two) { return _mm512_add_ps(one,two); } - inline SIMD_double SIMD_ev_add(const SIMD_double &one, - const SIMD_double &two) { + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_double &two) { return _mm512_add_pd(one,two); } - inline SIMD_double SIMD_ev_add(const SIMD_double &one, - const SIMD_float &two) { + inline SIMD_double SIMD_ev_add(const SIMD_double &one, + const SIMD_float &two) { SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two)); SIMD_double ans = _mm512_add_pd(one,twod); twod = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(two,two,238))); + _mm512_shuffle_f32x4(two,two,238))); return _mm512_add_pd(ans,twod); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, - const SIMD_int &joffset, SIMD_float &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, + const SIMD_int &joffset, SIMD_float &eng) { SIMD_float jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, - force, _MM_SCALE_1); + jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, + force, _MM_SCALE_1); jeng = jeng + eng; _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_double &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_double &eng) { SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + eng; 
_mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); } - inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_float &eng) { + inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &eng) { SIMD_double engd, jeng; engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng)); SIMD_conflict_pi_reduce1(rmask, joffset, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + engd; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; engd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(eng,eng,238))); + _mm512_shuffle_f32x4(eng,eng,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce1(rmask2, joffset2, engd); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); jeng = jeng + engd; _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2); } - inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, - const SIMD_int &joffset1, SIMD_float &eng) { + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, + const SIMD_int &joffset1, SIMD_float &eng) { } - inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, - const SIMD_int &joffset1, SIMD_double &eng) { + inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, + const SIMD_int &joffset1, SIMD_double &eng) { SIMD_mask rmask = mask >> 8; SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238); SIMD_double jeng; SIMD_conflict_pi_reduce1(rmask, joffset, eng); - jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jeng = jeng + eng; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2); } inline void SIMD_safe_jforce(const SIMD_mask &m, float *force, - const SIMD_int &i, SIMD_float &fx, - SIMD_float &fy, SIMD_float &fz) { + const SIMD_int &i, SIMD_float &fx, + SIMD_float &fy, SIMD_float &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_float jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc + fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc + fy; _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); } inline void SIMD_safe_jforce(const SIMD_mask &m, double *force, - const SIMD_int &i, SIMD_double &fx, - SIMD_double &fy, SIMD_double &fz) { + const SIMD_int &i, SIMD_double &fx, + SIMD_double &fy, SIMD_double &fz) { SIMD_conflict_pi_reduce3(m, i, fx, fy, fz); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = 
_mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc + fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); } - inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, - const SIMD_int &joffset, SIMD_float &amx, - SIMD_float &amy, SIMD_float &amz) { + inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, + const SIMD_int &joffset, SIMD_float &amx, + SIMD_float &amy, SIMD_float &amz) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy)); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz)); SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd); SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force, _MM_SCALE_2); jfrc = jfrc + amxd; _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 1, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, - force + 2, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2); SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amz,amz,238))); SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force, _MM_SCALE_2); jfrc = jfrc + amxd; _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 1, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 1, _MM_SCALE_2); jfrc = jfrc + amyd; _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, - force + 2, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, + force + 2, _MM_SCALE_2); jfrc = jfrc + amzd; _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2); } inline void SIMD_jforce_update(const SIMD_mask &m, float *force, - const SIMD_int &i, const SIMD_float &fx, - const SIMD_float &fy, const 
SIMD_float &fz) { + const SIMD_int &i, const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { SIMD_float jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc - fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc - fy; _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc - fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); } template <class ft> inline void SIMD_scalar_update(const int jj, const int* ejnum, ft *force, - const int* i, const double *fx, - const double *fy, const double *fz, - const double *fx2, const double *fy2, - const double *fz2) { + const int* i, const double *fx, + const double *fy, const double *fz, + const double *fx2, const double *fy2, + const double *fz2) { #pragma novector for (int k=0; k<8; k++) { if (jj < ejnum[k]) { - const int j = i[k]; - force[j].x -= fx[k]; - force[j].y -= fy[k]; - force[j].z -= fz[k]; + const int j = i[k]; + force[j].x -= fx[k]; + force[j].y -= fy[k]; + force[j].z -= fz[k]; } } - + #pragma novector for (int k=8; k<16; k++) { if (jj < ejnum[k]) { - const int j = i[k]; - force[j].x -= fx2[k-8]; - force[j].y -= fy2[k-8]; - force[j].z -= fz2[k-8]; + const int j = i[k]; + force[j].x -= fx2[k-8]; + force[j].y -= fy2[k-8]; + force[j].z -= fz2[k-8]; } } } inline void SIMD_jforce_update(const SIMD_mask &m, double *force, - const SIMD_int &i, const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz) { + const SIMD_int &i, const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc - fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc - fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc - fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); } - inline void SIMD_jforce_update(const SIMD_mask &rmask, + inline void SIMD_jforce_update(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_float &amx, - SIMD_float &amy, SIMD_float &amz) { + SIMD_float &amy, SIMD_float &amz) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy)); @@ -1084,7 +1084,7 @@ namespace ip_simd { SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); amzd = _mm512_cvtps_pd(_mm512_castps512_ps256( @@ -1095,8 +1095,8 @@ namespace ip_simd { } inline void SIMD_cache3(float *pr, const int offset, - const SIMD_float &fx, - const SIMD_float &fy, const 
SIMD_float &fz) { + const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { float *p = pr; SIMD_float t; t = SIMD_load(p); @@ -1113,8 +1113,8 @@ namespace ip_simd { } inline void SIMD_cache3(double *pr, const int offset, - const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz) { + const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz) { double *p = pr; SIMD_double t; t = SIMD_load(p); @@ -1131,8 +1131,8 @@ namespace ip_simd { } inline void SIMD_cache3(double *pr, const int foffset, - const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz) { + const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz) { const int offset = foffset >> 1; double *p = pr; SIMD_double t, fd; @@ -1142,7 +1142,7 @@ namespace ip_simd { t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fx,fx,238))); + _mm512_shuffle_f32x4(fx,fx,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1154,7 +1154,7 @@ namespace ip_simd { t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fy,fy,238))); + _mm512_shuffle_f32x4(fy,fy,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1166,7 +1166,7 @@ namespace ip_simd { t = t + fd; SIMD_store(p,t); fd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fz,fz,238))); + _mm512_shuffle_f32x4(fz,fz,238))); p = p + offset; t = SIMD_load(p); t = t + fd; @@ -1174,15 +1174,15 @@ namespace ip_simd { } inline void SIMD_cache3(float *pr, const int offset, - const SIMD_float &fx, const SIMD_float &fy, - const SIMD_float &fz, const SIMD_float &fx2, - const SIMD_float &fy2, const SIMD_float &fz2) { + const SIMD_float &fx, const SIMD_float &fy, + const SIMD_float &fz, const SIMD_float &fx2, + const SIMD_float &fy2, const SIMD_float &fz2) { } inline void SIMD_cache3(double *pr, const int foffset, - const SIMD_double &fx, const SIMD_double &fy, - const SIMD_double &fz, const SIMD_double &fx2, - const SIMD_double &fy2, const SIMD_double &fz2) { + const SIMD_double &fx, const SIMD_double &fy, + const SIMD_double &fz, const SIMD_double &fx2, + const SIMD_double &fy2, const SIMD_double &fz2) { const int offset = foffset >> 1; double *p = pr; SIMD_double t; @@ -1214,14 +1214,14 @@ namespace ip_simd { SIMD_store(p,t); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - SIMD_float &fxtmp, SIMD_float &fytmp, - SIMD_float &fztmp, SIMD_float &fjxtmp, - SIMD_float &fjytmp, SIMD_float &fjztmp, - SIMD_float &fxtmp2, SIMD_float &fytmp2, - SIMD_float &fztmp2, SIMD_float &fjxtmp2, - SIMD_float &fjytmp2, SIMD_float &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + SIMD_float &fxtmp, SIMD_float &fytmp, + SIMD_float &fztmp, SIMD_float &fjxtmp, + SIMD_float &fjytmp, SIMD_float &fjztmp, + SIMD_float &fxtmp2, SIMD_float &fytmp2, + SIMD_float &fztmp2, SIMD_float &fjxtmp2, + SIMD_float &fjytmp2, SIMD_float &fjztmp2) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy); @@ -1230,14 +1230,14 @@ namespace ip_simd { fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx, - const SIMD_double &fjy, const SIMD_double &fjz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, 
SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx, + const SIMD_double &fjy, const SIMD_double &fjz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy); @@ -1246,20 +1246,20 @@ namespace ip_simd { fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz); } - inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2) { + inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2) { SIMD_mask kmask2 = kmask >> 8; SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx)); fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjx,fjx,238))); + _mm512_shuffle_f32x4(fjx,fjx,238))); fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd); fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd); @@ -1267,7 +1267,7 @@ namespace ip_simd { fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd); fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjy,fjy,238))); + _mm512_shuffle_f32x4(fjy,fjy,238))); fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd); fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd); @@ -1275,22 +1275,22 @@ namespace ip_simd { fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd); fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjz,fjz,238))); + _mm512_shuffle_f32x4(fjz,fjz,238))); fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd); fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - const SIMD_float &fkx, const SIMD_float &fky, - const SIMD_float &fkz, - SIMD_float &fxtmp, SIMD_float &fytmp, - SIMD_float &fztmp, SIMD_float &fjxtmp, - SIMD_float &fjytmp, SIMD_float &fjztmp, - SIMD_float &fxtmp2, SIMD_float &fytmp2, - SIMD_float &fztmp2, SIMD_float &fjxtmp2, - SIMD_float &fjytmp2, SIMD_float &fjztmp2, - float *pr, const int offset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + const SIMD_float &fkx, const SIMD_float &fky, + const SIMD_float &fkz, + SIMD_float &fxtmp, SIMD_float &fytmp, + SIMD_float &fztmp, SIMD_float &fjxtmp, + SIMD_float &fjytmp, SIMD_float &fjztmp, + SIMD_float &fxtmp2, 
SIMD_float &fytmp2, + SIMD_float &fztmp2, SIMD_float &fjxtmp2, + SIMD_float &fjytmp2, SIMD_float &fjztmp2, + float *pr, const int offset) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky); @@ -1312,17 +1312,17 @@ namespace ip_simd { SIMD_store(p, t); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx, - const SIMD_double &fjy, const SIMD_double &fjz, - const SIMD_double &fkx, const SIMD_double &fky, - const SIMD_double &fkz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2, - double *pr, const int offset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx, + const SIMD_double &fjy, const SIMD_double &fjz, + const SIMD_double &fkx, const SIMD_double &fky, + const SIMD_double &fkz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2, + double *pr, const int offset) { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx); fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky); @@ -1344,17 +1344,17 @@ namespace ip_simd { SIMD_store(p, t); } - inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, - const SIMD_float &fjy, const SIMD_float &fjz, - const SIMD_float &fkx, const SIMD_float &fky, - const SIMD_float &fkz, - SIMD_double &fxtmp, SIMD_double &fytmp, - SIMD_double &fztmp, SIMD_double &fjxtmp, - SIMD_double &fjytmp, SIMD_double &fjztmp, - SIMD_double &fxtmp2, SIMD_double &fytmp2, - SIMD_double &fztmp2, SIMD_double &fjxtmp2, - SIMD_double &fjytmp2, SIMD_double &fjztmp2, - double *pr, const int foffset) { + inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx, + const SIMD_float &fjy, const SIMD_float &fjz, + const SIMD_float &fkx, const SIMD_float &fky, + const SIMD_float &fkz, + SIMD_double &fxtmp, SIMD_double &fytmp, + SIMD_double &fztmp, SIMD_double &fjxtmp, + SIMD_double &fjytmp, SIMD_double &fjztmp, + SIMD_double &fxtmp2, SIMD_double &fytmp2, + SIMD_double &fztmp2, SIMD_double &fjxtmp2, + SIMD_double &fjytmp2, SIMD_double &fjztmp2, + double *pr, const int foffset) { SIMD_mask kmask2 = kmask >> 8; const int offset = foffset >> 1; double *p = pr; @@ -1368,9 +1368,9 @@ namespace ip_simd { fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd - delfdk); fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjx,fjx,238))); + _mm512_shuffle_f32x4(fjx,fjx,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fkx,fkx,238))); + _mm512_shuffle_f32x4(fkx,fkx,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1387,9 +1387,9 @@ namespace ip_simd { fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd - delfdk); fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjy,fjy,238))); + _mm512_shuffle_f32x4(fjy,fjy,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fky,fky,238))); + _mm512_shuffle_f32x4(fky,fky,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1406,9 
+1406,9 @@ namespace ip_simd { fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd - delfdk); fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd); delfd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fjz,fjz,238))); + _mm512_shuffle_f32x4(fjz,fjz,238))); delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(fkz,fkz,238))); + _mm512_shuffle_f32x4(fkz,fkz,238))); p = p + offset; t = SIMD_load(p); t = t + delfdk; @@ -1417,11 +1417,11 @@ namespace ip_simd { fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd); } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_float &evdwl, const int eatom, - SIMD_float &sevdwl, SIMD_float &fwtmp, - SIMD_float &fjtmp, SIMD_float &fwtmp2, - SIMD_float &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_float &evdwl, const int eatom, + SIMD_float &sevdwl, SIMD_float &fwtmp, + SIMD_float &fjtmp, SIMD_float &fwtmp2, + SIMD_float &fjtmp2) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl); if (eatom) { const SIMD_float hevdwl = evdwl * (float)0.5; @@ -1430,11 +1430,11 @@ namespace ip_simd { } } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_double &evdwl, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_double &evdwl, const int eatom, + SIMD_double &sevdwl, SIMD_double &fwtmp, + SIMD_double &fjtmp, SIMD_double &fwtmp2, + SIMD_double &fjtmp2) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl); if (eatom) { const SIMD_double hevdwl = evdwl * (double)0.5; @@ -1443,11 +1443,11 @@ namespace ip_simd { } } - inline void SIMD_acc_energy3(const SIMD_mask &hmask, - const SIMD_float &evdwl, const int eatom, - SIMD_double &sevdwl, SIMD_double &fwtmp, - SIMD_double &fjtmp, SIMD_double &fwtmp2, - SIMD_double &fjtmp2) { + inline void SIMD_acc_energy3(const SIMD_mask &hmask, + const SIMD_float &evdwl, const int eatom, + SIMD_double &sevdwl, SIMD_double &fwtmp, + SIMD_double &fjtmp, SIMD_double &fwtmp2, + SIMD_double &fjtmp2) { SIMD_double evdwld; evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(evdwl)); sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwld); @@ -1458,7 +1458,7 @@ namespace ip_simd { } SIMD_mask hmask2 = hmask >> 8; evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(evdwl,evdwl,238))); + _mm512_shuffle_f32x4(evdwl,evdwl,238))); sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, evdwld); if (eatom) { const SIMD_double hevdwl = evdwld * (double)0.5; @@ -1467,48 +1467,48 @@ namespace ip_simd { } } - inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, - const int eatom, SIMD_float &sevdwl, - SIMD_float &fwtmp, SIMD_float &fjtmp, - SIMD_float &fwtmp2, SIMD_float &fjtmp2, - const SIMD_int &k, float *force) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_float &sevdwl, + SIMD_float &fwtmp, SIMD_float &fjtmp, + SIMD_float &fwtmp2, SIMD_float &fjtmp2, + const SIMD_int &k, float *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, - k, force + 3, _MM_SCALE_1); + SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, + k, force + 3, _MM_SCALE_1); keng = 
keng + hevdwl; _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1); } } inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad, - const int eatom, SIMD_double &sevdwl, - SIMD_double &fwtmp, SIMD_double &fjtmp, - SIMD_double &fwtmp2, SIMD_double &fjtmp2, - const SIMD_int &k, double *force) { + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad); if (eatom) { SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333); fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } } - inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, - const int eatom, SIMD_double &sevdwl, - SIMD_double &fwtmp, SIMD_double &fjtmp, - SIMD_double &fwtmp2, SIMD_double &fjtmp2, - const SIMD_int &k, double *force) { + inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, + const int eatom, SIMD_double &sevdwl, + SIMD_double &fwtmp, SIMD_double &fjtmp, + SIMD_double &fwtmp2, SIMD_double &fjtmp2, + const SIMD_int &k, double *force) { SIMD_double facradd; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad)); sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd); @@ -1517,15 +1517,15 @@ namespace ip_simd { fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl); fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl); SIMD_conflict_pi_reduce1(hmask, k, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask, k, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask, k, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2); } SIMD_mask hmask2 = hmask >> 8; facradd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(facrad,facrad,238))); + _mm512_shuffle_f32x4(facrad,facrad,238))); sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, facradd); if (eatom) { SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333); @@ -1533,20 +1533,20 @@ namespace ip_simd { fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl); SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238); SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl); - SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), - hmask2, k2, force + 3, - _MM_SCALE_2); + SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), + hmask2, k2, force + 3, + _MM_SCALE_2); keng = keng + hevdwl; _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2); } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const float ev_pre, - const SIMD_float &fpair, const SIMD_float &delx, - const SIMD_float &dely, const SIMD_float &delz, - SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, - SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const float ev_pre, + const SIMD_float &fpair, const SIMD_float &delx, + const SIMD_float &dely, const SIMD_float &delz, + SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, + SIMD_float &sv3, 
SIMD_float &sv4, SIMD_float &sv5) { if (vflag == 1) { const SIMD_float prefpair = SIMD_set(ev_pre) * fpair; sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair); @@ -1558,12 +1558,12 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const double ev_pre, - const SIMD_double &fpair, const SIMD_double &delx, - const SIMD_double &dely, const SIMD_double &delz, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const double ev_pre, + const SIMD_double &fpair, const SIMD_double &delx, + const SIMD_double &dely, const SIMD_double &delz, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_double prefpair = SIMD_set(ev_pre) * fpair; sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair); @@ -1575,12 +1575,12 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, - const float ev_pre, - const SIMD_float &fpair, const SIMD_float &delx, - const SIMD_float &dely, const SIMD_float &delz, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag, + const float ev_pre, + const SIMD_float &fpair, const SIMD_float &delx, + const SIMD_float &dely, const SIMD_float &delz, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_mask m2 = m >> 8; const SIMD_float prefpair = SIMD_set(ev_pre) * fpair; @@ -1588,55 +1588,55 @@ namespace ip_simd { SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv0 = SIMD_add(sv0, m, sv0, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv0 = SIMD_add(sv0, m2, sv0, dpaird); dpair = dely * dely * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv1 = SIMD_add(sv1, m, sv1, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv1 = SIMD_add(sv1, m2, sv1, dpaird); dpair = delz * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv2 = SIMD_add(sv2, m, sv2, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv2 = SIMD_add(sv2, m2, sv2, dpaird); dpair = delx * dely * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv3 = SIMD_add(sv3, m, sv3, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv3 = SIMD_add(sv3, m2, sv3, dpaird); dpair = delx * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv4 = SIMD_add(sv4, m, sv4, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv4 = SIMD_add(sv4, m2, sv4, dpaird); dpair = dely * delz * prefpair; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv5 = SIMD_add(sv5, m, sv5, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv5 = SIMD_add(sv5, m2, sv5, dpaird); } } - inline 
void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_float &fj0, const SIMD_float &fj1, - const SIMD_float &fj2, const SIMD_float &fk0, - const SIMD_float &fk1, const SIMD_float &fk2, + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_float &fj0, const SIMD_float &fj1, + const SIMD_float &fj2, const SIMD_float &fk0, + const SIMD_float &fk1, const SIMD_float &fk2, const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz, const SIMD_float &delr2x, const SIMD_float &delr2y, const SIMD_float &delr2z, - SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, - SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { + SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2, + SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) { if (vflag == 1) { sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0); sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1); @@ -1647,15 +1647,15 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_double &fj0, const SIMD_double &fj1, - const SIMD_double &fj2, const SIMD_double &fk0, - const SIMD_double &fk1, const SIMD_double &fk2, - const SIMD_double &delx, const SIMD_double &dely, - const SIMD_double &delz, const SIMD_double &delr2x, - const SIMD_double &delr2y, const SIMD_double &delr2z, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_double &fj0, const SIMD_double &fj1, + const SIMD_double &fj2, const SIMD_double &fk0, + const SIMD_double &fk1, const SIMD_double &fk2, + const SIMD_double &delx, const SIMD_double &dely, + const SIMD_double &delz, const SIMD_double &delr2x, + const SIMD_double &delr2y, const SIMD_double &delr2z, + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0); sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1); @@ -1666,62 +1666,62 @@ namespace ip_simd { } } - inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, - const SIMD_float &fj0, const SIMD_float &fj1, - const SIMD_float &fj2, const SIMD_float &fk0, - const SIMD_float &fk1, const SIMD_float &fk2, + inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag, + const SIMD_float &fj0, const SIMD_float &fj1, + const SIMD_float &fj2, const SIMD_float &fk0, + const SIMD_float &fk1, const SIMD_float &fk2, const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz, const SIMD_float &delr2x, const SIMD_float &delr2y, const SIMD_float &delr2z, - SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, - SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { + SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2, + SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) { if (vflag == 1) { const SIMD_mask m2 = m >> 8; SIMD_float dpair = delx * fj0 + delr2x * fk0; SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv0 = SIMD_add(sv0, m, sv0, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv0 = SIMD_add(sv0, m2, sv0, dpaird); dpair = dely * fj1 + delr2y * fk1; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv1 = SIMD_add(sv1, m, sv1, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + 
_mm512_shuffle_f32x4(dpair,dpair,238))); sv1 = SIMD_add(sv1, m2, sv1, dpaird); dpair = delz * fj2 + delr2z * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv2 = SIMD_add(sv2, m, sv2, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv2 = SIMD_add(sv2, m2, sv2, dpaird); dpair = delx * fj1 + delr2x * fk1; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv3 = SIMD_add(sv3, m, sv3, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv3 = SIMD_add(sv3, m2, sv3, dpaird); dpair = delx * fj2 + delr2x * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv4 = SIMD_add(sv4, m, sv4, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv4 = SIMD_add(sv4, m2, sv4, dpaird); dpair = dely * fj2 + delr2y * fk2; dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair)); sv5 = SIMD_add(sv5, m, sv5, dpaird); dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(dpair,dpair,238))); + _mm512_shuffle_f32x4(dpair,dpair,238))); sv5 = SIMD_add(sv5, m2, sv5, dpaird); } } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, float *force, const SIMD_int &joffset, SIMD_float &amx, SIMD_float &amy, SIMD_float &amz, SIMD_float &fxtmp, SIMD_float &fytmp, SIMD_float &fztmp, SIMD_float &fxtmp2, @@ -1733,10 +1733,10 @@ namespace ip_simd { SIMD_jforce_update(rmask, force, joffset, amx, amy, amz); } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_double &amx, SIMD_double &amy, SIMD_double &amz, SIMD_double &fxtmp, - SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, + SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, SIMD_double &fytmp2, SIMD_double &fztmp2) { fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx); fytmp = SIMD_add(fytmp, rmask, fytmp, amy); @@ -1745,10 +1745,10 @@ namespace ip_simd { SIMD_jforce_update(rmask, force, joffset, amx, amy, amz); } - inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, + inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask, double *force, const SIMD_int &joffset, SIMD_float &amx, SIMD_float &amy, SIMD_float &amz, SIMD_double &fxtmp, - SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, + SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2, SIMD_double &fytmp2, SIMD_double &fztmp2) { SIMD_double amxd, amyd, amzd; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx)); @@ -1762,7 +1762,7 @@ namespace ip_simd { SIMD_mask rmask2 = rmask >> 8; amxd = _mm512_cvtps_pd(_mm512_castps512_ps256( - _mm512_shuffle_f32x4(amx,amx,238))); + _mm512_shuffle_f32x4(amx,amx,238))); fxtmp2 = SIMD_add(fxtmp2, rmask2, fxtmp2, amxd); amyd = _mm512_cvtps_pd(_mm512_castps512_ps256( _mm512_shuffle_f32x4(amy,amy,238))); @@ -1776,57 +1776,57 @@ namespace ip_simd { } inline void SIMD_iforce_update(const SIMD_mask &m, float *force, - const SIMD_int &i, const SIMD_float &fx, - const SIMD_float &fy, const SIMD_float &fz, - const int EFLAG, const int eatom, - const SIMD_float &fwtmp) { + const SIMD_int &i, const SIMD_float &fx, + const SIMD_float &fy, const SIMD_float &fz, + const int EFLAG, const int eatom, + const SIMD_float &fwtmp) { SIMD_float 
jfrc; - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, + _MM_SCALE_1); jfrc = jfrc + fx; _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1); - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, - _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, + _MM_SCALE_1); jfrc = jfrc + fy; _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1); jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2, - _MM_SCALE_1); + _MM_SCALE_1); jfrc = jfrc + fz; _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1); if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, - _MM_SCALE_1); - jfrc = jfrc + fwtmp; - _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1); + jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3, + _MM_SCALE_1); + jfrc = jfrc + fwtmp; + _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1); } } } inline void SIMD_iforce_update(const SIMD_mask &m, double *force, - const SIMD_int &i, const SIMD_double &fx, - const SIMD_double &fy, const SIMD_double &fz, - const int EFLAG, const int eatom, - const SIMD_double &fwtmp) { + const SIMD_int &i, const SIMD_double &fx, + const SIMD_double &fy, const SIMD_double &fz, + const int EFLAG, const int eatom, + const SIMD_double &fwtmp) { SIMD_double jfrc; - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, + _MM_SCALE_2); jfrc = jfrc + fx; _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2); - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, - _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, + _MM_SCALE_2); jfrc = jfrc + fy; _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2); jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2, - _MM_SCALE_2); + _MM_SCALE_2); jfrc = jfrc + fz; _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2); if (EFLAG) { if (eatom) { - jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, - force + 3, _MM_SCALE_2); - jfrc = jfrc + fwtmp; - _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); + jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, + force + 3, _MM_SCALE_2); + jfrc = jfrc + fwtmp; + _mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2); } } } @@ -1834,8 +1834,8 @@ namespace ip_simd { #ifdef SW_GATHER_TEST template <class atom_t> inline void SIMD_atom_gather(const SIMD_mask &m, const atom_t *atom, - const SIMD_int &i, SIMD_float &x, SIMD_float &y, - SIMD_float &z, SIMD_int &type) { + const SIMD_int &i, SIMD_float &x, SIMD_float &y, + SIMD_float &z, SIMD_int &type) { int jv_scalar[16] __attribute__((aligned(64))); int jm_scalar[16] __attribute__((aligned(64))); _mm512_store_epi32(jv_scalar, i); @@ -1846,65 +1846,65 @@ namespace ip_simd { pl1 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[1]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[2]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[3]; pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[4]; pl2 = _mm512_loadu_ps((float *)((char *)atom + js)); js = 
jv_scalar[5]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[6]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[7]; pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[8]; pl3 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[9]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[10]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[11]; pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + js = jv_scalar[12]; pl4 = _mm512_loadu_ps((float *)((char *)atom + js)); js = jv_scalar[13]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 1); + js)), 1); js = jv_scalar[14]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 2); + js)), 2); js = jv_scalar[15]; pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + - js)), 3); - + js)), 3); + SIMD_int c0 = _mm512_setr_epi32(0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c, - 0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d); + 0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d); SIMD_int c1 = _mm512_setr_epi32(0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d, - 0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c); + 0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c); SIMD_int c2 = _mm512_setr_epi32(0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e, - 0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f); + 0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f); SIMD_int c3 = _mm512_setr_epi32(0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f, - 0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e); + 0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e); SIMD_mask k_1 = _mm512_int2mask(65280); SIMD_float sl1 = _mm512_permutex2var_ps(pl3, c0, pl4); SIMD_float sl2 = _mm512_permutex2var_ps(pl1, c1, pl2); SIMD_float sl3 = _mm512_permutex2var_ps(pl3, c2, pl4); SIMD_float sl4 = _mm512_permutex2var_ps(pl1, c3, pl2); - + x = _mm512_shuffle_f32x4(sl2, sl1, 78); z = _mm512_shuffle_f32x4(sl4, sl3, 78); y = _mm512_mask_blend_ps(k_1, sl2, sl1); diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h index 403b74d8fe..547fadb6e9 100644 --- a/src/USER-INTEL/math_extra_intel.h +++ b/src/USER-INTEL/math_extra_intel.h @@ -18,110 +18,110 @@ #ifndef LMP_MATH_EXTRA_INTEL_H #define LMP_MATH_EXTRA_INTEL_H -#define ME_quat_to_mat_trans(quat, mat) \ -{ \ - flt_t quat_w = quat.w; \ - flt_t quat_i = quat.i; \ - flt_t quat_j = quat.j; \ - flt_t quat_k = quat.k; \ - flt_t w2 = quat_w * quat_w; \ - flt_t i2 = quat_i * quat_i; \ - flt_t j2 = quat_j * quat_j; \ - flt_t k2 = quat_k * quat_k; \ - flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \ - flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \ - flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \ - flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \ - flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \ - flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \ - \ - mat##_0 = w2 + i2 - j2 - k2; \ - mat##_3 = twoij - twokw; \ - mat##_6 = twojw + twoik; \ - \ - mat##_1 = twoij + twokw; \ - mat##_4 = w2 - i2 + j2 - k2; \ - mat##_7 = twojk - twoiw; \ - \ - mat##_2 = twoik - twojw; \ - mat##_5 = twojk + twoiw; \ - mat##_8 = w2 - i2 - j2 + k2; \ +#define ME_quat_to_mat_trans(quat, mat) \ +{ \ + flt_t quat_w = quat.w; \ + flt_t quat_i = quat.i; \ + flt_t quat_j = quat.j; \ + flt_t quat_k = quat.k; \ + flt_t w2 = quat_w * quat_w; \ + flt_t i2 = quat_i * quat_i; \ + flt_t j2 = quat_j * quat_j; \ + flt_t k2 = quat_k * quat_k; \ + 
flt_t twoij = (flt_t)2.0 * quat_i * quat_j; \ + flt_t twoik = (flt_t)2.0 * quat_i * quat_k; \ + flt_t twojk = (flt_t)2.0 * quat_j * quat_k; \ + flt_t twoiw = (flt_t)2.0 * quat_i * quat_w; \ + flt_t twojw = (flt_t)2.0 * quat_j * quat_w; \ + flt_t twokw = (flt_t)2.0 * quat_k * quat_w; \ + \ + mat##_0 = w2 + i2 - j2 - k2; \ + mat##_3 = twoij - twokw; \ + mat##_6 = twojw + twoik; \ + \ + mat##_1 = twoij + twokw; \ + mat##_4 = w2 - i2 + j2 - k2; \ + mat##_7 = twojk - twoiw; \ + \ + mat##_2 = twoik - twojw; \ + mat##_5 = twojk + twoiw; \ + mat##_8 = w2 - i2 - j2 + k2; \ } /* ---------------------------------------------------------------------- diagonal matrix times a full matrix ------------------------------------------------------------------------- */ -#define ME_diag_times3(d, m, ans) \ - { \ - ans##_0 = d[0] * m##_0; \ - ans##_1 = d[0] * m##_1; \ - ans##_2 = d[0] * m##_2; \ - ans##_3 = d[1] * m##_3; \ - ans##_4 = d[1] * m##_4; \ - ans##_5 = d[1] * m##_5; \ - ans##_6 = d[2] * m##_6; \ - ans##_7 = d[2] * m##_7; \ - ans##_8 = d[2] * m##_8; \ +#define ME_diag_times3(d, m, ans) \ + { \ + ans##_0 = d[0] * m##_0; \ + ans##_1 = d[0] * m##_1; \ + ans##_2 = d[0] * m##_2; \ + ans##_3 = d[1] * m##_3; \ + ans##_4 = d[1] * m##_4; \ + ans##_5 = d[1] * m##_5; \ + ans##_6 = d[2] * m##_6; \ + ans##_7 = d[2] * m##_7; \ + ans##_8 = d[2] * m##_8; \ } -#define ME_diag_times3a(d, m, ans) \ - { \ - ans##_0 = d##_0 * m##_0; \ - ans##_1 = d##_0 * m##_1; \ - ans##_2 = d##_0 * m##_2; \ - ans##_3 = d##_1 * m##_3; \ - ans##_4 = d##_1 * m##_4; \ - ans##_5 = d##_1 * m##_5; \ - ans##_6 = d##_2 * m##_6; \ - ans##_7 = d##_2 * m##_7; \ - ans##_8 = d##_2 * m##_8; \ +#define ME_diag_times3a(d, m, ans) \ + { \ + ans##_0 = d##_0 * m##_0; \ + ans##_1 = d##_0 * m##_1; \ + ans##_2 = d##_0 * m##_2; \ + ans##_3 = d##_1 * m##_3; \ + ans##_4 = d##_1 * m##_4; \ + ans##_5 = d##_1 * m##_5; \ + ans##_6 = d##_2 * m##_6; \ + ans##_7 = d##_2 * m##_7; \ + ans##_8 = d##_2 * m##_8; \ } /* ---------------------------------------------------------------------- multiply the transpose of mat1 times mat2 ------------------------------------------------------------------------- */ -#define ME_transpose_times3(m1, m2, ans) \ -{ \ - ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \ - ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \ - ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \ - ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \ - ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \ - ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \ - ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \ - ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \ - ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \ +#define ME_transpose_times3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6; \ + ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7; \ + ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8; \ + ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6; \ + ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7; \ + ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8; \ + ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6; \ + ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7; \ + ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8; \ } /* ---------------------------------------------------------------------- normalize a vector, return in ans ------------------------------------------------------------------------- */ -#define 
ME_normalize3(v0, v1, v2, ans) \ -{ \ - flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \ - ans##_0 = v0 * scale; \ - ans##_1 = v1 * scale; \ - ans##_2 = v2 * scale; \ +#define ME_normalize3(v0, v1, v2, ans) \ +{ \ + flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2); \ + ans##_0 = v0 * scale; \ + ans##_1 = v1 * scale; \ + ans##_2 = v2 * scale; \ } /* ---------------------------------------------------------------------- add two matrices ------------------------------------------------------------------------- */ -#define ME_plus3(m1, m2, ans) \ -{ \ - ans##_0 = m1##_0 + m2##_0; \ - ans##_1 = m1##_1 + m2##_1; \ - ans##_2 = m1##_2 + m2##_2; \ - ans##_3 = m1##_3 + m2##_3; \ - ans##_4 = m1##_4 + m2##_4; \ - ans##_5 = m1##_5 + m2##_5; \ - ans##_6 = m1##_6 + m2##_6; \ - ans##_7 = m1##_7 + m2##_7; \ - ans##_8 = m1##_8 + m2##_8; \ +#define ME_plus3(m1, m2, ans) \ +{ \ + ans##_0 = m1##_0 + m2##_0; \ + ans##_1 = m1##_1 + m2##_1; \ + ans##_2 = m1##_2 + m2##_2; \ + ans##_3 = m1##_3 + m2##_3; \ + ans##_4 = m1##_4 + m2##_4; \ + ans##_5 = m1##_5 + m2##_5; \ + ans##_6 = m1##_6 + m2##_6; \ + ans##_7 = m1##_7 + m2##_7; \ + ans##_8 = m1##_8 + m2##_8; \ } /* ---------------------------------------------------------------------- @@ -135,7 +135,7 @@ determinant of a matrix ------------------------------------------------------------------------- */ -#define ME_det3(m) \ +#define ME_det3(m) \ ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \ m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \ m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 ) @@ -144,8 +144,8 @@ row vector times matrix ------------------------------------------------------------------------- */ -#define ME_vecmat(v, m, ans) \ -{ \ +#define ME_vecmat(v, m, ans) \ +{ \ ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6; \ ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7; \ ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8; \ @@ -155,214 +155,214 @@ cross product of 2 vectors ------------------------------------------------------------------------- */ -#define ME_cross3(v1, v2, ans) \ -{ \ - ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \ - ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \ - ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \ +#define ME_cross3(v1, v2, ans) \ +{ \ + ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1; \ + ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2; \ + ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0; \ } /* ---------------------------------------------------------------------- cross product of 2 vectors ------------------------------------------------------------------------- */ -#define ME_mv0_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \ - ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \ - ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \ +#define ME_mv0_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1; \ + ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2; \ + ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0; \ } -#define ME_mv1_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \ - ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \ - ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \ +#define ME_mv1_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1; \ + ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2; \ + ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0; \ } -#define ME_mv2_cross3(m1, v2, ans) \ -{ \ - ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \ - ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \ - ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \ 
+#define ME_mv2_cross3(m1, v2, ans) \ +{ \ + ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1; \ + ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2; \ + ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0; \ } #define ME_compute_eta_torque(m1, m2, s1, ans) \ -{ \ - flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ - m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ - m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ - den = (flt_t)1.0 / den; \ - \ +{ \ + flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7- \ + m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5- \ + m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8; \ + den = (flt_t)1.0 / den; \ + \ ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0- \ - m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ - m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ - m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ - m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ - \ - ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ - (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ - (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ - m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ - m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ - \ + m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+ \ + m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+ \ + m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den; \ + \ + ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+ \ + (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5- \ + (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2- \ + m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+ \ + m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den; \ + \ ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4- \ - m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ - m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ - (flt_t)2.0*m1##_4*m1##_0*m2##_2- \ - (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ - m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ - \ + m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1- \ + m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_2- \ + (flt_t)2.0*m1##_3*m2##_2*m1##_1+ \ + m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den; \ + \ ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+ \ - m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ - m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ - m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ - m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ - \ - ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ - (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ - (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ - m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ - m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ - \ - ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ - m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ - (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ - m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ - (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ - den; \ - \ - ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ - (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ - m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ - m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ - m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ - \ - ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ - (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ - (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ - 
m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ - m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ - \ - ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ - m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ - m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ - (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ + m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+ \ + m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8- \ + m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- \ + m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den; \ + \ + ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+ \ + (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5- \ + (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+ \ + m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2- \ + m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den; \ + \ + ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4- \ + m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+ \ + (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+ \ + m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4- \ + (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)* \ + den; \ + \ + ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+ \ + (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+ \ + m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5- \ + m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7- \ + m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den; \ + \ + ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7- \ + (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+ \ + (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8- \ + m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+ \ + m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den; \ + \ + ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4- \ + m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7- \ + m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+ \ + (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+ \ m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)* \ - den; \ + den; \ } -#define ME_vcopy4(dst,src) \ - dst##_0 = src##_0; \ - dst##_1 = src##_1; \ - dst##_2 = src##_2; \ +#define ME_vcopy4(dst,src) \ + dst##_0 = src##_0; \ + dst##_1 = src##_1; \ + dst##_2 = src##_2; \ dst##_3 = src##_3; -#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \ -{ \ - flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \ - flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \ - \ - aug_3 = v_0; \ - aug_0 = m1##_0; \ - aug_1 = m1##_1; \ - aug_2 = m1##_2; \ - aug_7 = v_1; \ - aug_4 = m1##_3; \ - aug_5 = m1##_4; \ - aug_6 = m1##_5; \ - aug_11 = v_2; \ - aug_8 = m1##_6; \ - aug_9 = m1##_7; \ - aug_10 = m1##_8; \ - \ - if (fabs(aug_4) > fabs(aug_0)) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ - swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ - swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ - swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ - } \ - if (fabs(aug_8) > fabs(aug_0)) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ +#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error) \ +{ \ + flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5; \ + flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t; \ + \ + aug_3 = v_0; \ + aug_0 = m1##_0; \ + aug_1 = m1##_1; \ + aug_2 = m1##_2; \ + aug_7 = v_1; \ + aug_4 = m1##_3; \ + aug_5 = m1##_4; \ + aug_6 = m1##_5; \ + aug_11 = v_2; \ + aug_8 = m1##_6; \ + aug_9 = m1##_7; \ + aug_10 = m1##_8; \ + \ + if (fabs(aug_4) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = 
swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } \ + if (fabs(aug_8) > fabs(aug_0)) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ - } \ - \ - if (aug_0 != (flt_t)0.0) { \ - } else if (aug_4 != (flt_t)0.0) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ - swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ - swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ - swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ - } else if (aug_8 != (flt_t)0.0) { \ - flt_t swapt; \ - swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ - swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ - swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ - swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ - } else \ - error = 1; \ - \ - t = aug_4 / aug_0; \ - aug_5 -= t * aug_1; \ - aug_6 -= t * aug_2; \ - aug_7 -= t * aug_3; \ - t = aug_8 / aug_0; \ - aug_9 -= t * aug_1; \ - aug_10 -= t * aug_2; \ - aug_11 -= t * aug_3; \ - \ - if (fabs(aug_9) > fabs(aug_5)) { \ - flt_t swapt; \ - swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ - swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ - swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ - swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ - } \ - \ - if (aug_5 != (flt_t)0.0) { \ - } else if (aug_9 != (flt_t)0.0) { \ - flt_t swapt; \ + } \ + \ + if (aug_0 != (flt_t)0.0) { \ + } else if (aug_4 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_4; aug_4 = swapt; \ + swapt = aug_1; aug_1 = aug_5; aug_5 = swapt; \ + swapt = aug_2; aug_2 = aug_6; aug_6 = swapt; \ + swapt = aug_3; aug_3 = aug_7; aug_7 = swapt; \ + } else if (aug_8 != (flt_t)0.0) { \ + flt_t swapt; \ + swapt = aug_0; aug_0 = aug_8; aug_8 = swapt; \ + swapt = aug_1; aug_1 = aug_9; aug_9 = swapt; \ + swapt = aug_2; aug_2 = aug_10; aug_10 = swapt; \ + swapt = aug_3; aug_3 = aug_11; aug_11 = swapt; \ + } else \ + error = 1; \ + \ + t = aug_4 / aug_0; \ + aug_5 -= t * aug_1; \ + aug_6 -= t * aug_2; \ + aug_7 -= t * aug_3; \ + t = aug_8 / aug_0; \ + aug_9 -= t * aug_1; \ + aug_10 -= t * aug_2; \ + aug_11 -= t * aug_3; \ + \ + if (fabs(aug_9) > fabs(aug_5)) { \ + flt_t swapt; \ + swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + if (aug_5 != (flt_t)0.0) { \ + } else if (aug_9 != (flt_t)0.0) { \ + flt_t swapt; \ swapt = aug_4; aug_4 = aug_8; aug_8 = swapt; \ - swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ - swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ - swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ - } \ - \ - t = aug_9 / aug_5; \ - aug_10 -= t * aug_6; \ - aug_11 -= t * aug_7; \ - \ - if (aug_10 == (flt_t)0.0) \ - error = 1; \ - \ - ans##_2 = aug_11/aug_10; \ - t = (flt_t)0.0; \ - t += aug_6 * ans##_2; \ - ans##_1 = (aug_7-t) / aug_5; \ - t = (flt_t)0.0; \ - t += aug_1 * ans##_1; \ - t += aug_2 * ans##_2; \ - ans##_0 = (aug_3 - t) / aug_0; \ + swapt = aug_5; aug_5 = aug_9; aug_9 = swapt; \ + swapt = aug_6; aug_6 = aug_10; aug_10 = swapt; \ + swapt = aug_7; aug_7 = aug_11; aug_11 = swapt; \ + } \ + \ + t = aug_9 / aug_5; \ + aug_10 -= t * aug_6; \ + aug_11 -= t * aug_7; \ + \ + if (aug_10 == (flt_t)0.0) \ + error = 1; \ + \ + ans##_2 = aug_11/aug_10; \ + t = (flt_t)0.0; \ + t += aug_6 * ans##_2; \ + ans##_1 = (aug_7-t) / aug_5; \ + t = (flt_t)0.0; \ + t += aug_1 * ans##_1; \ + t += 
aug_2 * ans##_2; \ + ans##_0 = (aug_3 - t) / aug_0; \ } /* ---------------------------------------------------------------------- normalize a quaternion ------------------------------------------------------------------------- */ -#define ME_qnormalize(q) \ -{ \ - double norm = 1.0 / \ - sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ - q##_w *= norm; \ - q##_i *= norm; \ - q##_j *= norm; \ - q##_k *= norm; \ +#define ME_qnormalize(q) \ +{ \ + double norm = 1.0 / \ + sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ + q##_w *= norm; \ + q##_i *= norm; \ + q##_j *= norm; \ + q##_k *= norm; \ } /* ---------------------------------------------------------------------- @@ -373,106 +373,106 @@ and divide by principal moments ------------------------------------------------------------------------- */ -#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ -{ \ - double wbody_0, wbody_1, wbody_2; \ - double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ - \ - double w2 = quat##_w * quat##_w; \ - double i2 = quat##_i * quat##_i; \ - double j2 = quat##_j * quat##_j; \ - double k2 = quat##_k * quat##_k; \ - double twoij = 2.0 * quat##_i * quat##_j; \ - double twoik = 2.0 * quat##_i * quat##_k; \ - double twojk = 2.0 * quat##_j * quat##_k; \ - double twoiw = 2.0 * quat##_i * quat##_w; \ - double twojw = 2.0 * quat##_j * quat##_w; \ - double twokw = 2.0 * quat##_k * quat##_w; \ - \ - rot##_0 = w2 + i2 - j2 - k2; \ - rot##_1 = twoij - twokw; \ - rot##_2 = twojw + twoik; \ - \ - rot##_3 = twoij + twokw; \ - rot##_4 = w2 - i2 + j2 - k2; \ - rot##_5 = twojk - twoiw; \ - \ - rot##_6 = twoik - twojw; \ - rot##_7 = twojk + twoiw; \ - rot##_8 = w2 - i2 - j2 + k2; \ - \ +#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ +{ \ + double wbody_0, wbody_1, wbody_2; \ + double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ + \ + double w2 = quat##_w * quat##_w; \ + double i2 = quat##_i * quat##_i; \ + double j2 = quat##_j * quat##_j; \ + double k2 = quat##_k * quat##_k; \ + double twoij = 2.0 * quat##_i * quat##_j; \ + double twoik = 2.0 * quat##_i * quat##_k; \ + double twojk = 2.0 * quat##_j * quat##_k; \ + double twoiw = 2.0 * quat##_i * quat##_w; \ + double twojw = 2.0 * quat##_j * quat##_w; \ + double twokw = 2.0 * quat##_k * quat##_w; \ + \ + rot##_0 = w2 + i2 - j2 - k2; \ + rot##_1 = twoij - twokw; \ + rot##_2 = twojw + twoik; \ + \ + rot##_3 = twoij + twokw; \ + rot##_4 = w2 - i2 + j2 - k2; \ + rot##_5 = twojk - twoiw; \ + \ + rot##_6 = twoik - twojw; \ + rot##_7 = twojk + twoiw; \ + rot##_8 = w2 - i2 - j2 + k2; \ + \ wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \ wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \ wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \ - \ - wbody_0 *= moments_0; \ - wbody_1 *= moments_1; \ - wbody_2 *= moments_2; \ - \ + \ + wbody_0 *= moments_0; \ + wbody_1 *= moments_1; \ + wbody_2 *= moments_2; \ + \ w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \ w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \ w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \ } -#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ -{ \ - angmomin[0] += dtf * torque[0]; \ - double angmom_0 = angmomin[0]; \ - angmomin[1] += dtf * torque[1]; \ - double angmom_1 = angmomin[1]; \ - angmomin[2] += dtf * torque[2]; \ - double angmom_2 = angmomin[2]; \ - \ - double quat_w = quatin[0]; \ - double quat_i = quatin[1]; \ - double 
quat_j = quatin[2]; \ - double quat_k = quatin[3]; \ - \ - double omega_0, omega_1, omega_2; \ - ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ - \ - double wq_0, wq_1, wq_2, wq_3; \ - wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ - wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ - wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ - wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ - \ - double qfull_w, qfull_i, qfull_j, qfull_k; \ - qfull_w = quat_w + dtq * wq_0; \ - qfull_i = quat_i + dtq * wq_1; \ - qfull_j = quat_j + dtq * wq_2; \ - qfull_k = quat_k + dtq * wq_3; \ - ME_qnormalize(qfull); \ - \ - double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ - qhalf_w = quat_w + 0.5*dtq * wq_0; \ - qhalf_i = quat_i + 0.5*dtq * wq_1; \ - qhalf_j = quat_j + 0.5*dtq * wq_2; \ - qhalf_k = quat_k + 0.5*dtq * wq_3; \ - ME_qnormalize(qhalf); \ - \ - ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ - wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ - wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ - wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ - wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ - \ - qhalf_w += 0.5*dtq * wq_0; \ - qhalf_i += 0.5*dtq * wq_1; \ - qhalf_j += 0.5*dtq * wq_2; \ - qhalf_k += 0.5*dtq * wq_3; \ - ME_qnormalize(qhalf); \ - \ - quat_w = 2.0*qhalf_w - qfull_w; \ - quat_i = 2.0*qhalf_i - qfull_i; \ - quat_j = 2.0*qhalf_j - qfull_j; \ - quat_k = 2.0*qhalf_k - qfull_k; \ - ME_qnormalize(quat); \ - \ - quatin[0] = quat_w; \ - quatin[1] = quat_i; \ - quatin[2] = quat_j; \ - quatin[3] = quat_k; \ +#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ +{ \ + angmomin[0] += dtf * torque[0]; \ + double angmom_0 = angmomin[0]; \ + angmomin[1] += dtf * torque[1]; \ + double angmom_1 = angmomin[1]; \ + angmomin[2] += dtf * torque[2]; \ + double angmom_2 = angmomin[2]; \ + \ + double quat_w = quatin[0]; \ + double quat_i = quatin[1]; \ + double quat_j = quatin[2]; \ + double quat_k = quatin[3]; \ + \ + double omega_0, omega_1, omega_2; \ + ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ + \ + double wq_0, wq_1, wq_2, wq_3; \ + wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ + wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ + wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ + wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ + \ + double qfull_w, qfull_i, qfull_j, qfull_k; \ + qfull_w = quat_w + dtq * wq_0; \ + qfull_i = quat_i + dtq * wq_1; \ + qfull_j = quat_j + dtq * wq_2; \ + qfull_k = quat_k + dtq * wq_3; \ + ME_qnormalize(qfull); \ + \ + double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ + qhalf_w = quat_w + 0.5*dtq * wq_0; \ + qhalf_i = quat_i + 0.5*dtq * wq_1; \ + qhalf_j = quat_j + 0.5*dtq * wq_2; \ + qhalf_k = quat_k + 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ + wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ + wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ + wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ + wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ + \ + qhalf_w += 0.5*dtq * wq_0; \ + qhalf_i += 0.5*dtq * wq_1; \ + qhalf_j += 0.5*dtq * wq_2; \ + qhalf_k += 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + quat_w = 2.0*qhalf_w - qfull_w; \ + quat_i = 2.0*qhalf_i - qfull_i; \ + quat_j = 2.0*qhalf_j - qfull_j; \ + quat_k = 2.0*qhalf_k - qfull_k; \ + ME_qnormalize(quat); \ + \ + quatin[0] = quat_w; \ + quatin[1] = quat_i; \ + 
quatin[2] = quat_j; \ + quatin[3] = quat_k; \ } #endif diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index bff3d53636..c5574a78c7 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -51,11 +51,11 @@ NBinIntel::~NBinIntel() { const int * bins = this->bins; const int * _atombin = this->_atombin; const int * _binpacked = this->_binpacked; - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(binhead,bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif -} +} /* ---------------------------------------------------------------------- setup for bin_atoms() @@ -70,8 +70,8 @@ void NBinIntel::bin_atoms_setup(int nall) #ifdef _LMP_INTEL_OFFLOAD if (_offload_alloc) { const int * binhead = this->binhead; - #pragma offload_transfer target(mic:_cop) \ - nocopy(binhead:alloc_if(0) free_if(1)) + #pragma offload_transfer target(mic:_cop) \ + nocopy(binhead:alloc_if(0) free_if(1)) } #endif @@ -98,8 +98,8 @@ void NBinIntel::bin_atoms_setup(int nall) const int * bins = this->bins; const int * _atombin = this->_atombin; const int * _binpacked = this->_binpacked; - #pragma offload_transfer target(mic:_cop) \ - nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) + #pragma offload_transfer target(mic:_cop) \ + nocopy(bins,_atombin,_binpacked:alloc_if(0) free_if(1)) } #endif memory->destroy(bins); @@ -157,10 +157,10 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { const flt_t dx = (INTEL_BIGP - bboxhi[0]); const flt_t dy = (INTEL_BIGP - bboxhi[1]); const flt_t dz = (INTEL_BIGP - bboxhi[2]); - if (dx * dx + dy * dy + dz * dz < - static_cast<flt_t>(neighbor->cutneighmaxsq)) + if (dx * dx + dy * dy + dz * dz < + static_cast<flt_t>(neighbor->cutneighmaxsq)) error->one(FLERR, - "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); + "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}."); } // ---------- Grow and cast/pack buffers ------------- @@ -183,7 +183,7 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, - sizeof(ATOM_T)); + sizeof(ATOM_T)); buffers->thr_pack(ifrom, ito, 0); } _fix->stop_watch(TIME_PACK); diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp index ae4f599176..06c10c080f 100644 --- a/src/USER-INTEL/npair_full_bin_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_intel.cpp @@ -70,48 +70,48 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { #endif buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end, - _fix->nbor_pack_width()); + _fix->nbor_pack_width()); int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD if (_fix->three_body_neighbor()) { if (need_ic) { if (offload_noghost) { - bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,1,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,1,1,0,1>(0, list, buffers, host_start, nlocal, off_end); } else { - bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,1,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,1,0,1>(0, list, buffers, 
host_start, nlocal); } } else { if (offload_noghost) { - bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,1>(0, list, buffers, host_start, nlocal, off_end); } else { - bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,0,1,0,1>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,1,0,1>(0, list, buffers, host_start, nlocal); } } } else { if (need_ic) { if (offload_noghost) { - bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,1,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,1,1,0,0>(0, list, buffers, host_start, nlocal, off_end); } else { - bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,1,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,1,1,0,0>(0, list, buffers, host_start, nlocal); } } else { if (offload_noghost) { - bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,1,0,1,0,0>(0, list, buffers, host_start, nlocal, off_end); } else { - bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal); + bin_newton<flt_t,acc_t,0,0,1,0,0>(1, list, buffers, 0, off_end); + bin_newton<flt_t,acc_t,0,0,1,0,0>(0, list, buffers, host_start, nlocal); } } } diff --git a/src/USER-INTEL/npair_full_bin_intel.h b/src/USER-INTEL/npair_full_bin_intel.h index 83f2c3cd4c..0f8a27b3b4 100644 --- a/src/USER-INTEL/npair_full_bin_intel.h +++ b/src/USER-INTEL/npair_full_bin_intel.h @@ -15,7 +15,7 @@ NPairStyle(full/bin/intel, NPairFullBinIntel, - NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | + NP_FULL | NP_BIN | NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL) #else diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp index e7d5995cc5..c761557097 100644 --- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp @@ -26,7 +26,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : +NPairHalfBinNewtonIntel::NPairHalfBinNewtonIntel(LAMMPS *lmp) : NPairIntel(lmp) {} /* ---------------------------------------------------------------------- @@ -75,14 +75,14 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { bin_newton<flt_t,acc_t,1,1,0,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal, - off_end); + bin_newton<flt_t,acc_t,1,1,0,0,0>(0, list, buffers, host_start, nlocal, + off_end); } else { bin_newton<flt_t,acc_t,0,1,0,0,0>(1, list, 
buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); @@ -90,7 +90,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { } else { if (offload_noghost) { bin_newton<flt_t,acc_t,1,0,0,0,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, + bin_newton<flt_t,acc_t,1,0,0,0,0>(0, list, buffers, host_start, nlocal, off_end); } else { bin_newton<flt_t,acc_t,0,0,0,0,0>(1, list, buffers, 0, off_end); @@ -98,7 +98,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { } } #else - if (need_ic) + if (need_ic) bin_newton<flt_t,acc_t,0,1,0,0,0>(0, list, buffers, host_start, nlocal); else bin_newton<flt_t,acc_t,0,0,0,0,0>(0, list, buffers, host_start, nlocal); diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp index 3c36458f06..d70f1ec589 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp @@ -26,7 +26,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : +NPairHalfBinNewtonTriIntel::NPairHalfBinNewtonTriIntel(LAMMPS *lmp) : NPairIntel(lmp) {} /* ---------------------------------------------------------------------- @@ -75,14 +75,14 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int need_ic = 0; if (atom->molecular) dminimum_image_check(need_ic, neighbor->cutneighmax, neighbor->cutneighmax, - neighbor->cutneighmax); + neighbor->cutneighmax); #ifdef _LMP_INTEL_OFFLOAD if (need_ic) { if (offload_noghost) { bin_newton<flt_t,acc_t,1,1,0,1,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal, - off_end); + bin_newton<flt_t,acc_t,1,1,0,1,0>(0, list, buffers, host_start, nlocal, + off_end); } else { bin_newton<flt_t,acc_t,0,1,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,1,0,1,0>(0, list, buffers, host_start, nlocal); @@ -90,8 +90,8 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { } else { if (offload_noghost) { bin_newton<flt_t,acc_t,1,0,0,1,0>(1, list, buffers, 0, off_end); - bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal, - off_end); + bin_newton<flt_t,acc_t,1,0,0,1,0>(0, list, buffers, host_start, nlocal, + off_end); } else { bin_newton<flt_t,acc_t,0,0,0,1,0>(1, list, buffers, 0, off_end); bin_newton<flt_t,acc_t,0,0,0,1,0>(0, list, buffers, host_start, nlocal); diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 0412398796..b20b1dcd08 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -40,7 +40,7 @@ NPairIntel::~NPairIntel() { #ifdef _LMP_INTEL_OFFLOAD if (_off_map_stencil) { const int * stencil = this->stencil; - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ nocopy(stencil:alloc_if(0) free_if(1)) } #endif @@ -49,10 +49,10 @@ NPairIntel::~NPairIntel() { /* ---------------------------------------------------------------------- */ template <class flt_t, class acc_t, int offload_noghost, int need_ic, - int FULL, int TRI, int THREE> -void NPairIntel::bin_newton(const int offload, NeighList *list, - IntelBuffers<flt_t,acc_t> *buffers, - const int astart, const int aend, + int FULL, int TRI, int THREE> +void NPairIntel::bin_newton(const int offload, NeighList *list, + IntelBuffers<flt_t,acc_t> *buffers, + const 
int astart, const int aend, const int offload_end) { if (aend-astart == 0) return; @@ -66,7 +66,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (THREE == 0 && offload) { if (INTEL_MIC_NBOR_PAD > 1) pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else + } else #endif if (THREE == 0 && INTEL_NBOR_PAD > 1) pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); @@ -120,7 +120,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, overflow = _fix->get_off_overflow_flag(); _fix->stop_watch(TIME_HOST_NEIGHBOR); _fix->start_watch(TIME_OFFLOAD_LATENCY); - } else + } else #endif { tnum = comm->nthreads; @@ -193,8 +193,8 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int end = stencil[k] + 1; for (int kk = k + 1; kk < nstencil; kk++) { if (stencil[kk-1]+1 == stencil[kk]) { - end++; - k++; + end++; + k++; } else break; } binend[nstencilp] = end; @@ -214,16 +214,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int tid, ifrom, ito; if (THREE) { - IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); + IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width); } else { - IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); + IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads); } ifrom += astart; ito += astart; int e_ito = ito; if (THREE && ito == num) { - int imod = ito % pack_width; - if (imod) e_ito += pack_width - imod; + int imod = ito % pack_width; + if (imod) e_ito += pack_width - imod; } const int list_size = (e_ito + tid * 2 + 2) * maxnbors; @@ -251,313 +251,313 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, // loop over all atoms in other bins in stencil, store every pair int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; if (THREE) { - lane = 0; - max_chunk = 0; + lane = 0; + max_chunk = 0; } for (int i = ifrom; i < ito; i++) { const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const int itype = x[i].w; - tagint itag; - if (THREE) itag = tag[i]; + tagint itag; + if (THREE) itag = tag[i]; const int ioffset = ntypes * itype; const int ibin = atombin[i]; - if (ibin != oldbin) { - oldbin = ibin; - ncount = 0; - for (int k = 0; k < nstencilp; k++) { - const int bstart = binhead[ibin + binstart[k]]; - const int bend = binhead[ibin + binend[k]]; + if (ibin != oldbin) { + oldbin = ibin; + ncount = 0; + for (int k = 0; k < nstencilp; k++) { + const int bstart = binhead[ibin + binstart[k]]; + const int bend = binhead[ibin + binend[k]]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = bstart; jj < bend; jj++) - tj[ncount++] = binpacked[jj]; - } + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) + tj[ncount++] = binpacked[jj]; + } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd - #endif - for (int u = 0; u < ncount; u++) { - const int j = tj[u]; - tx[u] = x[j].x; - ty[u] = x[j].y; - tz[u] = x[j].z; - tjtype[u] = x[j].w; - } - - if (FULL == 0 || TRI == 1) { - icount = 0; - istart = ncount; - const int alignb = INTEL_DATA_ALIGN / sizeof(int); - int nedge = istart % alignb; - if (nedge) istart + (alignb - nedge); - itx = tx + istart; - ity = ty + istart; - itz = tz + istart; - itj = tj + istart; - itjtype = tjtype + istart; + #pragma simd + #endif + for (int u = 0; u < ncount; u++) { + const int j = tj[u]; + tx[u] = x[j].x; + ty[u] = x[j].y; + tz[u] = x[j].z; + tjtype[u] = x[j].w; + } + + if (FULL == 0 || TRI == 1) { + icount = 
0; + istart = ncount; + const int alignb = INTEL_DATA_ALIGN / sizeof(int); + int nedge = istart % alignb; + if (nedge) istart + (alignb - nedge); + itx = tx + istart; + ity = ty + istart; + itz = tz + istart; + itj = tj + istart; + itjtype = tjtype + istart; const int bstart = binhead[ibin]; - const int bend = binhead[ibin + 1]; + const int bend = binhead[ibin + 1]; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = bstart; jj < bend; jj++) { - const int j = binpacked[jj]; - itj[icount] = j; - itx[icount] = x[j].x; - ity[icount] = x[j].y; - itz[icount] = x[j].z; - itjtype[icount] = x[j].w; - icount++; - } - if (icount + istart > obound) *overflow = 1; - } else - if (ncount > obound) *overflow = 1; - } - - // ---------------------- Loop over i bin + #pragma vector aligned + #pragma simd + #endif + for (int jj = bstart; jj < bend; jj++) { + const int j = binpacked[jj]; + itj[icount] = j; + itx[icount] = x[j].x; + ity[icount] = x[j].y; + itz[icount] = x[j].z; + itjtype[icount] = x[j].w; + icount++; + } + if (icount + istart > obound) *overflow = 1; + } else + if (ncount > obound) *overflow = 1; + } + + // ---------------------- Loop over i bin int n = 0; - if (FULL == 0 || TRI == 1) { + if (FULL == 0 || TRI == 1) { #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma ivdep - #endif - for (int u = 0; u < icount; u++) { - int addme = 1; - int j = itj[u]; - - // Cutoff Check - const flt_t delx = xtmp - itx[u]; - const flt_t dely = ytmp - ity[u]; - const flt_t delz = ztmp - itz[u]; - const int jtype = itjtype[u]; - const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq > cutneighsq[ioffset + jtype]) addme = 0; - - // i bin (half) check and offload ghost check - if (j < nlocal) { - const int ijmod = (i + j) % 2; - if (i > j) { - if (ijmod == 0) addme = 0; - } else if (i < j) { - if (ijmod == 1) addme = 0; - } else - addme = 0; + #pragma ivdep + #endif + for (int u = 0; u < icount; u++) { + int addme = 1; + int j = itj[u]; + + // Cutoff Check + const flt_t delx = xtmp - itx[u]; + const flt_t dely = ytmp - ity[u]; + const flt_t delz = ztmp - itz[u]; + const int jtype = itjtype[u]; + const flt_t rsq = delx * delx + dely * dely + delz * delz; + if (rsq > cutneighsq[ioffset + jtype]) addme = 0; + + // i bin (half) check and offload ghost check + if (j < nlocal) { + const int ijmod = (i + j) % 2; + if (i > j) { + if (ijmod == 0) addme = 0; + } else if (i < j) { + if (ijmod == 1) addme = 0; + } else + addme = 0; #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && i < offload_end) addme = 0; - #endif - } else { + if (offload_noghost && i < offload_end) addme = 0; + #endif + } else { #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost && offload) addme = 0; - #endif - if (itz[u] < ztmp) addme = 0; - if (itz[u] == ztmp) { + if (offload_noghost && offload) addme = 0; + #endif + if (itz[u] < ztmp) addme = 0; + if (itz[u] == ztmp) { if (ity[u] < ytmp) addme = 0; if (ity[u] == ytmp && itx[u] < xtmp) addme = 0; } - } - - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - j = -j - 1; - } - - if (addme) - neighptr[n++] = j; - } - } // if FULL==0 - - // ---------------------- Loop over other bins - - int n2, *neighptr2; - if (THREE) { - n = pack_offset; - n2 = pack_offset + maxnbors; - neighptr2 = neighptr; - } - #if defined(LMP_SIMD_COMPILER) + } + + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (addme) + 
neighptr[n++] = j; + } + } // if FULL==0 + + // ---------------------- Loop over other bins + + int n2, *neighptr2; + if (THREE) { + n = pack_offset; + n2 = pack_offset + maxnbors; + neighptr2 = neighptr; + } + #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma ivdep - #endif - for (int u = 0; u < ncount; u++) { - int addme = 1; + #endif + for (int u = 0; u < ncount; u++) { + int addme = 1; int j = tj[u]; - if (FULL) - if (i == j) addme = 0; + if (FULL) + if (i == j) addme = 0; - // Cutoff Check + // Cutoff Check const flt_t delx = xtmp - tx[u]; const flt_t dely = ytmp - ty[u]; const flt_t delz = ztmp - tz[u]; - const int jtype = tjtype[u]; + const int jtype = tjtype[u]; const flt_t rsq = delx * delx + dely * dely + delz * delz; if (rsq > cutneighsq[ioffset + jtype]) addme = 0; - - // Triclinic - if (TRI) { - if (tz[u] < ztmp) addme = 0; - if (tz[u] == ztmp) { - if (ty[u] < ytmp) addme = 0; - if (ty[u] == ytmp) { - if (tx[u] < xtmp) addme = 0; + + // Triclinic + if (TRI) { + if (tz[u] < ztmp) addme = 0; + if (tz[u] == ztmp) { + if (ty[u] < ytmp) addme = 0; + if (ty[u] == ytmp) { + if (tx[u] < xtmp) addme = 0; if (tx[u] == xtmp && j <= i) addme = 0; } - } - } + } + } - // offload ghost check + // offload ghost check #ifdef _LMP_INTEL_OFFLOAD - if (offload_noghost) { - if (j < nlocal) { - if (i < offload_end) addme = 0; + if (offload_noghost) { + if (j < nlocal) { + if (i < offload_end) addme = 0; } else if (offload) addme = 0; - } - #endif - - int pj; - if (THREE) pj = j; - if (need_ic) { - int no_special; - ominimum_image_check(no_special, delx, dely, delz); - if (no_special) - j = -j - 1; - } - - if (THREE) { - const int jtag = tag[pj]; - int flist = 0; - if (itag > jtag) { - if ((itag+jtag) % 2 == 0) flist = 1; - } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) flist = 1; - } else { - if (tz[u] < ztmp) flist = 1; - else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; - else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) - flist = 1; - } - if (addme) { - if (flist) - neighptr2[n2++] = j; - else - neighptr[n++] = j; - } - } else { - if (addme) - neighptr[n++] = j; - } - } // for u + } + #endif + + int pj; + if (THREE) pj = j; + if (need_ic) { + int no_special; + ominimum_image_check(no_special, delx, dely, delz); + if (no_special) + j = -j - 1; + } + + if (THREE) { + const int jtag = tag[pj]; + int flist = 0; + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) flist = 1; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) flist = 1; + } else { + if (tz[u] < ztmp) flist = 1; + else if (tz[u] == ztmp && ty[u] < ytmp) flist = 1; + else if (tz[u] == ztmp && ty[u] == ytmp && tx[u] < xtmp) + flist = 1; + } + if (addme) { + if (flist) + neighptr2[n2++] = j; + else + neighptr[n++] = j; + } + } else { + if (addme) + neighptr[n++] = j; + } + } // for u #ifndef _LMP_INTEL_OFFLOAD - if (exclude) { - int alln = n; - if (THREE) n = pack_offset; - else n = 0; - for (int u = pack_offset; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n++] = j; + if (exclude) { + int alln = n; + if (THREE) n = pack_offset; + else n = 0; + for (int u = pack_offset; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n++] = j; + } + if (THREE) { + alln = n2; + n2 = pack_offset + maxnbors; + for (int u = 
pack_offset + maxnbors; u < alln; u++) { + const int j = neighptr[u]; + int pj = j; + if (need_ic) + if (pj < 0) pj = -j - 1; + const int jtype = x[pj].w; + if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; + neighptr[n2++] = j; + } } - if (THREE) { - alln = n2; - n2 = pack_offset + maxnbors; - for (int u = pack_offset + maxnbors; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n2++] = j; - } - } } - #endif - int ns; - if (THREE) { - int alln = n; - ns = n - pack_offset; - atombin[i] = ns; - n = lane; - for (int u = pack_offset; u < alln; u++) { - neighptr[n] = neighptr[u]; - n += pack_width; - } - ns += n2 - pack_offset - maxnbors; - for (int u = pack_offset + maxnbors; u < n2; u++) { - neighptr[n] = neighptr[u]; - n += pack_width; - } - if (ns > maxnbors) *overflow = 1; - } else - if (n > maxnbors) *overflow = 1; + #endif + int ns; + if (THREE) { + int alln = n; + ns = n - pack_offset; + atombin[i] = ns; + n = lane; + for (int u = pack_offset; u < alln; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + ns += n2 - pack_offset - maxnbors; + for (int u = pack_offset + maxnbors; u < n2; u++) { + neighptr[n] = neighptr[u]; + n += pack_width; + } + if (ns > maxnbors) *overflow = 1; + } else + if (n > maxnbors) *overflow = 1; ilist[i] = i; cnumneigh[i] = ct; - if (THREE) { - cnumneigh[i] += lane; - numneigh[i] = ns; - } else { - int edge = (n % pad_width); - if (edge) { - const int pad_end = n + (pad_width - edge); + if (THREE) { + cnumneigh[i] += lane; + numneigh[i] = ns; + } else { + int edge = (n % pad_width); + if (edge) { + const int pad_end = n + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ - avg=INTEL_COMPILE_WIDTH/2 + avg=INTEL_COMPILE_WIDTH/2 #endif for ( ; n < pad_end; n++) neighptr[n] = e_nall; } - numneigh[i] = n; - } - - if (THREE) { - if (ns > max_chunk) max_chunk = ns; - lane++; - if (lane == pack_width) { - ct += max_chunk * pack_width; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - max_chunk = 0; - pack_offset = maxnbors * pack_width; - lane = 0; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } - } else { - ct += n; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = (ct % alignb); - if (edge) ct += alignb - edge; - neighptr = firstneigh + ct; - if (ct + obound > list_size) { - if (i < ito - 1) { - *overflow = 1; - ct = (ifrom + tid * 2) * maxnbors; - } - } - } + numneigh[i] = n; + } + + if (THREE) { + if (ns > max_chunk) max_chunk = ns; + lane++; + if (lane == pack_width) { + ct += max_chunk * pack_width; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + max_chunk = 0; + pack_offset = maxnbors * pack_width; + lane = 0; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) * maxnbors; + } + } + } + } else { + ct += n; + const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); + const int edge = (ct % alignb); + if (edge) ct += alignb - edge; + neighptr = firstneigh + ct; + if (ct + obound > list_size) { + if (i < ito - 1) { + *overflow = 1; + ct = (ifrom + tid * 2) 
* maxnbors; + } + } + } } if (*overflow == 1) @@ -568,16 +568,16 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax; int ghost_offset = 0, nall_offset = e_nall; if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { + for (int i = ifrom; i < ito; ++i) { int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned + #if __INTEL_COMPILER+0 > 1499 + #pragma vector aligned #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) - #endif - for (int jj = 0; jj < jnum; jj++) { - int j = jlist[jj]; - if (need_ic && j < 0) j = -j - 1; + #endif + for (int jj = 0; jj < jnum; jj++) { + int j = jlist[jj]; + if (need_ic && j < 0) j = -j - 1; if (j < nlocal) { if (j < vlmin) vlmin = j; if (j > vlmax) vlmax = j; @@ -585,33 +585,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (j < vgmin) vgmin = j; if (j > vgmax) vgmax = j; } - } - } - lmin = MIN(lmin,vlmin); - gmin = MIN(gmin,vgmin); - lmax = MAX(lmax,vlmax); - gmax = MAX(gmax,vgmax); + } + } + lmin = MIN(lmin,vlmin); + gmin = MIN(gmin,vgmin); + lmax = MAX(lmax,vlmax); + gmax = MAX(gmax,vgmax); #if defined(_OPENMP) #pragma omp critical #endif { - if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; - if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; - if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; - if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin; + if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax; + if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin; + if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax; + } + #pragma omp barrier + + int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; + if (nghost < 0) nghost = 0; + if (offload) { + ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; + nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; + } else { + ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; + nall_offset = nlocal + nghost; } - #pragma omp barrier - - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } } // if separate_buffers #endif @@ -620,67 +620,67 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; - if (THREE) { - const int trip = jnum * pack_width; + if (THREE) { + const int trip = jnum * pack_width; for (int jj = 0; jj < trip; jj+=pack_width) { const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; } else ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; } else - #endif + 
#endif if (which) jlist[jj] = j ^ (which << SBBITS); - } - } else { + } + } else { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma simd - #endif + #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; + if (need_ic && j < 0) { + which = 0; + jlist[jj] = -j - 1; } else ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; + #ifdef _LMP_INTEL_OFFLOAD + if (j >= nlocal) { + if (j == e_nall) + jlist[jj] = nall_offset; + else if (which) + jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); + else jlist[jj]-=ghost_offset; } else - #endif + #endif if (which) jlist[jj] = j ^ (which << SBBITS); } - } - } // for i + } + } // for i } // if molecular #ifdef _LMP_INTEL_OFFLOAD else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { + for (int i = ifrom; i < ito; ++i) { int * _noalias jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; - int jj = 0; - #pragma vector aligned - #pragma simd - for (jj = 0; jj < jnum; jj++) { - if (jlist[jj] >= nlocal) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - } - } - } + int jj = 0; + #pragma vector aligned + #pragma simd + for (jj = 0; jj < jnum; jj++) { + if (jlist[jj] >= nlocal) { + if (jlist[jj] == e_nall) jlist[jj] = nall_offset; + else jlist[jj] -= ghost_offset; + } + } + } } #endif } // end omp @@ -704,9 +704,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, _fix->start_watch(TIME_PACK); _fix->set_neighbor_host_sizes(); buffers->pack_sep_from_single(_fix->host_min_local(), - _fix->host_used_local(), - _fix->host_min_ghost(), - _fix->host_used_ghost()); + _fix->host_used_local(), + _fix->host_min_ghost(), + _fix->host_used_ghost()); _fix->stop_watch(TIME_PACK); } } @@ -732,9 +732,9 @@ void NPairIntel::grow_stencil() _off_map_stencil = stencil; const int * stencil = _off_map_stencil; const int maxstencil = ns->get_maxstencil(); - #pragma offload_transfer target(mic:_cop) \ + #pragma offload_transfer target(mic:_cop) \ in(stencil:length(maxstencil) alloc_if(1) free_if(0)) - } + } } #endif diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h index 51574a252c..55a529b2cb 100644 --- a/src/USER-INTEL/npair_intel.h +++ b/src/USER-INTEL/npair_intel.h @@ -84,8 +84,8 @@ class NPairIntel : public NPair { FixIntel *_fix; template <class flt_t, class acc_t, int, int, int, int, int> - void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, - const int, const int, const int offload_end = 0); + void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, + const int, const int, const int offload_end = 0); #ifdef _LMP_INTEL_OFFLOAD int _cop; diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index cdea9e76c4..07beae1e41 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -55,7 +55,7 @@ PairBuckCoulCutIntel::~PairBuckCoulCutIntel() void PairBuckCoulCutIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), + compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) 
compute<double,double>(eflag, vflag, fix->get_double_buffers(), @@ -70,8 +70,8 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairBuckCoulCutIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -94,13 +94,13 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag, #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - packthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - + int ovflag = 0; if (vflag_fdotr) ovflag = 2; else if (vflag) ovflag = 1; @@ -127,9 +127,9 @@ void PairBuckCoulCutIntel::compute(int eflag, int vflag, template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckCoulCutIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -160,8 +160,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -198,8 +198,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = oecoul = (acc_t)0; @@ -233,20 +233,20 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; - + const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; @@ -262,19 +262,19 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t r = sqrt(rsq); const flt_t r2inv = (flt_t)1.0 / rsq; - - #ifdef INTEL_VMASK + + #ifdef INTEL_VMASK if (rsq < c_cuti[jtype].cut_coulsq) { #endif forcecoul = qqrd2e * qtmp*q[j]/r; - if (EFLAG) + if (EFLAG) ecoul = forcecoul; if (sbindex){ const flt_t factor_coul = special_coul[sbindex]; forcecoul *= factor_coul; if(EFLAG) 
ecoul *= factor_coul; - + } #ifdef INTEL_VMASK } @@ -282,7 +282,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, if (rsq >= c_cuti[jtype].cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; } #endif - + #ifdef INTEL_VMASK if (rsq < c_cuti[jtype].cut_ljsq) { #endif @@ -290,14 +290,14 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, flt_t rexp = exp(-r * c_forcei[jtype].rhoinv); forcebuck = r * rexp * c_forcei[jtype].buck1 - r6inv * c_forcei[jtype].buck2; - if (EFLAG) + if (EFLAG) evdwl = rexp * c_energyi[jtype].a - r6inv * c_energyi[jtype].c - c_energyi[jtype].offset; if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcebuck *= factor_lj; - if (EFLAG) + if (EFLAG) evdwl *= factor_lj; } #ifdef INTEL_VMASK @@ -311,51 +311,51 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, if (rsq < c_cuti[jtype].cutsq) { #endif const flt_t fpair = (forcecoul + forcebuck) * r2inv; - const flt_t fpx = fpair * delx; - fxtmp += fpx; - if (NEWTON_PAIR) f[j].x -= fpx; - const flt_t fpy = fpair * dely; - fytmp += fpy; - if (NEWTON_PAIR) f[j].y -= fpy; - const flt_t fpz = fpair * delz; - fztmp += fpz; - if (NEWTON_PAIR) f[j].z -= fpz; - - - if (EFLAG) { - sevdwl += evdwl; - secoul += ecoul; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - } - if (NEWTON_PAIR == 0) + } + if (NEWTON_PAIR == 0) IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj if (NEWTON_PAIR) { - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - } else { - f[i].x = fxtmp; - f[i].y = fytmp; - f[i].z = fztmp; - } + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; @@ -364,12 +364,12 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -410,7 +410,7 @@ void PairBuckCoulCutIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast<FixIntel *>(modify->fix[ifix]); - + fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -492,9 +492,9 @@ void 
PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -505,12 +505,12 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, c_cut_t * oc_cut = c_cut[0]; if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && - oc_energy != NULL && ospecial_coul != NULL && + oc_energy != NULL && ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(oc_cut: alloc_if(0) free_if(1)) + nocopy(oc_cut: alloc_if(0) free_if(1)) } #endif @@ -534,7 +534,7 @@ void PairBuckCoulCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, c_cut_t * oc_cut = c_cut[0]; int tp1sq = ntypes*ntypes; if (ospecial_lj != NULL && oc_force != NULL && oc_cut != NULL && - oc_energy != NULL && ospecial_coul != NULL && + oc_energy != NULL && ospecial_coul != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.h b/src/USER-INTEL/pair_buck_coul_cut_intel.h index 42a55ac21f..7204323903 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.h +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.h @@ -51,8 +51,8 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut { template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, @@ -75,7 +75,7 @@ class PairBuckCoulCutIntel : public PairBuckCoulCut { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index a9aee1e53e..995e2e8583 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -55,7 +55,7 @@ PairBuckCoulLongIntel::~PairBuckCoulLongIntel() void PairBuckCoulLongIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), + compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute<double,double>(eflag, vflag, fix->get_double_buffers(), @@ -70,8 +70,8 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairBuckCoulLongIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -85,7 +85,7 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, if (_lrt == 0 && ago != 0 && fix->separate_buffers() == 0) { fix->start_watch(TIME_PACK); - + int packthreads; if (nthreads > 
INTEL_HTHREADS) packthreads = nthreads; else packthreads = 1; @@ -94,13 +94,13 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - packthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - + int ovflag = 0; if (vflag_fdotr) ovflag = 2; else if (vflag) ovflag = 1; @@ -127,9 +127,9 @@ void PairBuckCoulLongIntel::compute(int eflag, int vflag, template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -175,8 +175,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -213,7 +213,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ + in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -224,8 +224,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = oecoul = (acc_t)0; @@ -260,24 +260,24 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const flt_t * _noalias const rho_invi = rho_inv + ptr_off; + const flt_t * _noalias const rho_invi = rho_inv + ptr_off; const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (NEWTON_PAIR == 0) - if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag == 1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - int ej = 0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma ivdep @@ -287,33 +287,33 @@ void 
PairBuckCoulLongIntel::eval(const int offload, const int vflag, const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const int jtype = x[j].w; + const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; - + if (rsq < c_forcei[jtype].cutsq) { - trsq[ej]=rsq; - tdelx[ej]=delx; - tdely[ej]=dely; - tdelz[ej]=delz; - tjtype[ej]=jtype; - tj[ej]=jlist[jj]; - ej++; - } - } + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcebuck, evdwl, ecoul; forcecoul = forcebuck = evdwl = ecoul = (flt_t)0.0; - const int j = tj[jj] & NEIGHMASK; + const int j = tj[jj] & NEIGHMASK; const int sbindex = tj[jj] >> SBBITS & 3; - const int jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; const flt_t r = (flt_t)1.0 / sqrt(r2inv); @@ -321,52 +321,52 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, if (!ncoultablebits || rsq <= tabinnersq) { #endif const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; - - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; - - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + #ifdef INTEL_ALLOW_TABLE } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; - - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> 
ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; } } #endif - #ifdef INTEL_VMASK + #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cut_ljsq) { #endif flt_t r6inv = r2inv * r2inv * r2inv; @@ -389,7 +389,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, { forcebuck = (flt_t)0.0; evdwl = (flt_t)0.0; } #endif - const flt_t fpair = (forcecoul + forcebuck) * r2inv; + const flt_t fpair = (forcecoul + forcebuck) * r2inv; const flt_t fpx = fpair * tdelx[jj]; fxtmp += fpx; if (NEWTON_PAIR) f[j].x -= fpx; @@ -400,38 +400,38 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, fztmp += fpz; if (NEWTON_PAIR) f[j].z -= fpz; - if (EFLAG) { + if (EFLAG) { sevdwl += evdwl; - secoul += ecoul; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], - fpx, fpy, fpz); + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - if (NEWTON_PAIR) { - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - } else { - f[i].x = fxtmp; - f[i].y = fytmp; - f[i].z = fztmp; - } - IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; @@ -440,12 +440,12 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -486,7 +486,7 @@ void PairBuckCoulLongIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast<FixIntel *>(modify->fix[ifix]); - + fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -549,7 +549,7 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int j = 0; j < tp1; j++) { if (cutsq[i][j] < cut_ljsq[i][j]) error->all(FLERR, - "Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic"); + 
"Intel variant of lj/buck/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].buck1 = buck1[i][j]; @@ -603,9 +603,9 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -625,10 +625,10 @@ void PairBuckCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(orho_inv: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ + nocopy(orho_inv: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) } #endif diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.h b/src/USER-INTEL/pair_buck_coul_long_intel.h index ec2cdba177..ec37c699c8 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.h +++ b/src/USER-INTEL/pair_buck_coul_long_intel.h @@ -50,8 +50,8 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, @@ -76,7 +76,7 @@ class PairBuckCoulLongIntel : public PairBuckCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index bbfc7225dd..8c63d2e62d 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -48,7 +48,7 @@ PairBuckIntel::~PairBuckIntel() void PairBuckIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) - compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), + compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), force_const_single); else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) compute<double,double>(eflag, vflag, fix->get_double_buffers(), @@ -63,8 +63,8 @@ void PairBuckIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairBuckIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -87,13 +87,13 @@ void PairBuckIntel::compute(int eflag, int vflag, #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - packthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } 
fix->stop_watch(TIME_PACK); } - + int ovflag = 0; if (vflag_fdotr) ovflag = 2; else if (vflag) ovflag = 1; @@ -120,9 +120,9 @@ void PairBuckIntel::compute(int eflag, int vflag, template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairBuckIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -147,8 +147,8 @@ void PairBuckIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -160,7 +160,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, int *overflow = fix->get_off_overflow_flag(); double *timer_compute = fix->off_watch_pair(); // Redeclare as local variables for offload - + if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY); #pragma offload target(mic:_cop) if(offload) \ in(special_lj:length(0) alloc_if(0) free_if(0)) \ @@ -182,8 +182,8 @@ void PairBuckIntel::eval(const int offload, const int vflag, *timer_compute = MIC_Wtime(); #endif - IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, + f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = (acc_t)0; @@ -215,23 +215,23 @@ void PairBuckIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (NEWTON_PAIR == 0) + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { - + flt_t forcebuck, evdwl; forcebuck = evdwl = (flt_t)0.0; @@ -245,7 +245,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, const flt_t rsq = delx * delx + dely * dely + delz * delz; const flt_t r = sqrt(rsq); const flt_t r2inv = (flt_t)1.0 / rsq; - + #ifdef INTEL_VMASK if (rsq < c_forcei[jtype].cutsq) { #endif @@ -257,7 +257,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, #ifndef INTEL_VMASK if (rsq > c_forcei[jtype].cutsq) forcebuck =(flt_t)0.0; - #endif + #endif if (EFLAG) { evdwl = rexp * c_energyi[jtype].a - r6inv * c_energyi[jtype].c - @@ -272,67 +272,67 @@ void PairBuckIntel::eval(const int offload, const int vflag, if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcebuck *= factor_lj; - if (EFLAG) + if (EFLAG) evdwl *= factor_lj; } const flt_t fpair = forcebuck * r2inv; - const flt_t fpx = fpair * delx; - fxtmp += fpx; - if (NEWTON_PAIR) f[j].x -= fpx; - const flt_t fpy = fpair * dely; - fytmp += fpy; - if (NEWTON_PAIR) 
f[j].y -= fpy; - const flt_t fpz = fpair * delz; - fztmp += fpz; - if (NEWTON_PAIR) f[j].z -= fpz; - - if (EFLAG) { - sevdwl += evdwl; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl; - } - } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } #endif } // for jj - if (NEWTON_PAIR) { - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - } else { - f[i].x = fxtmp; - f[i].y = fytmp; - f[i].z = fztmp; - } + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; + if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; ev_global[0] = oevdwl; ev_global[1] = (acc_t)0; } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -371,7 +371,7 @@ void PairBuckIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast<FixIntel *>(modify->fix[ifix]); - + fix->pair_init_check(); #ifdef _LMP_INTEL_OFFLOAD _cop = fix->coprocessor_number(); @@ -442,7 +442,7 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc, /* ---------------------------------------------------------------------- */ template <class flt_t> -void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, +void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, Memory *memory, const int cop) { if ( (ntypes != _ntypes ) ) { @@ -452,8 +452,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; - if (ospecial_lj != NULL && oc_force != NULL && - oc_energy != NULL && + if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: alloc_if(0) free_if(1)) \ @@ -476,8 +476,8 @@ void PairBuckIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, c_force_t * oc_force = c_force[0]; c_energy_t * oc_energy = c_energy[0]; int tp1sq = ntypes*ntypes; - if (ospecial_lj != NULL && oc_force != NULL && - oc_energy != NULL && + if (ospecial_lj != NULL && oc_force != NULL && + oc_energy != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) 
free_if(0)) \ diff --git a/src/USER-INTEL/pair_buck_intel.h b/src/USER-INTEL/pair_buck_intel.h index e699a1611e..ab5e135262 100644 --- a/src/USER-INTEL/pair_buck_intel.h +++ b/src/USER-INTEL/pair_buck_intel.h @@ -50,8 +50,8 @@ private: template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, @@ -59,7 +59,7 @@ private: template <class flt_t> class ForceConst { - + public: typedef struct { flt_t buck1, buck2, rhoinv, cutsq; } c_force_t; typedef struct { flt_t a, c, offset, pad; } c_energy_t; @@ -78,7 +78,7 @@ private: int _ntypes, _cop; Memory *_memory; }; - + ForceConst<float> force_const_single; ForceConst<double> force_const_double; }; diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp index 541f9745cb..b97128bf9f 100644 --- a/src/USER-INTEL/pair_eam_intel.cpp +++ b/src/USER-INTEL/pair_eam_intel.cpp @@ -74,8 +74,8 @@ void PairEAMIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairEAMIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag, vflag); @@ -111,37 +111,37 @@ void PairEAMIntel::compute(int eflag, int vflag, if (_onetype) { if (eflag) { if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { if (eflag) { if (force->newton_pair) { - eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + 
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } @@ -151,8 +151,8 @@ void PairEAMIntel::compute(int eflag, int vflag, template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairEAMIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend) { const int inum = aend - astart; @@ -251,8 +251,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, #endif { int iifrom, iito, tid; - IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, - INTEL_VECTOR_WIDTH); + IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, + INTEL_VECTOR_WIDTH); iifrom += astart; iito += astart; @@ -264,8 +264,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, else foff = 0; double * _noalias const trho = rho + foff; if (NEWTON_PAIR) { - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - memset(trho, 0, nall * sizeof(double)); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + memset(trho, 0, nall * sizeof(double)); } const int toffs = tid * ccache_stride; @@ -280,108 +280,108 @@ void PairEAMIntel::eval(const int offload, const int vflag, int rhor_joff, frho_ioff; if (ONETYPE) { const int ptr_off=_onetype * ntypes + _onetype; - oscale = scale_f[ptr_off]; - int rhor_ioff = istride * _onetype; - rhor_joff = rhor_ioff + _onetype * jstride; - frho_ioff = fstride * _onetype; + oscale = scale_f[ptr_off]; + int rhor_ioff = istride * _onetype; + rhor_joff = rhor_ioff + _onetype * jstride; + frho_ioff = fstride * _onetype; } for (int i = iifrom; i < iito; ++i) { int itype, rhor_ioff; - if (!ONETYPE) { + if (!ONETYPE) { itype = x[i].w; - rhor_ioff = istride * itype; - } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + rhor_ioff = istride * itype; + } + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; - acc_t rhoi = (acc_t)0.0; - int ej = 0; + acc_t rhoi = (acc_t)0.0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma ivdep - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj] & NEIGHMASK; + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const flt_t rsq = delx*delx + dely*dely + delz*delz; + const flt_t rsq = delx*delx + dely*dely + delz*delz; - if (rsq < fcutforcesq) { - trsq[ej]=rsq; - if (!ONETYPE) tjtype[ej]=x[j].w; - tj[ej]=jlist[jj]; - ej++; + if (rsq < fcutforcesq) { + trsq[ej]=rsq; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; } } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma simd reduction(+:rhoi) - #endif + #endif for (int jj = 0; jj < ej; jj++) { - int jtype; - const int j = tj[jj] & NEIGHMASK; - if (!ONETYPE) jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; - flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; - int m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - flt_t ra; - ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + - rhor_spline_e[joff].c) * p + 
rhor_spline_e[joff].d; - rhoi += ra; - if (NEWTON_PAIR) { - if (!ONETYPE) { - const int ioff = jtype * istride + itype * jstride + m; - ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + - rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; - } - trho[j] += ra; - } + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + flt_t p = sqrt(rsq)*frdr + (flt_t)1.0; + int m = static_cast<int> (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + flt_t ra; + ra = ((rhor_spline_e[joff].a*p + rhor_spline_e[joff].b) * p + + rhor_spline_e[joff].c) * p + rhor_spline_e[joff].d; + rhoi += ra; + if (NEWTON_PAIR) { + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + ra = ((rhor_spline_e[ioff].a*p + rhor_spline_e[ioff].b)*p + + rhor_spline_e[ioff].c) * p + rhor_spline_e[ioff].d; + } + trho[j] += ra; + } } // for jj - if (NEWTON_PAIR) - trho[i] += rhoi; - else - trho[i] = rhoi; + if (NEWTON_PAIR) + trho[i] += rhoi; + else + trho[i] = rhoi; } // for i #if defined(_OPENMP) if (NEWTON_PAIR && nthreads > 1) { #pragma omp barrier - if (tid == 0) { + if (tid == 0) { const int rcount = nall; - if (nthreads == 2) { + if (nthreads == 2) { double *trho2 = rho + nmax; - #pragma vector aligned + #pragma vector aligned #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trho2[n]; + for (int n = 0; n < rcount; n++) + rho[n] += trho2[n]; } else if (nthreads == 4) { double *trho2 = rho + nmax; - double *trho3 = trho2 + nmax; - double *trho4 = trho3 + nmax; - #pragma vector aligned - #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trho2[n] + trho3[n] + trho4[n]; + double *trho3 = trho2 + nmax; + double *trho4 = trho3 + nmax; + #pragma vector aligned + #pragma simd + for (int n = 0; n < rcount; n++) + rho[n] += trho2[n] + trho3[n] + trho4[n]; } else { - double *trhon = rho + nmax; - for (int t = 1; t < nthreads; t++) { - #pragma vector aligned - #pragma simd - for (int n = 0; n < rcount; n++) - rho[n] += trhon[n]; - trhon += nmax; + double *trhon = rho + nmax; + for (int t = 1; t < nthreads; t++) { + #pragma vector aligned + #pragma simd + for (int n = 0; n < rcount; n++) + rho[n] += trhon[n]; + trhon += nmax; } } } @@ -411,32 +411,32 @@ void PairEAMIntel::eval(const int offload, const int vflag, #pragma simd reduction(+:tevdwl) #endif for (int i = iifrom; i < iito; ++i) { - int itype; - if (!ONETYPE) itype = x[i].w; - flt_t p = rho[i]*frdrho + (flt_t)1.0; - int m = static_cast<int> (p); - m = MAX(1,MIN(m,nrho-1)); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) frho_ioff = itype * fstride; - const int ioff = frho_ioff + m; - fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + - frho_spline_f[ioff].c; - if (EFLAG) { - flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + - frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d; - if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax); - if (!ONETYPE) { - const int ptr_off=itype*ntypes + itype; - oscale = scale_f[ptr_off]; - } - phi *= oscale; - tevdwl += phi; - if (eatom) f[i].w += phi; - } + int itype; + if (!ONETYPE) itype = x[i].w; + flt_t p = rho[i]*frdrho + (flt_t)1.0; + int m = static_cast<int> (p); + m = MAX(1,MIN(m,nrho-1)); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) frho_ioff = itype * fstride; + const int ioff = frho_ioff + m; + fp_f[i] = (frho_spline_f[ioff].a*p + frho_spline_f[ioff].b)*p + + frho_spline_f[ioff].c; + if 
(EFLAG) { + flt_t phi = ((frho_spline_e[ioff].a*p + frho_spline_e[ioff].b)*p + + frho_spline_e[ioff].c)*p + frho_spline_e[ioff].d; + if (rho[i] > frhomax) phi += fp_f[i] * (rho[i]-frhomax); + if (!ONETYPE) { + const int ptr_off=itype*ntypes + itype; + oscale = scale_f[ptr_off]; + } + phi *= oscale; + tevdwl += phi; + if (eatom) f[i].w += phi; + } } if (EFLAG) oevdwl += tevdwl; - + // communicate derivative of embedding function @@ -447,7 +447,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (tid == 0) comm->forward_comm_pair(this); if (NEWTON_PAIR) - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); #if defined(_OPENMP) #pragma omp barrier @@ -458,94 +458,94 @@ void PairEAMIntel::eval(const int offload, const int vflag, for (int i = iifrom; i < iito; ++i) { int itype, rhor_ioff; - const flt_t * _noalias scale_fi; - if (!ONETYPE) { - itype = x[i].w; - rhor_ioff = istride * itype; - scale_fi = scale_f + itype*ntypes; - } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - acc_t fxtmp, fytmp, fztmp, fwtmp; - acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; - - const flt_t xtmp = x[i].x; - const flt_t ytmp = x[i].y; - const flt_t ztmp = x[i].z; - fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = (acc_t)0; + const flt_t * _noalias scale_fi; + if (!ONETYPE) { + itype = x[i].w; + rhor_ioff = istride * itype; + scale_fi = scale_f + itype*ntypes; + } + const int * _noalias const jlist = firstneigh + cnumneigh[i]; + const int jnum = numneigh[i]; + + acc_t fxtmp, fytmp, fztmp, fwtmp; + acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; + + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + fxtmp = fytmp = fztmp = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; if (NEWTON_PAIR == 0) - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - int ej = 0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma ivdep - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj] & NEIGHMASK; + #endif + for (int jj = 0; jj < jnum; jj++) { + const int j = jlist[jj] & NEIGHMASK; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const flt_t rsq = delx*delx + dely*dely + delz*delz; - - if (rsq < fcutforcesq) { - trsq[ej]=rsq; - tdelx[ej]=delx; - tdely[ej]=dely; - tdelz[ej]=delz; - if (!ONETYPE) tjtype[ej]=x[j].w; - tj[ej]=jlist[jj]; - ej++; - } - } + const flt_t rsq = delx*delx + dely*dely + delz*delz; + + if (rsq < fcutforcesq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + if (!ONETYPE) tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { - int jtype; - const int j = tj[jj] & NEIGHMASK; - if (!ONETYPE) jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; - const flt_t r = sqrt(rsq); - flt_t p = r*frdr + (flt_t)1.0; - int m = static_cast<int> (p); - m = MIN(m,nr-1); - p -= m; - p = MIN(p,(flt_t)1.0); - if (!ONETYPE) - rhor_joff = rhor_ioff + jtype * jstride; - const int joff = rhor_joff + m; - const flt_t rhojp = (rhor_spline_f[joff].a*p + - rhor_spline_f[joff].b)*p + - rhor_spline_f[joff].c; - flt_t rhoip; - if (!ONETYPE) { - 
const int ioff = jtype * istride + itype * jstride + m; - rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + - rhor_spline_f[ioff].c; - } else - rhoip = rhojp; - const flt_t z2p = (z2r_spline_t[joff].a*p + - z2r_spline_t[joff].b)*p + - z2r_spline_t[joff].c; - const flt_t z2 = ((z2r_spline_t[joff].d*p + - z2r_spline_t[joff].e)*p + - z2r_spline_t[joff].f)*p + - z2r_spline_t[joff].g; - - const flt_t recip = (flt_t)1.0/r; - const flt_t phi = z2*recip; - const flt_t phip = z2p*recip - phi*recip; - const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; - if (!ONETYPE) - oscale = scale_fi[jtype]; - const flt_t fpair = -oscale*psip*recip; - + int jtype; + const int j = tj[jj] & NEIGHMASK; + if (!ONETYPE) jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; + const flt_t r = sqrt(rsq); + flt_t p = r*frdr + (flt_t)1.0; + int m = static_cast<int> (p); + m = MIN(m,nr-1); + p -= m; + p = MIN(p,(flt_t)1.0); + if (!ONETYPE) + rhor_joff = rhor_ioff + jtype * jstride; + const int joff = rhor_joff + m; + const flt_t rhojp = (rhor_spline_f[joff].a*p + + rhor_spline_f[joff].b)*p + + rhor_spline_f[joff].c; + flt_t rhoip; + if (!ONETYPE) { + const int ioff = jtype * istride + itype * jstride + m; + rhoip = (rhor_spline_f[ioff].a*p + rhor_spline_f[ioff].b)*p + + rhor_spline_f[ioff].c; + } else + rhoip = rhojp; + const flt_t z2p = (z2r_spline_t[joff].a*p + + z2r_spline_t[joff].b)*p + + z2r_spline_t[joff].c; + const flt_t z2 = ((z2r_spline_t[joff].d*p + + z2r_spline_t[joff].e)*p + + z2r_spline_t[joff].f)*p + + z2r_spline_t[joff].g; + + const flt_t recip = (flt_t)1.0/r; + const flt_t phi = z2*recip; + const flt_t phip = z2p*recip - phi*recip; + const flt_t psip = fp_f[i]*rhojp + fp_f[j]*rhoip + phip; + if (!ONETYPE) + oscale = scale_fi[jtype]; + const flt_t fpair = -oscale*psip*recip; + const flt_t fpx = fpair * tdelx[jj]; fxtmp += fpx; if (NEWTON_PAIR) f[j].x -= fpx; @@ -556,20 +556,20 @@ void PairEAMIntel::eval(const int offload, const int vflag, fztmp += fpz; if (NEWTON_PAIR) f[j].z -= fpz; - if (EFLAG) { - const flt_t evdwl = oscale*phi; - sevdwl += evdwl; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl; - } - } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], - fpx, fpy, fpz); + if (EFLAG) { + const flt_t evdwl = oscale*phi; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - if (NEWTON_PAIR) { + if (NEWTON_PAIR) { f[i].x += fxtmp; f[i].y += fytmp; f[i].z += fztmp; @@ -577,19 +577,19 @@ void PairEAMIntel::eval(const int offload, const int vflag, f[i].x = fxtmp; f[i].y = fytmp; f[i].z = fztmp; - sevdwl *= (acc_t)0.5; + sevdwl *= (acc_t)0.5; } - + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for i IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } /// omp IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { ev_global[0] = oevdwl; @@ -597,13 +597,13 @@ void PairEAMIntel::eval(const int offload, const int vflag, } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; 
- } + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -665,7 +665,7 @@ void PairEAMIntel::init_style() template <class flt_t, class acc_t> void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> *buffers) + IntelBuffers<flt_t,acc_t> *buffers) { int off_ccache = 0; #ifdef _LMP_INTEL_OFFLOAD @@ -684,14 +684,14 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { - cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + cut = init_one(i,j); + cutneigh = cut + neighbor->skin; + cutsq[i][j] = cutsq[j][i] = cut*cut; + cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; } } } - + _onetype=-1; double oldscale=-1; for (int i = 1; i < tp1; i++) { @@ -709,32 +709,32 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, for (int j = 1; j < tp1; j++) { fc.scale_f[i][j] = scale[i][j]; if (type2rhor[i][j] >= 0) { - const int joff = ioff + j * fc.rhor_jstride(); - for (int k = 0; k < nr + 1; k++) { - if (type2rhor[j][i] != type2rhor[i][j]) - _onetype = 0; + const int joff = ioff + j * fc.rhor_jstride(); + for (int k = 0; k < nr + 1; k++) { + if (type2rhor[j][i] != type2rhor[i][j]) + _onetype = 0; else if (_onetype < 0) - _onetype = i; + _onetype = i; if (oldscale < 0) oldscale = scale[i][j]; else - if (oldscale != scale[i][j]) - _onetype = 0; - fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0]; - fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1]; - fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2]; - fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3]; - fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; - fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; - fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; - fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; - fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; - fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; - fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; - fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; - fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; - fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; - } + if (oldscale != scale[i][j]) + _onetype = 0; + fc.rhor_spline_f[joff + k].a=rhor_spline[type2rhor[j][i]][k][0]; + fc.rhor_spline_f[joff + k].b=rhor_spline[type2rhor[j][i]][k][1]; + fc.rhor_spline_f[joff + k].c=rhor_spline[type2rhor[j][i]][k][2]; + fc.rhor_spline_e[joff + k].a=rhor_spline[type2rhor[j][i]][k][3]; + fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; + fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; + fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; + fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; + fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; + fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; + fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; + fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; + fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; + 
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; + } } } } @@ -745,9 +745,9 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, - const int nr, const int nrho, - Memory *memory, - const int cop) { + const int nr, const int nrho, + Memory *memory, + const int cop) { if (ntypes != _ntypes || nr > _nr || nrho > _nrho) { if (_ntypes > 0) { _memory->destroy(rhor_spline_f); @@ -780,7 +780,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, /* ---------------------------------------------------------------------- */ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, - int pbc_flag, int *pbc) + int pbc_flag, int *pbc) { if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) return pack_forward_comm(n, list, buf, fp); @@ -802,7 +802,7 @@ void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf) template<class flt_t> int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, - flt_t *fp_f) + flt_t *fp_f) { int i,j,m; @@ -817,8 +817,8 @@ int PairEAMIntel::pack_forward_comm(int n, int *list, double *buf, /* ---------------------------------------------------------------------- */ template<class flt_t> -void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, - flt_t *fp_f) +void PairEAMIntel::unpack_forward_comm(int n, int first, double *buf, + flt_t *fp_f) { int i,m,last; diff --git a/src/USER-INTEL/pair_eam_intel.h b/src/USER-INTEL/pair_eam_intel.h index c7bb3b7bd0..f34e740bda 100644 --- a/src/USER-INTEL/pair_eam_intel.h +++ b/src/USER-INTEL/pair_eam_intel.h @@ -53,8 +53,8 @@ class PairEAMIntel : public PairEAM { template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); - template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, - class acc_t> + template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, + class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, const int astart, const int aend); @@ -79,8 +79,8 @@ class PairEAMIntel : public PairEAM { ForceConst() : _ntypes(0), _nr(0) {} ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); } - void set_ntypes(const int ntypes, const int nr, const int nrho, - Memory *memory, const int cop); + void set_ntypes(const int ntypes, const int nr, const int nrho, + Memory *memory, const int cop); inline int rhor_jstride() const { return _nr; } inline int rhor_istride() const { return _nr * _ntypes; } inline int frho_stride() const { return _nrho; } diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index af96fcbb79..ed7dd424af 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -98,17 +98,17 @@ void PairGayBerneIntel::compute(int eflag, int vflag, { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, packthreads, - sizeof(ATOM_T)); + sizeof(ATOM_T)); if (ago != 0) buffers->thr_pack(ifrom,ito,ago); for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } } } quat[nall].w = (flt_t)1.0; @@ -161,65 
+161,65 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, if (fix->separate_buffers()) { fix->start_watch(TIME_PACK); if (offload) { - #pragma omp parallel + #pragma omp parallel { int ifrom, ito, tid; - int nthreads = comm->nthreads; - IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, - nthreads, sizeof(ATOM_T)); - if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); - for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } - } - int nghost = nall - nlocal; - if (nghost) { - IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, - nthreads, sizeof(ATOM_T)); - int offset = 0; - ifrom += nlocal; - ito += nlocal; - if (ago != 0) { - offset = fix->offload_min_ghost() - nlocal; - buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); - } - for (int i = ifrom; i < ito; i++) { - int qi = ellipsoid[i + offset]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } - } - } + int nthreads = comm->nthreads; + IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, + nthreads, sizeof(ATOM_T)); + if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0); + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + int nghost = nall - nlocal; + if (nghost) { + IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, + nthreads, sizeof(ATOM_T)); + int offset = 0; + ifrom += nlocal; + ito += nlocal; + if (ago != 0) { + offset = fix->offload_min_ghost() - nlocal; + buffers->thr_pack_cop(ifrom, ito, offset, ago == 1); + } + for (int i = ifrom; i < ito; i++) { + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } + } + } } } else { if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); for (int i = fix->host_min_local(); i < nlocal; i++) { - int qi = ellipsoid[i]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } } int offset = fix->host_min_ghost() - nlocal; if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset); for (int i = nlocal; i < nall; i++) { - int qi = ellipsoid[i + offset]; - if (qi > -1) { - quat[i].w = bonus[qi].quat[0]; - quat[i].i = bonus[qi].quat[1]; - quat[i].j = bonus[qi].quat[2]; - quat[i].k = bonus[qi].quat[3]; - } + int qi = ellipsoid[i + offset]; + if (qi > -1) { + quat[i].w = bonus[qi].quat[0]; + quat[i].i = bonus[qi].quat[1]; + quat[i].j = bonus[qi].quat[2]; + quat[i].k = bonus[qi].quat[3]; + } } } fix->stop_watch(TIME_PACK); @@ -252,8 +252,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ 
-303,26 +303,26 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, #ifdef _LMP_INTEL_OFFLOAD if (separate_flag) { if (separate_flag < 3) { - int all_local = nlocal; - int ghost_min = overflow[LMP_GHOST_MIN]; - nlocal = overflow[LMP_LOCAL_MAX] + 1; - int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; - if (nghost < 0) nghost = 0; - nall = nlocal + nghost; - separate_flag--; - int flength; - if (NEWTON_PAIR) flength = nall; - else flength = nlocal; - IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), - separate_flag); - if (nghost) { - if (nlocal < all_local || ghost_min > all_local) { - memmove(x + nlocal, x + ghost_min, - (nall - nlocal) * sizeof(ATOM_T)); - memmove(quat + nlocal, quat + ghost_min, - (nall - nlocal) * sizeof(QUAT_T)); - } - } + int all_local = nlocal; + int ghost_min = overflow[LMP_GHOST_MIN]; + nlocal = overflow[LMP_LOCAL_MAX] + 1; + int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; + if (nghost < 0) nghost = 0; + nall = nlocal + nghost; + separate_flag--; + int flength; + if (NEWTON_PAIR) flength = nall; + else flength = nlocal; + IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), + separate_flag); + if (nghost) { + if (nlocal < all_local || ghost_min > all_local) { + memmove(x + nlocal, x + ghost_min, + (nall - nlocal) * sizeof(ATOM_T)); + memmove(quat + nlocal, quat + ghost_min, + (nall - nlocal) * sizeof(QUAT_T)); + } + } } x[nall].x = (flt_t)INTEL_BIGP; x[nall].y = (flt_t)INTEL_BIGP; @@ -395,17 +395,17 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0; - if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; - if (NEWTON_PAIR == 0) + if (EFLAG) fwtmp = sevdwl = (acc_t)0.0; + if (NEWTON_PAIR == 0) if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0; bool multiple_forms = false; int packed_j = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma ivdep - #endif - for (int jj = 0; jj < jnum; jj++) { + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { int jm = jlist[jj]; int j = jm & NEIGHMASK; const int jtype = x[j].w; @@ -428,27 +428,27 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } else multiple_forms = true; } - const int edge = (packed_j % pad_width); - if (edge) { - const int packed_end = packed_j + (pad_width - edge); + const int edge = (packed_j % pad_width); + if (edge) { + const int packed_end = packed_j + (pad_width - edge); #if defined(LMP_SIMD_COMPILER) #pragma loop_count min=1, max=15, avg=8 #endif - for ( ; packed_j < packed_end; packed_j++) - jlist_form[packed_j] = nall; - } - + for ( ; packed_j < packed_end; packed_j++) + jlist_form[packed_j] = nall; + } + // ------------------------------------------------------------- - #ifdef INTEL_V512 - __assume(packed_j % INTEL_VECTOR_WIDTH == 0); - __assume(packed_j % 8 == 0); - __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); - #endif + #ifdef INTEL_V512 + __assume(packed_j % INTEL_VECTOR_WIDTH == 0); + __assume(packed_j % 8 == 0); + __assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0); + #endif #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ - sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) + #pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \ + sevdwl,sv0,sv1,sv2,sv3,sv4,sv5) #endif for (int jj = 0; jj < packed_j; jj++) { flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8; @@ -458,15 +458,15 @@ void 
PairGayBerneIntel::eval(const int offload, const int vflag, flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2; flt_t rtor_0, rtor_1, rtor_2; - const int sbindex = jlist_form[jj] >> SBBITS & 3; - const int j = jlist_form[jj] & NEIGHMASK; + const int sbindex = jlist_form[jj] >> SBBITS & 3; + const int j = jlist_form[jj] & NEIGHMASK; flt_t factor_lj = special_lj[sbindex]; const int jtype = jtype_form[jj]; - const flt_t sigma = ijci[jtype].sigma; - const flt_t epsilon = ijci[jtype].epsilon; - const flt_t shape2_0 = ic[jtype].shape2[0]; - const flt_t shape2_1 = ic[jtype].shape2[1]; - const flt_t shape2_2 = ic[jtype].shape2[2]; + const flt_t sigma = ijci[jtype].sigma; + const flt_t epsilon = ijci[jtype].epsilon; + const flt_t shape2_0 = ic[jtype].shape2[0]; + const flt_t shape2_1 = ic[jtype].shape2[1]; + const flt_t shape2_2 = ic[jtype].shape2[2]; flt_t one_eng, evdwl; ME_quat_to_mat_trans(quat[j], a2); @@ -488,7 +488,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ME_plus3(g1, g2, g12); flt_t kappa_0, kappa_1, kappa_2; ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj], - kappa, ierror); + kappa, ierror); // tempv = G12^-1*r12hat @@ -520,7 +520,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, flt_t iota_0, iota_1, iota_2; ME_plus3(b1, b2, b12); ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj], - iota, ierror); + iota, ierror); // tempv = G12^-1*r12hat @@ -534,7 +534,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, // compute dUr/dr temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / - sigma; + sigma; temp1 = temp1 * (flt_t)24.0 * epsilon; flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5; flt_t dUr_0, dUr_1, dUr_2; @@ -548,8 +548,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, flt_t dchi_0, dchi_1, dchi_2; temp1 = ME_dot3(iota, r12hat); - temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * - std::pow(chi, (mu - (flt_t)1.0) / mu); + temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * + std::pow(chi, (mu - (flt_t)1.0) / mu); dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0); dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1); dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2); @@ -663,36 +663,36 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, temp3 = chi * eta; ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * - (flt_t)-1.0; + (flt_t)-1.0; ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * - (flt_t)-1.0; + (flt_t)-1.0; ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * - (flt_t)-1.0; + (flt_t)-1.0; if (NEWTON_PAIR) { rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * - (flt_t)-1.0; + (flt_t)-1.0; rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * - (flt_t)-1.0; + (flt_t)-1.0; rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * - (flt_t)-1.0; + (flt_t)-1.0; } one_eng = temp1 * chi; - #ifndef INTEL_VMASK - if (jlist_form[jj] == nall) { - one_eng = (flt_t)0.0; - fforce_0 = 0.0; - fforce_1 = 0.0; - fforce_2 = 0.0; - ttor_0 = 0.0; - ttor_1 = 0.0; - ttor_2 = 0.0; - rtor_0 = 0.0; - rtor_1 = 0.0; - rtor_2 = 0.0; - } - #endif + #ifndef INTEL_VMASK + if (jlist_form[jj] == nall) { + one_eng = (flt_t)0.0; + fforce_0 = 0.0; + fforce_1 = 0.0; + fforce_2 = 0.0; + ttor_0 = 0.0; + ttor_1 = 0.0; + ttor_2 = 0.0; + rtor_0 = 0.0; + rtor_1 = 0.0; + rtor_2 = 0.0; + } + #endif fforce_0 *= factor_lj; fforce_1 *= factor_lj; @@ -701,53 +701,53 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ttor_1 *= factor_lj; 
ttor_2 *= factor_lj; - #ifdef INTEL_VMASK - if (jlist_form[jj] < nall) { - #endif - fxtmp += fforce_0; - fytmp += fforce_1; - fztmp += fforce_2; - t1tmp += ttor_0; - t2tmp += ttor_1; - t3tmp += ttor_2; - - if (NEWTON_PAIR) { - rtor_0 *= factor_lj; - rtor_1 *= factor_lj; - rtor_2 *= factor_lj; - int jp = j * 2; - f[jp].x -= fforce_0; - f[jp].y -= fforce_1; - f[jp].z -= fforce_2; - jp++; - f[jp].x += rtor_0; - f[jp].y += rtor_1; - f[jp].z += rtor_2; - } - - if (EFLAG) { - evdwl = factor_lj * one_eng; - sevdwl += evdwl; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl; - if (NEWTON_PAIR) - f[j*2].w += (flt_t)0.5 * evdwl; - } - } - - if (NEWTON_PAIR == 0) { - if (vflag == 1) { - sv0 += delx_form[jj] * fforce_0; - sv1 += dely_form[jj] * fforce_1; - sv2 += delz_form[jj] * fforce_2; - sv3 += delx_form[jj] * fforce_1; - sv4 += delx_form[jj] * fforce_2; - sv5 += dely_form[jj] * fforce_2; - } + #ifdef INTEL_VMASK + if (jlist_form[jj] < nall) { + #endif + fxtmp += fforce_0; + fytmp += fforce_1; + fztmp += fforce_2; + t1tmp += ttor_0; + t2tmp += ttor_1; + t3tmp += ttor_2; + + if (NEWTON_PAIR) { + rtor_0 *= factor_lj; + rtor_1 *= factor_lj; + rtor_2 *= factor_lj; + int jp = j * 2; + f[jp].x -= fforce_0; + f[jp].y -= fforce_1; + f[jp].z -= fforce_2; + jp++; + f[jp].x += rtor_0; + f[jp].y += rtor_1; + f[jp].z += rtor_2; + } + + if (EFLAG) { + evdwl = factor_lj * one_eng; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + if (NEWTON_PAIR) + f[j*2].w += (flt_t)0.5 * evdwl; + } + } + + if (NEWTON_PAIR == 0) { + if (vflag == 1) { + sv0 += delx_form[jj] * fforce_0; + sv1 += dely_form[jj] * fforce_1; + sv2 += delz_form[jj] * fforce_2; + sv3 += delx_form[jj] * fforce_1; + sv4 += delx_form[jj] * fforce_2; + sv5 += dely_form[jj] * fforce_2; + } } // EVFLAG - #ifdef INTEL_VMASK - } - #endif + #ifdef INTEL_VMASK + } + #endif } // for jj // ------------------------------------------------------------- @@ -756,29 +756,29 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ierror = 2; int ip = i * 2; - if (NEWTON_PAIR) { - f[ip].x += fxtmp; - f[ip].y += fytmp; - f[ip].z += fztmp; - ip++; - f[ip].x += t1tmp; - f[ip].y += t2tmp; - f[ip].z += t3tmp; - } else { - f[ip].x = fxtmp; - f[ip].y = fytmp; - f[ip].z = fztmp; - ip++; - f[ip].x = t1tmp; - f[ip].y = t2tmp; - f[ip].z = t3tmp; - } - - if (EFLAG) { - oevdwl += sevdwl; - if (eatom) f[i * 2].w += fwtmp; - } - if (NEWTON_PAIR == 0) { + if (NEWTON_PAIR) { + f[ip].x += fxtmp; + f[ip].y += fytmp; + f[ip].z += fztmp; + ip++; + f[ip].x += t1tmp; + f[ip].y += t2tmp; + f[ip].z += t3tmp; + } else { + f[ip].x = fxtmp; + f[ip].y = fytmp; + f[ip].z = fztmp; + ip++; + f[ip].x = t1tmp; + f[ip].y = t2tmp; + f[ip].z = t3tmp; + } + + if (EFLAG) { + oevdwl += sevdwl; + if (eatom) f[i * 2].w += fwtmp; + } + if (NEWTON_PAIR == 0) { if (vflag == 1) { ov0 += sv0; ov1 += sv1; @@ -792,30 +792,30 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, int o_range; if (NEWTON_PAIR) { o_range = nall; - if (offload == 0) o_range -= minlocal; - IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, + if (offload == 0) o_range -= minlocal; + IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, sizeof(FORCE_T)); - const int sto = iito * 8; - const int fst4 = f_stride * 4; + const int sto = iito * 8; + const int fst4 = f_stride * 4; #if defined(_OPENMP) #pragma omp barrier #endif - acc_t *f_scalar = &f_start[0].x; + acc_t *f_scalar = &f_start[0].x; acc_t *f_scalar2 = f_scalar + fst4; - for (int t = 1; t < nthreads; t++) { + for (int t = 1; t < 
nthreads; t++) { #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd + #pragma vector aligned + #pragma simd #endif - for (int n = iifrom * 8; n < sto; n++) - f_scalar[n] += f_scalar2[n]; - f_scalar2 += fst4; + for (int n = iifrom * 8; n < sto; n++) + f_scalar[n] += f_scalar2[n]; + f_scalar2 += fst4; } if (vflag==2) { - const ATOM_T * _noalias const xo = x + minlocal; + const ATOM_T * _noalias const xo = x + minlocal; #if defined(LMP_SIMD_COMPILER) - #pragma novector + #pragma novector #endif for (int n = iifrom; n < iito; n++) { const int nt2 = n * 2; @@ -826,7 +826,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ov4 += f_start[nt2].z * xo[n].x; ov5 += f_start[nt2].z * xo[n].y; } - } + } } if (ierror) @@ -840,12 +840,12 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)-0.5; - ov1 *= (acc_t)-0.5; - ov2 *= (acc_t)-0.5; - ov3 *= (acc_t)-0.5; - ov4 *= (acc_t)-0.5; - ov5 *= (acc_t)-0.5; + ov0 *= (acc_t)-0.5; + ov1 *= (acc_t)-0.5; + ov2 *= (acc_t)-0.5; + ov3 *= (acc_t)-0.5; + ov4 *= (acc_t)-0.5; + ov5 *= (acc_t)-0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -982,7 +982,7 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, const int one_length, const int nthreads, Memory *memory, - const int cop) { + const int cop) { if (ntypes != _ntypes) { if (_ntypes > 0) { fc_packed3 *oic = ic; @@ -999,9 +999,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, int * ojlist_form = jlist_form[0]; if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && - orsq_form != NULL && odelx_form != NULL && odely_form != NULL && - odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && - _cop >= 0) { + orsq_form != NULL && odelx_form != NULL && odely_form != NULL && + odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL && + _cop >= 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \ nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \ @@ -1033,14 +1033,14 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, memory->create(jlist_form, nthreads, one_length, "jlist_form"); for (int zn = 0; zn < nthreads; zn++) - for (int zo = 0; zo < one_length; zo++) { - rsq_form[zn][zo] = 10.0; - delx_form[zn][zo] = 10.0; - dely_form[zn][zo] = 10.0; - delz_form[zn][zo] = 10.0; - jtype_form[zn][zo] = 1; - jlist_form[zn][zo] = 0; - } + for (int zo = 0; zo < one_length; zo++) { + rsq_form[zn][zo] = 10.0; + delx_form[zn][zo] = 10.0; + dely_form[zn][zo] = 10.0; + delz_form[zn][zo] = 10.0; + jtype_form[zn][zo] = 1; + jlist_form[zn][zo] = 0; + } #ifdef _LMP_INTEL_OFFLOAD flt_t * ospecial_lj = special_lj; @@ -1057,9 +1057,9 @@ void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, int tp1sq = ntypes*ntypes; if (ospecial_lj != NULL && oijc != NULL && olj34 != NULL && - oic != NULL && orsq_form != NULL && odelx_form != NULL && - odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && - ojlist_form !=NULL && cop >= 0) { + oic != NULL && orsq_form != NULL && odelx_form != NULL && + odely_form != NULL && odelz_form != NULL && ojtype_form !=NULL && + ojlist_form !=NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp 
b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 7548b6eea3..fe99525122 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -67,8 +67,8 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -125,9 +125,9 @@ void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag, template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -177,8 +177,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -227,7 +227,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = oecoul = (acc_t)0; @@ -259,7 +259,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tjtype = ccachej + toffs; for (int i = iifrom; i < iito; i += iip) { - // const int i = ilist[ii]; + // const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; @@ -270,175 +270,175 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (NEWTON_PAIR == 0) - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - int ej = 0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma ivdep + #pragma vector aligned + #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { const int j = jlist[jj] & NEIGHMASK; - const flt_t delx = xtmp - x[j].x; + const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq < cut_coulsq) { - trsq[ej]=rsq; - tdelx[ej]=delx; - tdely[ej]=dely; - tdelz[ej]=delz; - tjtype[ej]=x[j].w; - tj[ej]=jlist[jj]; - ej++; - } - } + if (rsq < cut_coulsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=x[j].w; + tj[ej]=jlist[jj]; + ej++; + } + } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd 
reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma vector aligned + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; - const int j = tj[jj] & NEIGHMASK; + const int j = tj[jj] & NEIGHMASK; const int sbindex = tj[jj] >> SBBITS & 3; - const int jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; #ifdef INTEL_ALLOW_TABLE if (!ncoultablebits || rsq <= tabinnersq) { #endif const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; - - const flt_t r = (flt_t)1.0 / sqrt(r2inv); - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; - - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t r = (flt_t)1.0 / sqrt(r2inv); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; #ifdef INTEL_ALLOW_TABLE - } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; - - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } + } else { + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } } #endif 
- #ifdef INTEL_VMASK - if (rsq < cut_ljsq) { - #endif + #ifdef INTEL_VMASK + if (rsq < cut_ljsq) { + #endif flt_t r6inv = r2inv * r2inv * r2inv; forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); - #ifdef INTEL_VMASK - if (rsq > cut_lj_innersq) { - #endif + #ifdef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif const flt_t drsq = cut_ljsq - rsq; const flt_t cut2 = (rsq - cut_lj_innersq) * drsq; const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) * inv_denom_lj; const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj; if (EFLAG) { - #ifndef INTEL_VMASK - if (rsq > cut_lj_innersq) { - #endif + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) { + #endif forcelj = forcelj * switch1 + evdwl * switch2; evdwl *= switch1; - #ifndef INTEL_VMASK - } - #endif + #ifndef INTEL_VMASK + } + #endif } else { const flt_t philj = r6inv * (lji[jtype].z*r6inv - lji[jtype].w); - #ifndef INTEL_VMASK - if (rsq > cut_lj_innersq) - #endif + #ifndef INTEL_VMASK + if (rsq > cut_lj_innersq) + #endif forcelj = forcelj * switch1 + philj * switch2; } - #ifdef INTEL_VMASK - } - #endif + #ifdef INTEL_VMASK + } + #endif if (sbindex) { const flt_t factor_lj = special_lj[sbindex]; forcelj *= factor_lj; if (EFLAG) evdwl *= factor_lj; } - #ifdef INTEL_VMASK - } - #else - if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } - #endif - - const flt_t fpair = (forcecoul + forcelj) * r2inv; - const flt_t fpx = fpair * tdelx[jj]; - fxtmp += fpx; - if (NEWTON_PAIR) f[j].x -= fpx; - const flt_t fpy = fpair * tdely[jj]; - fytmp += fpy; - if (NEWTON_PAIR) f[j].y -= fpy; - const flt_t fpz = fpair * tdelz[jj]; - fztmp += fpz; - if (NEWTON_PAIR) f[j].z -= fpz; - - if (EFLAG) { - sevdwl += evdwl; - secoul += ecoul; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - } - } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], - fpx, fpy, fpz); + #ifdef INTEL_VMASK + } + #else + if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + + const flt_t fpair = (forcecoul + forcelj) * r2inv; + const flt_t fpx = fpair * tdelx[jj]; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * tdely[jj]; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * tdelz[jj]; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; + + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + } + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj if (NEWTON_PAIR) { f[i].x += fxtmp; @@ -449,33 +449,33 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, f[i].y = fytmp; f[i].z = fztmp; } - IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { if (NEWTON_PAIR == 0) { - oevdwl *= (acc_t)0.5; - oecoul *= (acc_t)0.5; + 
oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } ev_global[0] = oevdwl; ev_global[1] = oecoul; } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -556,7 +556,7 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, double cut, cutneigh; if (cut_lj > cut_coul) error->all(FLERR, - "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); + "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { @@ -637,7 +637,7 @@ template <class flt_t> void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop) { + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -653,12 +653,12 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && otable != NULL && oetable != NULL && odetable != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL && - cop >= 0) { + cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) } #endif @@ -694,7 +694,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, if (ospecial_lj != NULL && ocutsq != NULL && olj != NULL && otable !=NULL && oetable != NULL && odetable != NULL && octable != NULL && odctable != NULL && ospecial_coul != NULL && - cop >= 0) { + cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \ nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \ diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h index cafc412a91..1b13d78497 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -50,8 +50,8 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { const ForceConst<flt_t> &fc); template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, @@ -75,7 +75,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index 
8a0bed2c01..e9775d6ec5 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -68,8 +68,8 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag) template <class flt_t, class acc_t> void PairLJCutCoulLongIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -92,7 +92,7 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag, { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - packthreads, sizeof(ATOM_T)); + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); @@ -124,9 +124,9 @@ void PairLJCutCoulLongIntel::compute(int eflag, int vflag, template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -171,8 +171,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -208,7 +208,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \ - in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ + in(ccache_stride,f_stride,nlocal,minlocal,separate_flag,offload) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -220,7 +220,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, q); + f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = oecoul = (acc_t)0; @@ -261,18 +261,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int jnum = numneigh[i]; acc_t fxtmp,fytmp,fztmp,fwtmp; - acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; + acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; const flt_t qtmp = q[i]; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; - if (NEWTON_PAIR == 0) - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; - int ej = 0; + int ej = 0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma ivdep @@ -282,91 +282,91 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; - const int 
jtype = x[j].w; + const int jtype = x[j].w; const flt_t rsq = delx * delx + dely * dely + delz * delz; - if (rsq < c_forcei[jtype].cutsq) { - trsq[ej]=rsq; - tdelx[ej]=delx; - tdely[ej]=dely; - tdelz[ej]=delz; - tjtype[ej]=jtype; - tj[ej]=jlist[jj]; - ej++; - } - } + if (rsq < c_forcei[jtype].cutsq) { + trsq[ej]=rsq; + tdelx[ej]=delx; + tdely[ej]=dely; + tdelz[ej]=delz; + tjtype[ej]=jtype; + tj[ej]=jlist[jj]; + ej++; + } + } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \ - sv0, sv1, sv2, sv3, sv4, sv5) + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < ej; jj++) { flt_t forcecoul, forcelj, evdwl, ecoul; forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0; - const int j = tj[jj] & NEIGHMASK; + const int j = tj[jj] & NEIGHMASK; const int sbindex = tj[jj] >> SBBITS & 3; - const int jtype = tjtype[jj]; - const flt_t rsq = trsq[jj]; + const int jtype = tjtype[jj]; + const flt_t rsq = trsq[jj]; const flt_t r2inv = (flt_t)1.0 / rsq; #ifdef INTEL_ALLOW_TABLE - if (!ncoultablebits || rsq <= tabinnersq) { + if (!ncoultablebits || rsq <= tabinnersq) { #endif - const flt_t A1 = 0.254829592; - const flt_t A2 = -0.284496736; - const flt_t A3 = 1.421413741; - const flt_t A4 = -1.453152027; - const flt_t A5 = 1.061405429; - const flt_t EWALD_F = 1.12837917; - const flt_t INV_EWALD_P = 1.0 / 0.3275911; - - const flt_t r = (flt_t)1.0 / sqrt(r2inv); - const flt_t grij = g_ewald * r; - const flt_t expm2 = exp(-grij * grij); - const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); - const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; - const flt_t prefactor = qqrd2e * qtmp * q[j] / r; - forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); - if (EFLAG) ecoul = prefactor * erfc; - - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; + const flt_t A1 = 0.254829592; + const flt_t A2 = -0.284496736; + const flt_t A3 = 1.421413741; + const flt_t A4 = -1.453152027; + const flt_t A5 = 1.061405429; + const flt_t EWALD_F = 1.12837917; + const flt_t INV_EWALD_P = 1.0 / 0.3275911; + + const flt_t r = (flt_t)1.0 / sqrt(r2inv); + const flt_t grij = g_ewald * r; + const flt_t expm2 = exp(-grij * grij); + const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij); + const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + const flt_t prefactor = qqrd2e * qtmp * q[j] / r; + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (EFLAG) ecoul = prefactor * erfc; + + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])* + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; #ifdef INTEL_ALLOW_TABLE } else { - float rsq_lookup = rsq; - const int itable = (__intel_castf32_u32(rsq_lookup) & - ncoulmask) >> ncoulshiftbits; - const flt_t fraction = (rsq_lookup - table[itable].r) * - table[itable].dr; - - const flt_t tablet = table[itable].f + - fraction * table[itable].df; - forcecoul = qtmp * q[j] * tablet; - if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + - fraction * detable[itable]); - if (sbindex) { - const flt_t table2 = ctable[itable] + - fraction * dctable[itable]; - const flt_t prefactor = qtmp * q[j] * table2; - const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * - prefactor; - forcecoul -= adjust; - if (EFLAG) ecoul -= adjust; - } - } + float rsq_lookup = rsq; + const int itable = (__intel_castf32_u32(rsq_lookup) & + ncoulmask) >> ncoulshiftbits; + const flt_t fraction = (rsq_lookup - 
table[itable].r) * + table[itable].dr; + + const flt_t tablet = table[itable].f + + fraction * table[itable].df; + forcecoul = qtmp * q[j] * tablet; + if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] + + fraction * detable[itable]); + if (sbindex) { + const flt_t table2 = ctable[itable] + + fraction * dctable[itable]; + const flt_t prefactor = qtmp * q[j] * table2; + const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) * + prefactor; + forcecoul -= adjust; + if (EFLAG) ecoul -= adjust; + } + } #endif - #ifdef INTEL_VMASK - if (rsq < c_forcei[jtype].cut_ljsq) { - #endif + #ifdef INTEL_VMASK + if (rsq < c_forcei[jtype].cut_ljsq) { + #endif flt_t r6inv = r2inv * r2inv * r2inv; forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv - - c_forcei[jtype].lj2); + c_forcei[jtype].lj2); if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv - c_energyi[jtype].lj4) - c_energyi[jtype].offset; @@ -376,14 +376,14 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, forcelj *= factor_lj; if (EFLAG) evdwl *= factor_lj; } - #ifdef INTEL_VMASK - } - #else - if (rsq > c_forcei[jtype].cut_ljsq) - { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } - #endif - - const flt_t fpair = (forcecoul + forcelj) * r2inv; + #ifdef INTEL_VMASK + } + #else + if (rsq > c_forcei[jtype].cut_ljsq) + { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; } + #endif + + const flt_t fpair = (forcecoul + forcelj) * r2inv; const flt_t fpx = fpair * tdelx[jj]; fxtmp += fpx; if (NEWTON_PAIR) f[j].x -= fpx; @@ -394,58 +394,58 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, fztmp += fpz; if (NEWTON_PAIR) f[j].z -= fpz; - if (EFLAG) { - sevdwl += evdwl; - secoul += ecoul; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; - if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (EFLAG) { + sevdwl += evdwl; + secoul += ecoul; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; + if (NEWTON_PAIR) + f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul; } - } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], - fpx, fpy, fpz); + } + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, tdelx[jj], tdely[jj], tdelz[jj], + fpx, fpy, fpz); } // for jj - if (NEWTON_PAIR) { + if (NEWTON_PAIR) { f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; + f[i].y += fytmp; + f[i].z += fztmp; } else { f[i].x = fxtmp; - f[i].y = fytmp; - f[i].z = fztmp; + f[i].y = fytmp; + f[i].z = fztmp; } - IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + IP_PRE_ev_tally_atomq(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { if (NEWTON_PAIR == 0) { - oevdwl *= (acc_t)0.5; - oecoul *= (acc_t)0.5; + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; } ev_global[0] = oevdwl; ev_global[1] = oecoul; } if (vflag) { if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; - } + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; + } ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -547,8 +547,8 @@ void 
PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < tp1; i++) { for (int j = 0; j < tp1; j++) { if (cutsq[i][j] < cut_ljsq[i][j]) - error->all(FLERR, - "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic"); + error->all(FLERR, + "Intel variant of lj/cut/coul/long expects lj cutoff<=coulombic"); fc.c_force[i][j].cutsq = cutsq[i][j]; fc.c_force[i][j].cut_ljsq = cut_ljsq[i][j]; fc.c_force[i][j].lj1 = lj1[i][j]; @@ -598,9 +598,9 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, - const int ntable, - Memory *memory, - const int cop) { + const int ntable, + Memory *memory, + const int cop) { if ( (ntypes != _ntypes || ntable != _ntable) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -619,9 +619,9 @@ void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, ospecial_coul != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \ - nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ - nocopy(otable: alloc_if(0) free_if(1)) \ - nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) + nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \ + nocopy(otable: alloc_if(0) free_if(1)) \ + nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1)) } #endif diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h index 2b7d87c040..288a6a7bc4 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h @@ -50,8 +50,8 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { const ForceConst<flt_t> &fc); template <int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, @@ -76,7 +76,7 @@ class PairLJCutCoulLongIntel : public PairLJCutCoulLong { ~ForceConst() { set_ntypes(0,0,NULL,_cop); } void set_ntypes(const int ntypes, const int ntable, Memory *memory, - const int cop); + const int cop); private: int _ntypes, _ntable, _cop; diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index 8620646343..4871821842 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -96,37 +96,37 @@ void PairLJCutIntel::compute(int eflag, int vflag, if (_onetype) { if (eflag) { if (force->newton_pair) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,0,0>(1, ovflag, buffers, 
fc, 0, offload_end); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { if (eflag) { if (force->newton_pair) { - eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (force->newton_pair) { - eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); - eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } @@ -161,8 +161,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag, - buffers, offload, fix, separate_flag, - x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -176,7 +176,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, #endif IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, - f_stride, x, 0); + f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = (acc_t)0; @@ -200,23 +200,23 @@ void PairLJCutIntel::eval(const int offload, const int vflag, flt_t cutsq, lj1, lj2, lj3, lj4, offset; if (ONETYPE) { - cutsq = ljc12o[3].cutsq; - lj1 = ljc12o[3].lj1; - lj2 = ljc12o[3].lj2; - lj3 = lj34[3].lj3; - lj4 = lj34[3].lj4; - offset = ljc12o[3].offset; + cutsq = ljc12o[3].cutsq; + lj1 = ljc12o[3].lj1; + lj2 = ljc12o[3].lj2; + lj3 = lj34[3].lj3; + lj4 = lj34[3].lj4; + offset = ljc12o[3].offset; } for (int i = iifrom; i < iito; i += iip) { int itype, ptr_off; const FC_PACKED1_T * _noalias ljc12oi; const FC_PACKED2_T * _noalias lj34i; - if (!ONETYPE) { - itype = x[i].w; + if (!ONETYPE) { + itype = x[i].w; ptr_off = itype * ntypes; ljc12oi = ljc12o + ptr_off; lj34i = lj34 + ptr_off; - } + } const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; @@ -228,113 +228,113 @@ void PairLJCutIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; fxtmp = fytmp = fztmp = (acc_t)0; - if (EFLAG) fwtmp = sevdwl = (acc_t)0; - if (NEWTON_PAIR == 0) - if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (NEWTON_PAIR == 0) + if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned - #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ - sv0, sv1, sv2, sv3, sv4, sv5) + #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ + sv0, sv1, sv2, sv3, sv4, sv5) #endif for (int jj = 0; jj < jnum; jj++) { flt_t forcelj, evdwl; forcelj = evdwl = (flt_t)0.0; - int j, jtype, sbindex; - if (!ONETYPE) { - sbindex = jlist[jj] >> SBBITS & 
3; - j = jlist[jj] & NEIGHMASK; - } else - j = jlist[jj]; + int j, jtype, sbindex; + if (!ONETYPE) { + sbindex = jlist[jj] >> SBBITS & 3; + j = jlist[jj] & NEIGHMASK; + } else + j = jlist[jj]; const flt_t delx = xtmp - x[j].x; const flt_t dely = ytmp - x[j].y; const flt_t delz = ztmp - x[j].z; if (!ONETYPE) { - jtype = x[j].w; + jtype = x[j].w; cutsq = ljc12oi[jtype].cutsq; - } + } const flt_t rsq = delx * delx + dely * dely + delz * delz; #ifdef INTEL_VMASK if (rsq < cutsq) { - #endif + #endif flt_t factor_lj; - if (!ONETYPE) factor_lj = special_lj[sbindex]; + if (!ONETYPE) factor_lj = special_lj[sbindex]; flt_t r2inv = 1.0 / rsq; flt_t r6inv = r2inv * r2inv * r2inv; #ifndef INTEL_VMASK - if (rsq > cutsq) r6inv = (flt_t)0.0; - #endif - if (!ONETYPE) { - lj1 = ljc12oi[jtype].lj1; - lj2 = ljc12oi[jtype].lj2; - } + if (rsq > cutsq) r6inv = (flt_t)0.0; + #endif + if (!ONETYPE) { + lj1 = ljc12oi[jtype].lj1; + lj2 = ljc12oi[jtype].lj2; + } forcelj = r6inv * (lj1 * r6inv - lj2); flt_t fpair; - if (!ONETYPE) - fpair = factor_lj * forcelj * r2inv; - else - fpair = forcelj * r2inv; - - const flt_t fpx = fpair * delx; - fxtmp += fpx; - if (NEWTON_PAIR) f[j].x -= fpx; - const flt_t fpy = fpair * dely; - fytmp += fpy; - if (NEWTON_PAIR) f[j].y -= fpy; - const flt_t fpz = fpair * delz; - fztmp += fpz; - if (NEWTON_PAIR) f[j].z -= fpz; + if (!ONETYPE) + fpair = factor_lj * forcelj * r2inv; + else + fpair = forcelj * r2inv; + + const flt_t fpx = fpair * delx; + fxtmp += fpx; + if (NEWTON_PAIR) f[j].x -= fpx; + const flt_t fpy = fpair * dely; + fytmp += fpy; + if (NEWTON_PAIR) f[j].y -= fpy; + const flt_t fpz = fpair * delz; + fztmp += fpz; + if (NEWTON_PAIR) f[j].z -= fpz; if (EFLAG) { - if (!ONETYPE) { - lj3 = lj34i[jtype].lj3; - lj4 = lj34i[jtype].lj4; - offset = ljc12oi[jtype].offset; - } - evdwl = r6inv * (lj3 * r6inv - lj4); + if (!ONETYPE) { + lj3 = lj34i[jtype].lj3; + lj4 = lj34i[jtype].lj4; + offset = ljc12oi[jtype].offset; + } + evdwl = r6inv * (lj3 * r6inv - lj4); #ifdef INTEL_VMASK - evdwl -= offset; + evdwl -= offset; #else - if (rsq < cutsq) evdwl -= offset; + if (rsq < cutsq) evdwl -= offset; #endif - if (!ONETYPE) evdwl *= factor_lj; - sevdwl += evdwl; - if (eatom) { + if (!ONETYPE) evdwl *= factor_lj; + sevdwl += evdwl; + if (eatom) { fwtmp += (flt_t)0.5 * evdwl; if (NEWTON_PAIR) - f[j].w += (flt_t)0.5 * evdwl; + f[j].w += (flt_t)0.5 * evdwl; } - } + } - if (NEWTON_PAIR == 0) - IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); + if (NEWTON_PAIR == 0) + IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz); #ifdef INTEL_VMASK } // if rsq #endif } // for jj - if (NEWTON_PAIR) { - f[i].x += fxtmp; - f[i].y += fytmp; - f[i].z += fztmp; - } else { - f[i].x = fxtmp; - f[i].y = fytmp; - f[i].z = fztmp; - } - - IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); + if (NEWTON_PAIR) { + f[i].x += fxtmp; + f[i].y += fytmp; + f[i].z += fztmp; + } else { + f[i].x = fxtmp; + f[i].y = fytmp; + f[i].z = fztmp; + } + + IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp); } // for ii IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end omp IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; @@ -343,12 +343,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag, } if (vflag) 
{ if (NEWTON_PAIR == 0) { - ov0 *= (acc_t)0.5; - ov1 *= (acc_t)0.5; - ov2 *= (acc_t)0.5; - ov3 *= (acc_t)0.5; - ov4 *= (acc_t)0.5; - ov5 *= (acc_t)0.5; + ov0 *= (acc_t)0.5; + ov1 *= (acc_t)0.5; + ov2 *= (acc_t)0.5; + ov3 *= (acc_t)0.5; + ov4 *= (acc_t)0.5; + ov5 *= (acc_t)0.5; } ev_global[2] = ov0; ev_global[3] = ov1; @@ -454,7 +454,7 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc, template <class flt_t> void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, Memory *memory, - const int cop) { + const int cop) { if (ntypes != _ntypes) { if (_ntypes > 0) { fc_packed1 *oljc12o = ljc12o[0]; diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp index 99c7045098..86929d41ea 100644 --- a/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.cpp @@ -1,50 +1,50 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: William McDoniel (RWTH Aachen University) -------------------------------------------------------------------------- */ - -#include <math.h> -#include "pair_lj_long_coul_long_intel.h" -#include "atom.h" -#include "comm.h" -#include "force.h" -#include "group.h" -#include "kspace.h" -#include "memory.h" -#include "neighbor.h" -#include "neigh_list.h" -#include "neigh_request.h" -#include "memory.h" -#include "suffix.h" - - -using namespace LAMMPS_NS; - -#define C_FORCE_T typename ForceConst<flt_t>::c_force_t -#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t -#define TABLE_T typename ForceConst<flt_t>::table_t - -PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) : - PairLJLongCoulLong(lmp) -{ - suffix_flag |= Suffix::INTEL; - respa_enable = 0; - cut_respa = NULL; -} - - -PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel() -{ -} +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include <math.h> +#include "pair_lj_long_coul_long_intel.h" +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "group.h" +#include "kspace.h" +#include "memory.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "neigh_request.h" +#include "memory.h" +#include "suffix.h" + + +using namespace LAMMPS_NS; + +#define C_FORCE_T typename ForceConst<flt_t>::c_force_t +#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t +#define TABLE_T typename ForceConst<flt_t>::table_t + +PairLJLongCoulLongIntel::PairLJLongCoulLongIntel(LAMMPS *lmp) : + PairLJLongCoulLong(lmp) +{ + suffix_flag |= Suffix::INTEL; + respa_enable = 0; + cut_respa = NULL; +} + + +PairLJLongCoulLongIntel::~PairLJLongCoulLongIntel() +{ +} diff --git a/src/USER-INTEL/pair_lj_long_coul_long_intel.h b/src/USER-INTEL/pair_lj_long_coul_long_intel.h index 42eef932ec..b7d3504ecd 100644 --- a/src/USER-INTEL/pair_lj_long_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_long_coul_long_intel.h @@ -1,39 +1,39 @@ -/* *- c++ -*- ----------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: William McDoniel (RWTH Aachen University) -------------------------------------------------------------------------- */ - -#ifdef PAIR_CLASS - -PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel) - -#else - -#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H -#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H - -#include "pair_lj_long_coul_long.h" -#include "fix_intel.h" - -namespace LAMMPS_NS { - class PairLJLongCoulLongIntel : public PairLJLongCoulLong { - public: - PairLJLongCoulLongIntel(class LAMMPS *); - virtual ~PairLJLongCoulLongIntel(); - - }; -} -#endif -#endif +/* *- c++ -*- ----------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/long/coul/long/intel,PairLJLongCoulLongIntel) + +#else + +#ifndef LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H +#define LMP_PAIR_LJ_LONG_COUL_LONG_INTEL_H + +#include "pair_lj_long_coul_long.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + class PairLJLongCoulLongIntel : public PairLJLongCoulLong { + public: + PairLJLongCoulLongIntel(class LAMMPS *); + virtual ~PairLJLongCoulLongIntel(); + + }; +} +#endif +#endif diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 835f78664a..7a6b7afd92 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -77,7 +77,7 @@ void PairSWIntel::compute(int eflag, int vflag) { if (fix->precision() == FixIntel::PREC_MODE_MIXED) compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), - force_const_single); + force_const_single); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) compute<double,double>(eflag, vflag, fix->get_double_buffers(), force_const_double); @@ -131,37 +131,37 @@ void PairSWIntel::compute(int eflag, int vflag, if (_onetype) { if (_spq) { if (eflag) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { if (eflag) { - eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } else { if (_spq) { if (eflag) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } else { if (eflag) { - eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } else { - eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,0>(0, 
ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); } } } @@ -174,7 +174,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, const int astart, - const int aend, const int pad_width) + const int aend, const int pad_width) { const int inum = aend - astart; if (inum == 0) return; @@ -278,23 +278,23 @@ void PairSWIntel::eval(const int offload, const int vflag, flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { cutsq = p2[3].cutsq; - cut = p2f[3].cut; - sigma = p2f[3].sigma; - c1 = p2f2[3].c1; - c2 = p2f2[3].c2; - c3 = p2f2[3].c3; - c4 = p2f2[3].c4; - sigma_gamma = p2[3].sigma_gamma; - costheta = p3[7].costheta; - lambda_epsilon = p3[7].lambda_epsilon; - lambda_epsilon2 = p3[7].lambda_epsilon2; - if (SPQ == 0) { + cut = p2f[3].cut; + sigma = p2f[3].sigma; + c1 = p2f2[3].c1; + c2 = p2f2[3].c2; + c3 = p2f2[3].c3; + c4 = p2f2[3].c4; + sigma_gamma = p2[3].sigma_gamma; + costheta = p3[7].costheta; + lambda_epsilon = p3[7].lambda_epsilon; + lambda_epsilon2 = p3[7].lambda_epsilon2; + if (SPQ == 0) { powerp = p2f[3].powerp; - powerq = p2f[3].powerq; + powerq = p2f[3].powerq; } - if (EFLAG) { + if (EFLAG) { c5 = p2e[3].c5; - c6 = p2e[3].c6; + c6 = p2e[3].c6; } } @@ -304,23 +304,23 @@ void PairSWIntel::eval(const int offload, const int vflag, const flt_t ytmp = x[i].y; const flt_t ztmp = x[i].z; - if (!ONETYPE) { + if (!ONETYPE) { itype = x[i].w; - itype_offset = itype * ntypes; - } + itype_offset = itype * ntypes; + } const int * _noalias const jlist = firstneigh + cnumneigh[i]; const int jnum = numneigh[i]; - const int jnumhalf = numneighhalf[i]; + const int jnumhalf = numneighhalf[i]; acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t sevdwl; fxtmp = fytmp = fztmp = (acc_t)0.0; - if (EFLAG) fwtmp = sevdwl = (acc_t)0; + if (EFLAG) fwtmp = sevdwl = (acc_t)0; - int ejnum = 0, ejnumhalf = 0; - #pragma vector aligned - #pragma ivdep + int ejnum = 0, ejnumhalf = 0; + #pragma vector aligned + #pragma ivdep for (int jj = 0; jj < jnum; jj++) { int j = jlist[jj]; j &= NEIGHMASK; @@ -329,115 +329,115 @@ void PairSWIntel::eval(const int offload, const int vflag, const flt_t delz = x[j].z - ztmp; int jtype, ijtype; if (!ONETYPE) { - jtype = x[j].w; - ijtype = itype_offset + jtype; - cutsq = p2[ijtype].cutsq; - } + jtype = x[j].w; + ijtype = itype_offset + jtype; + cutsq = p2[ijtype].cutsq; + } const flt_t rsq1 = delx * delx + dely * dely + delz * delz; if (rsq1 < cutsq) { - tdelx[ejnum] = delx; - tdely[ejnum] = dely; - tdelz[ejnum] = delz; - trsq[ejnum] = rsq1; - tj[ejnum] = j; - if (!ONETYPE) tjtype[ejnum] = jtype; - ejnum++; - if (jj < jnumhalf) ejnumhalf++; - } - } - int ejnum_pad = ejnum; - - while ( (ejnum_pad % pad_width) != 0) { - tdelx[ejnum_pad] = (flt_t)0.0; - tdely[ejnum_pad] = (flt_t)0.0; - tdelz[ejnum_pad] = (flt_t)0.0; - trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; - tj[ejnum_pad] = nall; - if (!ONETYPE) tjtype[ejnum_pad] = 0; - ejnum_pad++; - } - + tdelx[ejnum] = delx; + tdely[ejnum] = dely; + tdelz[ejnum] = delz; + trsq[ejnum] = rsq1; + tj[ejnum] = j; + if (!ONETYPE) tjtype[ejnum] = jtype; + ejnum++; + if (jj < jnumhalf) ejnumhalf++; + } + } + int ejnum_pad = ejnum; + + while ( (ejnum_pad % pad_width) != 0) { + tdelx[ejnum_pad] = (flt_t)0.0; + tdely[ejnum_pad] = (flt_t)0.0; + tdelz[ejnum_pad] = (flt_t)0.0; + 
trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0; + tj[ejnum_pad] = nall; + if (!ONETYPE) tjtype[ejnum_pad] = 0; + ejnum_pad++; + } + #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl) - #endif + #endif for (int jj = 0; jj < ejnum_pad; jj++) { acc_t fjxtmp, fjytmp, fjztmp, fjtmp; fjxtmp = fjytmp = fjztmp = (acc_t)0.0; if (EFLAG) fjtmp = (acc_t)0.0; - int ijtype; + int ijtype; - if (!ONETYPE) ijtype = tjtype[jj] + itype_offset; + if (!ONETYPE) ijtype = tjtype[jj] + itype_offset; const flt_t rsq1 = trsq[jj]; const flt_t rinvsq1 = (flt_t)1.0 / rsq1; const flt_t r1 = (flt_t)1.0/sqrt(rinvsq1); - if (!ONETYPE) cut = p2f[ijtype].cut; + if (!ONETYPE) cut = p2f[ijtype].cut; const flt_t rainv1 = (flt_t)1.0 / (r1 - cut); - - // two-body interactions, skip half of them - flt_t rp, rq; - if (SPQ == 1) { - rp = r1 * r1; - rp *= rp; - rp = (flt_t)1.0 / rp; - rq = (flt_t)1.0; - } else { + + // two-body interactions, skip half of them + flt_t rp, rq; + if (SPQ == 1) { + rp = r1 * r1; + rp *= rp; + rp = (flt_t)1.0 / rp; + rq = (flt_t)1.0; + } else { if (!ONETYPE) { powerp = p2f[ijtype].powerp; - powerq = p2f[ijtype].powerq; + powerq = p2f[ijtype].powerq; } - rp = std::pow(r1, powerp); - rq = std::pow(r1, powerq); - } + rp = std::pow(r1, powerp); + rq = std::pow(r1, powerq); + } - if (!ONETYPE) { + if (!ONETYPE) { sigma = p2f[ijtype].sigma; - c1 = p2f2[ijtype].c1; - c2 = p2f2[ijtype].c2; - c3 = p2f2[ijtype].c3; - c4 = p2f2[ijtype].c4; + c1 = p2f2[ijtype].c1; + c2 = p2f2[ijtype].c2; + c3 = p2f2[ijtype].c3; + c4 = p2f2[ijtype].c4; } - const flt_t rainvsq = rainv1 * rainv1 * r1; - flt_t expsrainv = exp(sigma * rainv1); - if (jj >= ejnumhalf) expsrainv = (flt_t)0.0; - const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * - rainvsq) * expsrainv * rinvsq1; - - const flt_t delx = tdelx[jj]; - const flt_t dely = tdely[jj]; - const flt_t delz = tdelz[jj]; - const flt_t fpx = fpair * delx; - fxtmp -= fpx; - fjxtmp += fpx; - const flt_t fpy = fpair * dely; - fytmp -= fpy; - fjytmp += fpy; - const flt_t fpz = fpair * delz; - fztmp -= fpz; - fjztmp += fpz; - - if (EFLAG) { - flt_t evdwl; - if (!ONETYPE) { - c5 = p2e[ijtype].c5; - c6 = p2e[ijtype].c6; + const flt_t rainvsq = rainv1 * rainv1 * r1; + flt_t expsrainv = exp(sigma * rainv1); + if (jj >= ejnumhalf) expsrainv = (flt_t)0.0; + const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * + rainvsq) * expsrainv * rinvsq1; + + const flt_t delx = tdelx[jj]; + const flt_t dely = tdely[jj]; + const flt_t delz = tdelz[jj]; + const flt_t fpx = fpair * delx; + fxtmp -= fpx; + fjxtmp += fpx; + const flt_t fpy = fpair * dely; + fytmp -= fpy; + fjytmp += fpy; + const flt_t fpz = fpair * delz; + fztmp -= fpz; + fjztmp += fpz; + + if (EFLAG) { + flt_t evdwl; + if (!ONETYPE) { + c5 = p2e[ijtype].c5; + c6 = p2e[ijtype].c6; } - evdwl = (c5 * rp - c6 * rq) * expsrainv; - sevdwl += evdwl; - if (eatom) { - fwtmp += (flt_t)0.5 * evdwl; - fjtmp += (flt_t)0.5 * evdwl; + evdwl = (c5 * rp - c6 * rq) * expsrainv; + sevdwl += evdwl; + if (eatom) { + fwtmp += (flt_t)0.5 * evdwl; + fjtmp += (flt_t)0.5 * evdwl; } - } + } - /*---------------------------------------------*/ + /*---------------------------------------------*/ - int ijkoff; - if (!ONETYPE) { + int ijkoff; + if (!ONETYPE) { sigma_gamma = p2[ijtype].sigma_gamma; - ijkoff = ijtype * ntypes; + ijkoff = ijtype * ntypes; } flt_t gsrainv1 = sigma_gamma * rainv1; @@ -446,15 +446,15 @@ void PairSWIntel::eval(const int offload, const int vflag, 
for (int kk = 0; kk < ejnum; kk++) { int iktype, ijktype; - if (!ONETYPE) { + if (!ONETYPE) { iktype = tjtype[kk]; - ijktype = ijkoff + iktype; - iktype += itype_offset; - cut = p2[iktype].cut; - sigma_gamma = p2[iktype].sigma_gamma; - costheta = p3[ijktype].costheta; - lambda_epsilon = p3[ijktype].lambda_epsilon; - lambda_epsilon2 = p3[ijktype].lambda_epsilon2; + ijktype = ijkoff + iktype; + iktype += itype_offset; + cut = p2[iktype].cut; + sigma_gamma = p2[iktype].sigma_gamma; + costheta = p3[ijktype].costheta; + lambda_epsilon = p3[ijktype].lambda_epsilon; + lambda_epsilon2 = p3[ijktype].lambda_epsilon2; } flt_t delr2[3]; @@ -463,76 +463,76 @@ void PairSWIntel::eval(const int offload, const int vflag, delr2[2] = tdelz[kk]; const flt_t rsq2 = trsq[kk]; - const flt_t rinvsq2 = (flt_t)1.0 / rsq2; - const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2); - const flt_t rainv2 = (flt_t)1.0 / (r2 - cut); - const flt_t gsrainv2 = sigma_gamma * rainv2; - const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; - const flt_t expgsrainv2 = exp(gsrainv2); + const flt_t rinvsq2 = (flt_t)1.0 / rsq2; + const flt_t r2 = (flt_t)1.0 / sqrt(rinvsq2); + const flt_t rainv2 = (flt_t)1.0 / (r2 - cut); + const flt_t gsrainv2 = sigma_gamma * rainv2; + const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; + const flt_t expgsrainv2 = exp(gsrainv2); - const flt_t rinv12 = (flt_t)1.0 / (r1 * r2); - const flt_t cs = (delx * delr2[0] + dely * delr2[1] + + const flt_t rinv12 = (flt_t)1.0 / (r1 * r2); + const flt_t cs = (delx * delr2[0] + dely * delr2[1] + delz * delr2[2]) * rinv12; - const flt_t delcs = cs - costheta; - const flt_t delcssq = delcs*delcs; - - flt_t kfactor; - if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0; - else kfactor = (flt_t)1.0; - - const flt_t facexp = expgsrainv1*expgsrainv2*kfactor; - const flt_t facrad = lambda_epsilon * facexp * delcssq; - const flt_t frad1 = facrad*gsrainvsq1; - const flt_t frad2 = facrad*gsrainvsq2; - const flt_t facang = lambda_epsilon2 * facexp * delcs; - const flt_t facang12 = rinv12*facang; - const flt_t csfacang = cs*facang; - const flt_t csfac1 = rinvsq1*csfacang; - - const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12; - const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12; - const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12; - - fxtmp -= fjx; - fytmp -= fjy; - fztmp -= fjz; - fjxtmp += fjx; - fjytmp += fjy; - fjztmp += fjz; - - if (EFLAG) { - const flt_t evdwl = facrad * (flt_t)0.5; - sevdwl += evdwl; - if (eatom) { - fwtmp += (acc_t)0.33333333 * evdwl; - fjtmp += (acc_t)0.33333333 * facrad; - } - } - } // for kk - const int j = tj[jj]; + const flt_t delcs = cs - costheta; + const flt_t delcssq = delcs*delcs; + + flt_t kfactor; + if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0; + else kfactor = (flt_t)1.0; + + const flt_t facexp = expgsrainv1*expgsrainv2*kfactor; + const flt_t facrad = lambda_epsilon * facexp * delcssq; + const flt_t frad1 = facrad*gsrainvsq1; + const flt_t frad2 = facrad*gsrainvsq2; + const flt_t facang = lambda_epsilon2 * facexp * delcs; + const flt_t facang12 = rinv12*facang; + const flt_t csfacang = cs*facang; + const flt_t csfac1 = rinvsq1*csfacang; + + const flt_t fjx = delx*(frad1+csfac1)-delr2[0]*facang12; + const flt_t fjy = dely*(frad1+csfac1)-delr2[1]*facang12; + const flt_t fjz = delz*(frad1+csfac1)-delr2[2]*facang12; + + fxtmp -= fjx; + fytmp -= fjy; + fztmp -= fjz; + fjxtmp += fjx; + fjytmp += fjy; + fjztmp += fjz; + + if (EFLAG) { + const flt_t evdwl = facrad * (flt_t)0.5; + sevdwl += evdwl; + if (eatom) { + fwtmp += (acc_t)0.33333333 * 
evdwl; + fjtmp += (acc_t)0.33333333 * facrad; + } + } + } // for kk + const int j = tj[jj]; f[j].x += fjxtmp; f[j].y += fjytmp; f[j].z += fjztmp; - if (EFLAG) - if (eatom) f[j].w += fjtmp; + if (EFLAG) + if (eatom) f[j].w += fjtmp; } // for jj f[i].x += fxtmp; f[i].y += fytmp; f[i].z += fztmp; - if (EFLAG) { - f[i].w += fwtmp; - oevdwl += sevdwl; - } + if (EFLAG) { + f[i].w += fwtmp; + oevdwl += sevdwl; + } } // for ii - IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, - x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end omp IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { ev_global[0] = oevdwl; @@ -561,7 +561,7 @@ void PairSWIntel::eval(const int offload, const int vflag, fix->add_result_array(f_start, 0, offload); } -#else +#else /* ---------------------------------------------------------------------- @@ -577,8 +577,8 @@ authors for more details. template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, const int astart, - const int aend, const int pad_width) + const ForceConst<flt_t> &fc, const int astart, + const int aend, const int pad_width) { typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t; typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t; @@ -646,7 +646,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \ in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \ in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \ - in(ccache_stride3) \ + in(ccache_stride3) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -669,9 +669,9 @@ void PairSWIntel::eval(const int offload, const int vflag, #endif { int iifrom, iip, iito, tid; - IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads, - swidth); - + IP_PRE_omp_stride_id_vec(iifrom, iip, iito, tid, inum, nthreads, + swidth); + iifrom += astart; iito += astart; @@ -692,22 +692,22 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6; SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { - cutsq = SIMD_set(p2[3].cutsq); - cut = SIMD_set(p2f[3].cut); - sigma = SIMD_set(p2f[3].sigma); - c1 = SIMD_set(p2f2[3].c1); - c2 = SIMD_set(p2f2[3].c2); - c3 = SIMD_set(p2f2[3].c3); - c4 = SIMD_set(p2f2[3].c4); - sigma_gamma = SIMD_set(p2[3].sigma_gamma); - costheta = SIMD_set(p3[7].costheta); - lambda_epsilon = SIMD_set(p3[7].lambda_epsilon); - lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2); - if (SPQ == 0) { - powerp = SIMD_set(p2f[3].powerp); - powerq = SIMD_set(p2f[3].powerq); - } - if (EFLAG) { + cutsq = SIMD_set(p2[3].cutsq); + cut = SIMD_set(p2f[3].cut); + sigma = SIMD_set(p2f[3].sigma); + c1 = SIMD_set(p2f2[3].c1); + c2 = SIMD_set(p2f2[3].c2); + c3 = SIMD_set(p2f2[3].c3); + c4 = SIMD_set(p2f2[3].c4); + sigma_gamma = SIMD_set(p2[3].sigma_gamma); + costheta = SIMD_set(p3[7].costheta); + lambda_epsilon = SIMD_set(p3[7].lambda_epsilon); + lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2); + if (SPQ == 0) { + powerp = SIMD_set(p2f[3].powerp); + powerq = 
SIMD_set(p2f[3].powerq); + } + if (EFLAG) { c5 = SIMD_set(p2e[3].c5); c6 = SIMD_set(p2e[3].c6); } @@ -715,120 +715,120 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128, - 144,160,176,192,208,224,240); + 144,160,176,192,208,224,240); ilist = ilist + iifrom; acc_t * const dforce = &(f[0].x); for (int i = iifrom; i < iito; i += iip) { - SIMD_mask imask = ilist < iito; - SIMD_flt_t xtmp, ytmp, ztmp; - SIMD_int itype, itype_offset; - - if (ONETYPE) - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp); - else { - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype); - itype_offset = itype * ntypes; - } - - #ifdef OUTER_CHUNK - const int* ng = firstneigh + cnumneigh[i] - swidth; - #else + SIMD_mask imask = ilist < iito; + SIMD_flt_t xtmp, ytmp, ztmp; + SIMD_int itype, itype_offset; + + if (ONETYPE) + SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp); + else { + SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype); + itype_offset = itype * ntypes; + } + + #ifdef OUTER_CHUNK + const int* ng = firstneigh + cnumneigh[i] - swidth; + #else SIMD_int ng = SIMD_load(cnumneigh + i); - ng = ng - 1; - #endif - const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); - const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i); - const int jnum_max = SIMD_max(jnum); - - SIMD_acc_t fxtmp = SIMD_set((acc_t)0); - SIMD_acc_t fytmp = SIMD_set((acc_t)0); - SIMD_acc_t fztmp = SIMD_set((acc_t)0); - SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2; - if (is_same<flt_t,acc_t>::value == 0) { - fxtmp2 = SIMD_set((acc_t)0); - fytmp2 = SIMD_set((acc_t)0); - fztmp2 = SIMD_set((acc_t)0); + ng = ng - 1; + #endif + const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); + const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i); + const int jnum_max = SIMD_max(jnum); + + SIMD_acc_t fxtmp = SIMD_set((acc_t)0); + SIMD_acc_t fytmp = SIMD_set((acc_t)0); + SIMD_acc_t fztmp = SIMD_set((acc_t)0); + SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2; + if (is_same<flt_t,acc_t>::value == 0) { + fxtmp2 = SIMD_set((acc_t)0); + fytmp2 = SIMD_set((acc_t)0); + fztmp2 = SIMD_set((acc_t)0); if (EFLAG) fwtmp2 = SIMD_set((acc_t)0); - } + } SIMD_acc_t sevdwl; - if (EFLAG) { + if (EFLAG) { fwtmp = SIMD_set((acc_t)0); - sevdwl = SIMD_set((acc_t)0); + sevdwl = SIMD_set((acc_t)0); } - SIMD_int ejnum = SIMD_set(0); - SIMD_int ejnumhalf = SIMD_set(0); - SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, - 11, 12, 13, 14, 15); + SIMD_int ejnum = SIMD_set(0); + SIMD_int ejnumhalf = SIMD_set(0); + SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15); for (int jj = 0; jj < jnum_max; jj++) { SIMD_mask jmask = jj < jnum; - #ifdef OUTER_CHUNK - ng += swidth; - SIMD_int j = SIMD_load(ng); - #else - ng = ng + 1; - SIMD_int j = SIMD_gather(jmask, firstneigh, ng); - #endif + #ifdef OUTER_CHUNK + ng += swidth; + SIMD_int j = SIMD_load(ng); + #else + ng = ng + 1; + SIMD_int j = SIMD_gather(jmask, firstneigh, ng); + #endif j = j & SIMD_set(NEIGHMASK); - const SIMD_int joffset = j << 4; - - SIMD_flt_t delx, dely, delz; - SIMD_int jtype, ijtype; - if (ONETYPE) - SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz); - else { - SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, - jtype); - ijtype = (jtype + itype_offset) << 2; - cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype); - } - - delx = delx - xtmp; - 
dely = dely - ytmp; - delz = delz - ztmp; + const SIMD_int joffset = j << 4; + + SIMD_flt_t delx, dely, delz; + SIMD_int jtype, ijtype; + if (ONETYPE) + SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz); + else { + SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, + jtype); + ijtype = (jtype + itype_offset) << 2; + cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype); + } + + delx = delx - xtmp; + dely = dely - ytmp; + delz = delz - ztmp; SIMD_flt_t rsq1 = delx * delx; - rsq1 = SIMD_fma(dely, dely, rsq1); - rsq1 = SIMD_fma(delz, delz, rsq1); - - const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq); - SIMD_scatter(rmask, tdelx, coffset, delx); - SIMD_scatter(rmask, tdely, coffset, dely); - SIMD_scatter(rmask, tdelz, coffset, delz); - SIMD_scatter(rmask, trsq, coffset, rsq1); - SIMD_scatter(rmask, tj, coffset, j); - if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype); - ejnum = SIMD_add(rmask, ejnum, 1); - coffset = SIMD_add(rmask, coffset, swidth); - const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf); - ejnumhalf = SIMD_add(hmask, ejnumhalf, 1); - } - - const int ejnum_max = SIMD_max(ejnum); - const int ejnumhalf_max = SIMD_max(ejnumhalf); - memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3); + rsq1 = SIMD_fma(dely, dely, rsq1); + rsq1 = SIMD_fma(delz, delz, rsq1); + + const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq); + SIMD_scatter(rmask, tdelx, coffset, delx); + SIMD_scatter(rmask, tdely, coffset, dely); + SIMD_scatter(rmask, tdelz, coffset, delz); + SIMD_scatter(rmask, trsq, coffset, rsq1); + SIMD_scatter(rmask, tj, coffset, j); + if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype); + ejnum = SIMD_add(rmask, ejnum, 1); + coffset = SIMD_add(rmask, coffset, swidth); + const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf); + ejnumhalf = SIMD_add(hmask, ejnumhalf, 1); + } + + const int ejnum_max = SIMD_max(ejnum); + const int ejnumhalf_max = SIMD_max(ejnumhalf); + memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3); for (int jj = 0; jj < ejnum_max; jj++) { SIMD_int ijtype; - const int coffset = jj * swidth; - if (!ONETYPE) { - ijtype = SIMD_load(tjtype + coffset); - ijtype = (ijtype + itype_offset) << 2; - cut = SIMD_gather(&(p2f[0].cut), ijtype); - } - - SIMD_acc_t fjxtmp = SIMD_set((acc_t)0); - SIMD_acc_t fjytmp = SIMD_set((acc_t)0); - SIMD_acc_t fjztmp = SIMD_set((acc_t)0); - SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2; + const int coffset = jj * swidth; + if (!ONETYPE) { + ijtype = SIMD_load(tjtype + coffset); + ijtype = (ijtype + itype_offset) << 2; + cut = SIMD_gather(&(p2f[0].cut), ijtype); + } + + SIMD_acc_t fjxtmp = SIMD_set((acc_t)0); + SIMD_acc_t fjytmp = SIMD_set((acc_t)0); + SIMD_acc_t fjztmp = SIMD_set((acc_t)0); + SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2; if (EFLAG) fjtmp = SIMD_set((acc_t)0.0); - if (is_same<flt_t,acc_t>::value == 0) { - fjxtmp2 = SIMD_set((acc_t)0); - fjytmp2 = SIMD_set((acc_t)0); - fjztmp2 = SIMD_set((acc_t)0); - if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0); - } + if (is_same<flt_t,acc_t>::value == 0) { + fjxtmp2 = SIMD_set((acc_t)0); + fjytmp2 = SIMD_set((acc_t)0); + fjztmp2 = SIMD_set((acc_t)0); + if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0); + } const SIMD_flt_t delx = SIMD_load(tdelx + coffset); const SIMD_flt_t dely = SIMD_load(tdely + coffset); @@ -836,211 +836,211 @@ void PairSWIntel::eval(const int offload, const int vflag, const SIMD_flt_t rsq1 = SIMD_load(trsq + coffset); const SIMD_flt_t rinvsq1 = SIMD_rcp(rsq1); - const SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1); + const 
SIMD_flt_t r1 = SIMD_invsqrt(rinvsq1); const SIMD_flt_t rainv1 = SIMD_rcp(r1 - cut); - - // two-body interactions, skip half of them - if (jj < ejnumhalf_max) { + + // two-body interactions, skip half of them + if (jj < ejnumhalf_max) { SIMD_flt_t rp, rq; - if (SPQ == 1) { + if (SPQ == 1) { rp = r1 * r1; - rp = rp * rp; - rp = SIMD_rcp(rp); - rq = SIMD_set((flt_t)1.0); + rp = rp * rp; + rp = SIMD_rcp(rp); + rq = SIMD_set((flt_t)1.0); } else { - if (!ONETYPE) { - powerp = SIMD_gather(&(p2f[0].powerp), ijtype); - powerq = SIMD_gather(&(p2f[0].powerq), ijtype); - } - rp = SIMD_pow(r1, powerp); - rq = SIMD_pow(r1, powerq); - } - - if (!ONETYPE) { - sigma = SIMD_gather(&(p2f[0].sigma), ijtype); - c1 = SIMD_gather(&(p2f2[0].c1), ijtype); - c2 = SIMD_gather(&(p2f2[0].c2), ijtype); - c3 = SIMD_gather(&(p2f2[0].c3), ijtype); - c4 = SIMD_gather(&(p2f2[0].c4), ijtype); - } - - const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1; - const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1); - const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * - rainvsq) * expsrainv * rinvsq1; - - const SIMD_flt_t fjx = delx * fpair; - const SIMD_flt_t fjy = dely * fpair; - const SIMD_flt_t fjz = delz * fpair; - - const SIMD_mask hmask = jj < ejnumhalf; - SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp, - fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, - fztmp2, fjxtmp2, fjytmp2, fjztmp2); - - if (EFLAG) { - if (!ONETYPE) { - c5 = SIMD_gather(&(p2e[0].c5), ijtype); - c6 = SIMD_gather(&(p2e[0].c6), ijtype); - } - SIMD_flt_t evdwl; - evdwl = (c5 * rp - c6 * rq) * expsrainv; - SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2); - } + if (!ONETYPE) { + powerp = SIMD_gather(&(p2f[0].powerp), ijtype); + powerq = SIMD_gather(&(p2f[0].powerq), ijtype); + } + rp = SIMD_pow(r1, powerp); + rq = SIMD_pow(r1, powerq); + } + + if (!ONETYPE) { + sigma = SIMD_gather(&(p2f[0].sigma), ijtype); + c1 = SIMD_gather(&(p2f2[0].c1), ijtype); + c2 = SIMD_gather(&(p2f2[0].c2), ijtype); + c3 = SIMD_gather(&(p2f2[0].c3), ijtype); + c4 = SIMD_gather(&(p2f2[0].c4), ijtype); + } + + const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1; + const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1); + const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * + rainvsq) * expsrainv * rinvsq1; + + const SIMD_flt_t fjx = delx * fpair; + const SIMD_flt_t fjy = dely * fpair; + const SIMD_flt_t fjz = delz * fpair; + + const SIMD_mask hmask = jj < ejnumhalf; + SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp, + fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, + fztmp2, fjxtmp2, fjytmp2, fjztmp2); + + if (EFLAG) { + if (!ONETYPE) { + c5 = SIMD_gather(&(p2e[0].c5), ijtype); + c6 = SIMD_gather(&(p2e[0].c6), ijtype); + } + SIMD_flt_t evdwl; + evdwl = (c5 * rp - c6 * rq) * expsrainv; + SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2); + } } - /*---------------------------------------------*/ - SIMD_int ijkoff; - if (!ONETYPE) { - sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype); - ijkoff = ijtype * ntypes; - } + /*---------------------------------------------*/ + SIMD_int ijkoff; + if (!ONETYPE) { + sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype); + ijkoff = ijtype * ntypes; + } const SIMD_flt_t gsrainv1 = sigma_gamma * rainv1; const SIMD_flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1; const SIMD_flt_t expgsrainv1 = SIMD_exp(gsrainv1); - const SIMD_mask jmask = jj < ejnum; + const SIMD_mask jmask = jj < ejnum; for (int kk = jj+1; kk < ejnum_max; kk++) { - SIMD_int iktype, ijktype; - 
const int kcoffset = kk * swidth; - if (!ONETYPE) { - iktype = SIMD_load(tjtype + kcoffset); - ijktype = ijkoff + (iktype << 2); - iktype = (iktype + itype_offset) << 2; - cut = SIMD_gather(&(p2[0].cut), iktype); - sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype); - costheta = SIMD_gather(&(p3[0].costheta), ijktype); - lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype); - lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype); - } - const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset); - const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset); - const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset); - const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset); - - const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2); - const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2); - const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut); - const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2; - const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; - const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2); - const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2); - const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + + SIMD_int iktype, ijktype; + const int kcoffset = kk * swidth; + if (!ONETYPE) { + iktype = SIMD_load(tjtype + kcoffset); + ijktype = ijkoff + (iktype << 2); + iktype = (iktype + itype_offset) << 2; + cut = SIMD_gather(&(p2[0].cut), iktype); + sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype); + costheta = SIMD_gather(&(p3[0].costheta), ijktype); + lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype); + lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype); + } + const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset); + const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset); + const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset); + const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset); + + const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2); + const SIMD_flt_t r2 = SIMD_invsqrt(rinvsq2); + const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut); + const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2; + const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2; + const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2); + const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2); + const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + delz * delr2z) * rinv12; - const SIMD_flt_t delcs = cs - costheta; - const SIMD_flt_t delcssq = delcs*delcs; - - const SIMD_flt_t facexp = expgsrainv1*expgsrainv2; - const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq; - const SIMD_flt_t frad1 = facrad * gsrainvsq1; - const SIMD_flt_t frad2 = facrad * gsrainvsq2; - const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs; - const SIMD_flt_t facang12 = rinv12 * facang; - const SIMD_flt_t csfacang = cs * facang; - - const SIMD_flt_t csfac1 = rinvsq1 * csfacang; - const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12; - const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12; - const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12; - - const SIMD_flt_t csfac2 = rinvsq2 * csfacang; - SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2); - SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2); - SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2); - - const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum); - - SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp, - fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, - fztmp2, fjxtmp2, fjytmp2, fjztmp2, - tf + kcoffset * 3, swidth); - - if (EFLAG) { - SIMD_int k; - if (eatom) { - k = SIMD_load(tj + kcoffset); - k = k << 4; - } - SIMD_acc_three(kmask, 
facrad, eatom, sevdwl, fwtmp, fjtmp, - fwtmp2, fjtmp2, k, dforce); - } - } // for kk - if (is_same<flt_t,acc_t>::value == 1) - SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp); - else - SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, - fjxtmp2, fjytmp2, fjztmp2); - - if (EFLAG) { - if (eatom) { - SIMD_int j = SIMD_load(tj + coffset); - j = j << 4; - SIMD_jeng_update(jmask, dforce + 3, j, fjtmp); - if (is_same<flt_t,acc_t>::value == 0) - SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2); - } - } + const SIMD_flt_t delcs = cs - costheta; + const SIMD_flt_t delcssq = delcs*delcs; + + const SIMD_flt_t facexp = expgsrainv1*expgsrainv2; + const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq; + const SIMD_flt_t frad1 = facrad * gsrainvsq1; + const SIMD_flt_t frad2 = facrad * gsrainvsq2; + const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs; + const SIMD_flt_t facang12 = rinv12 * facang; + const SIMD_flt_t csfacang = cs * facang; + + const SIMD_flt_t csfac1 = rinvsq1 * csfacang; + const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12; + const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12; + const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12; + + const SIMD_flt_t csfac2 = rinvsq2 * csfacang; + SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2); + SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2); + SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2); + + const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum); + + SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp, + fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2, + fztmp2, fjxtmp2, fjytmp2, fjztmp2, + tf + kcoffset * 3, swidth); + + if (EFLAG) { + SIMD_int k; + if (eatom) { + k = SIMD_load(tj + kcoffset); + k = k << 4; + } + SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp, + fwtmp2, fjtmp2, k, dforce); + } + } // for kk + if (is_same<flt_t,acc_t>::value == 1) + SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp); + else + SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, + fjxtmp2, fjytmp2, fjztmp2); + + if (EFLAG) { + if (eatom) { + SIMD_int j = SIMD_load(tj + coffset); + j = j << 4; + SIMD_jeng_update(jmask, dforce + 3, j, fjtmp); + if (is_same<flt_t,acc_t>::value == 0) + SIMD_jeng_update_hi(jmask, dforce + 3, j, fjtmp2); + } + } } // for jj first loop for (int jj = 0; jj < ejnum_max; jj++) { - const int coffset = jj * swidth; - const SIMD_mask jmask = jj < ejnum; + const int coffset = jj * swidth; + const SIMD_mask jmask = jj < ejnum; const SIMD_int j = SIMD_load(tj + coffset); - const SIMD_int joffset = j << 4; - - SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2; - int foffset = swidth; - if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1; - acc_t *p = tf + coffset * 3; - fjxtmp = SIMD_load(p); - if (is_same<flt_t,acc_t>::value == 0) { - p = p + foffset; - fjxtmp2 = SIMD_load(p); - } - p = p + foffset; - fjytmp = SIMD_load(p); - if (is_same<flt_t,acc_t>::value == 0) { - p = p + foffset; - fjytmp2 = SIMD_load(p); - } - p = p + foffset; - fjztmp = SIMD_load(p); - if (is_same<flt_t,acc_t>::value == 0) { - p = p + foffset; - fjztmp2 = SIMD_load(p); - } - - SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp); - SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp, - fjztmp); + const SIMD_int joffset = j << 4; + + SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2; + int foffset = swidth; + if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1; + acc_t 
*p = tf + coffset * 3; + fjxtmp = SIMD_load(p); + if (is_same<flt_t,acc_t>::value == 0) { + p = p + foffset; + fjxtmp2 = SIMD_load(p); + } + p = p + foffset; + fjytmp = SIMD_load(p); + if (is_same<flt_t,acc_t>::value == 0) { + p = p + foffset; + fjytmp2 = SIMD_load(p); + } + p = p + foffset; + fjztmp = SIMD_load(p); + if (is_same<flt_t,acc_t>::value == 0) { + p = p + foffset; + fjztmp2 = SIMD_load(p); + } + + SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp); + SIMD_jforce_update(jmask, dforce, joffset, fjxtmp, fjytmp, + fjztmp); if (is_same<flt_t,acc_t>::value == 0) { - SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); - SIMD_mask jmask2 = jmask >> 8; - SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, - fjztmp2); - SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2, - fjztmp2); - } - } // for jj second loop - - SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, - EFLAG, eatom, fwtmp); - if (is_same<flt_t,acc_t>::value == 0) { - imask = imask >> 8; - SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, - fztmp2, EFLAG, eatom, fwtmp2); - } - if (EFLAG) oevdwl += SIMD_sum(sevdwl); - ilist = ilist + iip; + SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238); + SIMD_mask jmask2 = jmask >> 8; + SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, + fjztmp2); + SIMD_jforce_update(jmask2, dforce, joffset2, fjxtmp2, fjytmp2, + fjztmp2); + } + } // for jj second loop + + SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, + EFLAG, eatom, fwtmp); + if (is_same<flt_t,acc_t>::value == 0) { + imask = imask >> 8; + SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, + fztmp2, EFLAG, eatom, fwtmp2); + } + if (EFLAG) oevdwl += SIMD_sum(sevdwl); + ilist = ilist + iip; } // for ii - IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, - x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); + IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, + x, offload, vflag, ov0, ov1, ov2, ov3, ov4, ov5); } // end omp - + IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { ev_global[0] = oevdwl; @@ -1119,7 +1119,7 @@ void PairSWIntel::init_style() #if defined(__INTEL_COMPILER) if (__INTEL_COMPILER_BUILD_DATE < 20141023) error->all(FLERR, "Intel compiler versions before " - "15 Update 1 not supported for sw/intel"); + "15 Update 1 not supported for sw/intel"); #endif } @@ -1168,7 +1168,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, } } } - + _onetype = 0; if (atom->ntypes == 1) _onetype = 1; @@ -1178,55 +1178,55 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, for (int jj = 0; jj < tp1; jj++) { int j = map[jj]; if (i < 0 || j < 0 || ii == 0 || jj == 0) { - fc.p2[ii][jj].cutsq = 0; - fc.p2[ii][jj].cut = 0; - fc.p2[ii][jj].sigma_gamma = 0; - fc.p2f[ii][jj].cut = 0; - fc.p2f[ii][jj].powerp = 0; - fc.p2f[ii][jj].powerq = 0; - fc.p2f[ii][jj].sigma = 0; - fc.p2f2[ii][jj].c1 = 0; - fc.p2f2[ii][jj].c2 = 0; - fc.p2f2[ii][jj].c3 = 0; - fc.p2f2[ii][jj].c4 = 0; - fc.p2e[ii][jj].c5 = 0; - fc.p2e[ii][jj].c6 = 0; + fc.p2[ii][jj].cutsq = 0; + fc.p2[ii][jj].cut = 0; + fc.p2[ii][jj].sigma_gamma = 0; + fc.p2f[ii][jj].cut = 0; + fc.p2f[ii][jj].powerp = 0; + fc.p2f[ii][jj].powerq = 0; + fc.p2f[ii][jj].sigma = 0; + fc.p2f2[ii][jj].c1 = 0; + fc.p2f2[ii][jj].c2 = 0; + fc.p2f2[ii][jj].c3 = 0; + fc.p2f2[ii][jj].c4 = 0; + fc.p2e[ii][jj].c5 = 0; + fc.p2e[ii][jj].c6 = 0; } else { 
- int ijparam = elem2param[i][j][j]; - fc.p2[ii][jj].cutsq = params[ijparam].cutsq; - fc.p2[ii][jj].cut = params[ijparam].cut; - fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma; - fc.p2f[ii][jj].cut = params[ijparam].cut; - fc.p2f[ii][jj].powerp = -params[ijparam].powerp; - fc.p2f[ii][jj].powerq = -params[ijparam].powerq; - fc.p2f[ii][jj].sigma = params[ijparam].sigma; - fc.p2f2[ii][jj].c1 = params[ijparam].c1; - fc.p2f2[ii][jj].c2 = params[ijparam].c2; - fc.p2f2[ii][jj].c3 = params[ijparam].c3; - fc.p2f2[ii][jj].c4 = params[ijparam].c4; - fc.p2e[ii][jj].c5 = params[ijparam].c5; - fc.p2e[ii][jj].c6 = params[ijparam].c6; - - double cutcut = params[ijparam].cut * params[ijparam].cut; - if (params[ijparam].cutsq >= cutcut) - fc.p2[ii][jj].cutsq *= 0.98; - - if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0) - _spq = 0; + int ijparam = elem2param[i][j][j]; + fc.p2[ii][jj].cutsq = params[ijparam].cutsq; + fc.p2[ii][jj].cut = params[ijparam].cut; + fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma; + fc.p2f[ii][jj].cut = params[ijparam].cut; + fc.p2f[ii][jj].powerp = -params[ijparam].powerp; + fc.p2f[ii][jj].powerq = -params[ijparam].powerq; + fc.p2f[ii][jj].sigma = params[ijparam].sigma; + fc.p2f2[ii][jj].c1 = params[ijparam].c1; + fc.p2f2[ii][jj].c2 = params[ijparam].c2; + fc.p2f2[ii][jj].c3 = params[ijparam].c3; + fc.p2f2[ii][jj].c4 = params[ijparam].c4; + fc.p2e[ii][jj].c5 = params[ijparam].c5; + fc.p2e[ii][jj].c6 = params[ijparam].c6; + + double cutcut = params[ijparam].cut * params[ijparam].cut; + if (params[ijparam].cutsq >= cutcut) + fc.p2[ii][jj].cutsq *= 0.98; + + if (params[ijparam].powerp != 4.0 || params[ijparam].powerq != 0.0) + _spq = 0; } for (int kk = 0; kk < tp1; kk++) { int k = map[kk]; - if (i < 0 || j < 0 || k < 0 || ii == 0 || jj == 0 || kk == 0) { - fc.p3[ii][jj][kk].costheta = 0; - fc.p3[ii][jj][kk].lambda_epsilon = 0; - fc.p3[ii][jj][kk].lambda_epsilon2 = 0; - } else { - int ijkparam = elem2param[i][j][k]; - fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta; - fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon; - fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2; - } + if (i < 0 || j < 0 || k < 0 || ii == 0 || jj == 0 || kk == 0) { + fc.p3[ii][jj][kk].costheta = 0; + fc.p3[ii][jj][kk].lambda_epsilon = 0; + fc.p3[ii][jj][kk].lambda_epsilon2 = 0; + } else { + int ijkparam = elem2param[i][j][k]; + fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta; + fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon; + fc.p3[ii][jj][kk].lambda_epsilon2 = params[ijkparam].lambda_epsilon2; + } } } } @@ -1247,10 +1247,10 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; int tp1cu = tp1sq * tp1; - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && op3 != NULL && ocutneighsq != NULL) { #pragma offload_transfer target(mic:_cop) \ - in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ + in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ in(op3: length(tp1cu) alloc_if(0) free_if(0)) \ in(ocutneighsq: length(tp1sq)) } @@ -1272,8 +1272,8 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, fc_packed3 *op3 = p3[0][0]; #ifdef _LMP_INTEL_OFFLOAD - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && - op3 != NULL && _cop >= 0) { + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + op3 
!= NULL && _cop >= 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(op2, op2f, op2f2, op2e, op3: alloc_if(0) free_if(1)) } @@ -1301,8 +1301,8 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, fc_packed3 *op3 = p3[0][0]; int tp1sq = ntypes * ntypes; int tp1cu = tp1sq * ntypes; - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && - op3 != NULL && cop >= 0) { + if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && + op3 != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(1) free_if(0)) \ nocopy(op3: length(tp1cu) alloc_if(1) free_if(0)) diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h index b55022328f..ffcf9a6fb6 100644 --- a/src/USER-INTEL/pair_sw_intel.h +++ b/src/USER-INTEL/pair_sw_intel.h @@ -49,7 +49,7 @@ class PairSWIntel : public PairSW { template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, - const int astart, const int aend, const int pad_width); + const int astart, const int aend, const int pad_width); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index f59a6b7c96..9e0a888638 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -47,7 +47,7 @@ void PairTersoffIntel::init_style() { if (comm->me == 0) { error->warning(FLERR, "Tersoff/intel currently requires intel compiler. " - "Using MANYBODY version."); + "Using MANYBODY version."); } PairTersoff::init_style(); } @@ -87,7 +87,7 @@ PairTersoffIntel::PairTersoffIntel(LAMMPS *lmp) : PairTersoff(lmp) void PairTersoffIntel::compute(int eflag, int vflag) { if (fix->precision()==FixIntel::PREC_MODE_MIXED) { - compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), + compute<float,double>(eflag, vflag, fix->get_mixed_buffers(), force_const_single); } else if (fix->precision()==FixIntel::PREC_MODE_DOUBLE) { compute<double,double>(eflag, vflag, fix->get_double_buffers(), @@ -104,8 +104,8 @@ void PairTersoffIntel::compute(int eflag, int vflag) // do we need to calculate energy/virial template <class flt_t, class acc_t> void PairTersoffIntel::compute(int eflag, int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc) { if (eflag || vflag) { ev_setup(eflag,vflag); @@ -127,13 +127,13 @@ void PairTersoffIntel::compute(int eflag, int vflag, #endif { int ifrom, ito, tid; - IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, - packthreads, sizeof(ATOM_T)); + IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, + packthreads, sizeof(ATOM_T)); buffers->thr_pack(ifrom,ito,ago); } fix->stop_watch(TIME_PACK); } - + int ovflag = 0; if (vflag_fdotr) ovflag = 2; else if (vflag) ovflag = 1; @@ -170,14 +170,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // what's done in here is that they are inlined and vectorized // attractive() also provides an option to compute zeta as well static fvec zeta_vector( - const c_inner_t * param, - ivec xjw, bvec mask, - fvec vrij, fvec rsq2, - fvec vdijx, fvec vdijy, fvec vdijz, + const c_inner_t * param, + ivec xjw, bvec mask, + fvec vrij, fvec rsq2, + fvec vdijx, fvec vdijy, fvec vdijz, fvec dikx, fvec diky, fvec 
dikz ); static void force_zeta_vector( - const c_outer_t * param, + const c_outer_t * param, ivec xjw, bvec mask, fvec vrijsq, fvec vzeta_ij, @@ -202,14 +202,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // perform the actual computation template<bool EFLAG> static void kernel( - int iito, int iifrom, int eatom, int vflag, + int iito, int iifrom, int eatom, int vflag, const int * _noalias const numneigh, const int * _noalias const numneighhalf, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, acc_t *evdwl ); @@ -217,14 +217,14 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // perform one step of calculation, pass in i-j pairs of atoms (is, js) template<int EFLAG> static void kernel_step( - int eatom, int vflag, + int eatom, int vflag, const int * _noalias const numneigh, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, avec *vsevdwl, int compress_idx, iarr is, iarr js, bvec vmask_repulsive ); @@ -233,12 +233,12 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // with fixed i and a number of js template<int EFLAG> static void kernel_step_const_i( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive ); @@ -255,9 +255,9 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> // This method is nearly identical to what happens in the other /intel styles template <int EFLAG, class flt_t, class acc_t> void PairTersoffIntel::eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> *buffers, - const ForceConst<flt_t> &fc, - const int astart, const int aend) + IntelBuffers<flt_t,acc_t> *buffers, + const ForceConst<flt_t> &fc, + const int astart, const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -289,8 +289,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag, // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; IP_PRE_get_transfern(ago, 1, EFLAG, vflag, - buffers, offload, fix, separate_flag, - 
x_size, q_size, ev_size, f_stride); + buffers, offload, fix, separate_flag, + x_size, q_size, ev_size, f_stride); int tc; FORCE_T * _noalias f_start; @@ -326,8 +326,8 @@ void PairTersoffIntel::eval(const int offload, const int vflag, #endif #endif - IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, - f_stride, x, 0); + IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, + f_stride, x, 0); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; if (EFLAG) oevdwl = oecoul = (acc_t)0; @@ -354,7 +354,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, // Pick the variable i algorithm under specific conditions // do use scalar algorithm with very short vectors int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL; - bool pack_i = VL >= 8 && + bool pack_i = VL >= 8 && lmp_intel::vector_traits<lmp_intel::mode>::support_integer_and_gather_ops; bool use_scalar = VL < 4; if (use_scalar) { @@ -364,16 +364,16 @@ void PairTersoffIntel::eval(const int offload, const int vflag, } else { IntelKernelTersoff<flt_t,acc_t,lmp_intel::mode,false>::kernel<EFLAG>(ARGS); } - if (EFLAG) oevdwl += sevdwl; + if (EFLAG) oevdwl += sevdwl; } IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, - f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, - ov4, ov5); + f_stride, x, offload, vflag, ov0, ov1, ov2, ov3, + ov4, ov5); } // end of omp parallel region IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, - ov0, ov1, ov2, ov3, ov4, ov5); + ov0, ov1, ov2, ov3, ov4, ov5); if (EFLAG) { ev_global[0] = oevdwl; @@ -431,7 +431,7 @@ void PairTersoffIntel::init_style() error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast<FixIntel *>(modify->fix[ifix]); - + fix->pair_init_check(); fix->three_body_neighbor(1); #ifdef _LMP_INTEL_OFFLOAD @@ -481,25 +481,25 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, for (int k = 1; k < tp1; k++) { Param * param = ¶ms[elem2param[map[i]][map[j]][map[k]]]; fc.c_cutoff_inner[i][k][j].cutsq = static_cast<flt_t>(param->cutsq); - fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3); + fc.c_inner_loop[i][j][k].lam3 = static_cast<flt_t>(param->lam3); fc.c_inner_loop[i][j][k].bigr = static_cast<flt_t>(param->bigr); fc.c_inner_loop[i][j][k].bigd = static_cast<flt_t>(param->bigd); fc.c_inner_loop[i][j][k].c2 = static_cast<flt_t>(param->c * param->c); fc.c_inner_loop[i][j][k].d2 = static_cast<flt_t>(param->d * param->d); fc.c_inner_loop[i][j][k].h = static_cast<flt_t>(param->h); fc.c_inner_loop[i][j][k].gamma = static_cast<flt_t>(param->gamma); - fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint); + fc.c_inner_loop[i][j][k].powermint = static_cast<flt_t>(param->powermint); fc.c_inner[i][j][k].cutsq = static_cast<flt_t>(param->cutsq); - fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3); + fc.c_inner[i][j][k].lam3 = static_cast<flt_t>(param->lam3); fc.c_inner[i][j][k].bigr = static_cast<flt_t>(param->bigr); fc.c_inner[i][j][k].bigd = static_cast<flt_t>(param->bigd); fc.c_inner[i][j][k].c2 = static_cast<flt_t>(param->c * param->c); fc.c_inner[i][j][k].d2 = static_cast<flt_t>(param->d * param->d); fc.c_inner[i][j][k].h = static_cast<flt_t>(param->h); fc.c_inner[i][j][k].gamma = static_cast<flt_t>(param->gamma); - fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint); - + fc.c_inner[i][j][k].powermint = static_cast<flt_t>(param->powermint); + } Param * param = ¶ms[elem2param[map[i]][map[j]][map[j]]]; fc.c_cutoff_outer[i][j].cutsq = 
static_cast<flt_t>(param->cutsq); @@ -515,7 +515,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, fc.c_second_loop[i][j].c2 = static_cast<flt_t>(param->c2); fc.c_second_loop[i][j].c3 = static_cast<flt_t>(param->c3); fc.c_second_loop[i][j].c4 = static_cast<flt_t>(param->c4); - + fc.c_outer[i][j].cutsq = static_cast<flt_t>(param->cutsq); fc.c_outer[i][j].bigr = static_cast<flt_t>(param->bigr); fc.c_outer[i][j].bigd = static_cast<flt_t>(param->bigd); @@ -563,8 +563,8 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, // As in any other /intel pair style template <class flt_t> void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, - Memory *memory, - const int cop) { + Memory *memory, + const int cop) { if ( (ntypes != _ntypes) ) { if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD @@ -575,12 +575,12 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, c_cutoff_t * oc_cutoff_outer = c_cutoff_outer[0]; c_inner_t * oc_inner = c_inner[0][0]; c_outer_t * oc_outer = c_outer[0]; - if (c_first_loop != NULL && c_second_loop != NULL && + if (c_first_loop != NULL && c_second_loop != NULL && c_inner_loop != NULL && _cop >= 0) { #pragma offload_transfer target(mic:cop) \ - nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \ - nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \ + nocopy(oc_first_loop, oc_second_loop, oc_inner_loop: alloc_if(0) free_if(1)) \ + nocopy(oc_cutoff_outer, oc_cutoff_inner: alloc_if(0) free_if(1)) \ nocopy(oc_inner, oc_outer: alloc_if(0) free_if(0)) } #endif @@ -614,7 +614,7 @@ void PairTersoffIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, int tp1sq = ntypes * ntypes; int tp1cb = ntypes * ntypes * ntypes; int tp1cb_pad = ntypes * ntypes * ntypes_pad; - if (oc_first_loop != NULL && oc_second_loop != NULL && + if (oc_first_loop != NULL && oc_second_loop != NULL && oc_inner_loop != NULL && cop >= 0) { #pragma offload_transfer target(mic:cop) \ nocopy(oc_first_loop: length(tp1sq) alloc_if(1) free_if(0)) \ @@ -642,15 +642,15 @@ static const int N_CACHE = 8; template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<int EFLAG> void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, - const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, + const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, + const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - avec *vsevdwl, - int compress_idx, + avec *vsevdwl, + int compress_idx, iarr is, iarr js, bvec vmask_repulsive @@ -662,7 +662,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( ivec v_i0(0); ivec v_i_ntypes(ntypes); ivec v_i_NEIGHMASK(NEIGHMASK); - + farr fx, fy, fz, fw; int cache_idx = 0; fvec vfkx_cache[N_CACHE]; @@ -672,7 +672,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( bvec vmask_cache[N_CACHE]; ivec vkks_final_cache; bvec 
vmask_final_cache; - iarr ts; + iarr ts; // compute all the stuff we know from i and j // TDO: We could extract this from the driver routine ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is)); @@ -738,7 +738,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - &vzeta_contrib); + &vzeta_contrib); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ -749,9 +749,9 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( vfkx_cache[cache_idx] = vfkx; vfky_cache[cache_idx] = vfky; vfkz_cache[cache_idx] = vfkz; - vks_cache[cache_idx] = vks; - vmask_cache[cache_idx] = veff_mask; - cache_idx += 1; + vks_cache[cache_idx] = vks; + vmask_cache[cache_idx] = veff_mask; + cache_idx += 1; vzeta = v::mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); vkks = vkks + v_i1; @@ -799,7 +799,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( vfjxtmp = vfjxtmp * vprefactor - vdx_ij * vfpair; vfjytmp = vfjytmp * vprefactor - vdy_ij * vfpair; vfjztmp = vfjztmp * vprefactor - vdz_ij * vfpair; - + if (EFLAG) { *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); if (eatom) { @@ -833,7 +833,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( fvec vx_k, vy_k, vz_k, vcutsq; while (! v::mask_testz(vactive_mask)) { bvec vnew_mask = vactive_mask & ~ veff_old_mask; - vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & + vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); fvec vdx_ik = vx_k - vx_i; @@ -855,7 +855,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - 0); + 0); vfxtmp = v::mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = v::mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ -917,15 +917,15 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<int EFLAG> void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( - int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int eatom, int vflag, + const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, - const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, + const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, + const typename PairTersoffIntel::ForceConst<flt_t>::c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - avec *vsevdwl, - int compress_idx, + avec *vsevdwl, + int compress_idx, int i, iarr js, bvec vmask_repulsive @@ -951,7 +951,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( int kk_final_cache; aarr fx, fy, fz, fw; - iarr ts; + iarr ts; bvec vmask = v::mask_enable_lower(compress_idx); fvec vx_i(x[i].x), vy_i(x[i].y), vz_i(x[i].z); @@ -997,7 +997,7 @@ void 
IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( fvec vfix, vfiy, vfiz; fvec vfjx, vfjy, vfjz; fvec vfkx, vfky, vfkz; - + attractive_vector<true>(&c_inner[ntypes * ntypes * w_i + w_k],vc_idx_j_ntypes,veff_mask,fvec(1.), vrij,vrsq,vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik, &vfix,&vfiy,&vfiz, @@ -1010,7 +1010,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( vfjxtmp = v::acc_mask_add(vfjxtmp, veff_mask, vfjxtmp, vfjx); vfjytmp = v::acc_mask_add(vfjytmp, veff_mask, vfjytmp, vfjy); vfjztmp = v::acc_mask_add(vfjztmp, veff_mask, vfjztmp, vfjz); - + vfkx_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkx, v::zero()); vfky_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfky, v::zero()); vfkz_cache[cache_idx] = v::mask_add(v::zero(), veff_mask, vfkz, v::zero()); @@ -1037,7 +1037,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast<int>(4 * sizeof(typename v::fscal) * k))); bvec veff_mask = vcutoff_mask & vsame_mask & vmask; if (! v::mask_testz(veff_mask)) { - fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, + fvec vzeta_contrib = zeta_vector(&c_inner[ntypes * ntypes * w_i + w_k], vc_idx_j_ntypes, veff_mask, vrij, vrsq, vdx_ij,vdy_ij,vdz_ij,vdx_ik,vdy_ik,vdz_ik); vzeta = v::acc_mask_add(vzeta, veff_mask, vzeta, vzeta_contrib); } @@ -1051,7 +1051,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( vfjxtmp = vfjxtmp * vaprefactor - avec(vdx_ij * vfpair); vfjytmp = vfjytmp * vaprefactor - avec(vdy_ij * vfpair); vfjztmp = vfjztmp * vaprefactor - avec(vdz_ij * vfpair); - + if (EFLAG) { *vsevdwl = v::acc_mask_add(*vsevdwl, vmask, *vsevdwl, vevdwl); if (eatom) { @@ -1093,7 +1093,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( &vfix,&vfiy,&vfiz, &vfjx,&vfjy,&vfjz, &vfkx,&vfky,&vfkz, - 0); + 0); vfxtmp = v::acc_mask_add(vfxtmp, veff_mask, vfxtmp, vfix); vfytmp = v::acc_mask_add(vfytmp, veff_mask, vfytmp, vfiy); vfztmp = v::acc_mask_add(vfztmp, veff_mask, vfztmp, vfiz); @@ -1129,14 +1129,14 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<bool EFLAG> void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( - int iito, int iifrom, int eatom, int vflag, - const int * _noalias const numneigh, - const int * _noalias const numneighhalf, - const int * _noalias const cnumneigh, - const int * _noalias const firstneigh, int ntypes, + int iito, int iifrom, int eatom, int vflag, + const int * _noalias const numneigh, + const int * _noalias const numneighhalf, + const int * _noalias const cnumneigh, + const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, - const c_inner_t * _noalias const c_inner, - const c_outer_t * _noalias const c_outer, + const c_inner_t * _noalias const c_inner, + const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, acc_t *evdwl ) { @@ -1181,10 +1181,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (compress_idx == v::VL) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); kernel_step<EFLAG>( - eatom, vflag, + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); compress_idx = 0; @@ 
-1194,10 +1194,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (compress_idx == v::VL || (compress_idx > 0 && jj == jnum-1)) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); kernel_step_const_i<EFLAG>( - eatom, vflag, + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, compress_idx, + &vsevdwl, compress_idx, i, js, vmask_repulsive ); compress_idx = 0; @@ -1209,10 +1209,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (compress_idx > 0) { vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); IntelKernelTersoff::kernel_step<EFLAG>( - eatom, vflag, + eatom, vflag, numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, - &vsevdwl, compress_idx, + &vsevdwl, compress_idx, is, js, vmask_repulsive ); } @@ -1224,10 +1224,10 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::zeta_vector( - const c_inner_t * param, - ivec xjw, bvec mask, - fvec vrij, fvec rsq2, - fvec vdijx, fvec vdijy, fvec vdijz, + const c_inner_t * param, + ivec xjw, bvec mask, + fvec vrij, fvec rsq2, + fvec vdijx, fvec vdijy, fvec vdijz, fvec dikx, fvec diky, fvec dikz ) { fvec v_1_0(1.0); @@ -1250,7 +1250,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t // Its kind of important to check the mask. // Some simulations never/rarely invoke this branch. if (! v::mask_testz(vmask_need_sine)) { - vfc = v::blend(vmask_need_sine, vfc, + vfc = v::blend(vmask_need_sine, vfc, v_0_5 * (v_1_0 - sin(fvec(MY_PI2) * (vrik - vpbigr) * v::recip(vpbigd)))); } return vgijk * vex_delr * vfc; @@ -1258,7 +1258,7 @@ IntelKernelTersoff<flt_t,acc_t,mic,pack_i>::fvec IntelKernelTersoff<flt_t, acc_t template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::force_zeta_vector( - const c_outer_t * param, + const c_outer_t * param, ivec xjw, bvec mask, fvec vrij, fvec vzeta_ij, @@ -1402,9 +1402,9 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector( vfc_d = v::blend(vmask_need_sine, vfc_d, fvec(-0.5) * vtmp * vfccos); } - fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; - fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; - fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; + fvec vzeta_d_fc = vfc_d * vgijk * vex_delr; + fvec vzeta_d_gijk = vfc * vgijk_d * vex_delr; + fvec vzeta_d_ex_delr = vfc * vgijk * vex_delr_d; if (ZETA) *zeta = vfc * vgijk * vex_delr; fvec vminus_costheta = - vcostheta; @@ -1417,7 +1417,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::attractive_vector( fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx); fvec vdcosdriy = -(vdcosdrjy + vdcosdrky); fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz); - + *fix = vprefactor * (vzeta_d_gijk * vdcosdrix + vzeta_d_ex_delr * (rik_hatx - vrij_hatx) - vzeta_d_fc * rik_hatx); *fiy = vprefactor * (vzeta_d_gijk * vdcosdriy + vzeta_d_ex_delr * (rik_haty - vrij_haty) - vzeta_d_fc * rik_haty); *fiz = vprefactor * (vzeta_d_gijk * vdcosdriz + vzeta_d_ex_delr * (rik_hatz - vrij_hatz) - vzeta_d_fc * rik_hatz); diff --git a/src/USER-INTEL/pair_tersoff_intel.h b/src/USER-INTEL/pair_tersoff_intel.h index c725487ae7..6da478c10f 100644 --- a/src/USER-INTEL/pair_tersoff_intel.h +++ b/src/USER-INTEL/pair_tersoff_intel.h @@ -75,14 +75,14 @@ class PairTersoffIntel : public PairTersoff { }; 
ForceConst<float> force_const_single; ForceConst<double> force_const_double; - + template <class flt_t, class acc_t> void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc); template <int EFLAG, class flt_t, class acc_t> void eval(const int offload, const int vflag, - IntelBuffers<flt_t,acc_t> * buffers, - const ForceConst<flt_t> &fc, const int astart, const int aend); + IntelBuffers<flt_t,acc_t> * buffers, + const ForceConst<flt_t> &fc, const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, diff --git a/src/USER-INTEL/pppm_disp_intel.cpp b/src/USER-INTEL/pppm_disp_intel.cpp index 110649f8ee..ec5f5150c2 100644 --- a/src/USER-INTEL/pppm_disp_intel.cpp +++ b/src/USER-INTEL/pppm_disp_intel.cpp @@ -1,3034 +1,3034 @@ -/* ---------------------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: William McDoniel (RWTH Aachen University) -------------------------------------------------------------------------- */ - -#include <mpi.h> -#include <stdlib.h> -#include <math.h> -#include "pppm_disp_intel.h" -#include "atom.h" -#include "error.h" -#include "fft3d_wrap.h" -#include "gridcomm.h" -#include "math_const.h" -#include "math_special.h" -#include "memory.h" -#include "suffix.h" - -using namespace LAMMPS_NS; -using namespace MathConst; -using namespace MathSpecial; - -#define MAXORDER 7 -#define OFFSET 16384 -#define SMALL 0.00001 -#define LARGE 10000.0 -#define EPS_HOC 1.0e-7 - -enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; -enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE}; -enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM, - FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G, - FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A, - FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, - FORWARD_AD_PERATOM_NONE}; - -#ifdef FFT_SINGLE -#define ZEROF 0.0f -#define ONEF 1.0f -#else -#define ZEROF 0.0 -#define ONEF 1.0 -#endif - -/* ---------------------------------------------------------------------- */ - -PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : - PPPMDisp(lmp, narg, arg) -{ - suffix_flag |= Suffix::INTEL; - - order = 7; - order_6 = 7; //sets default stencil sizes to 7 - - perthread_density = NULL; - particle_ekx = particle_eky = particle_ekz = NULL; - particle_ekx0 = particle_eky0 = particle_ekz0 = NULL; - particle_ekx1 = particle_eky1 = particle_ekz1 = NULL; - particle_ekx2 = particle_eky2 = particle_ekz2 = NULL; - particle_ekx3 = particle_eky3 = particle_ekz3 = NULL; - particle_ekx4 = particle_eky4 = particle_ekz4 = NULL; - particle_ekx5 = particle_eky5 = particle_ekz5 = NULL; - particle_ekx6 = particle_eky6 = particle_ekz6 = NULL; - - rho_lookup = drho_lookup = NULL; - rho6_lookup = drho6_lookup = NULL; - rho_points = 0; - - _use_table = _use_packing = _use_lrt 
= 0; -} - -PPPMDispIntel::~PPPMDispIntel() -{ - memory->destroy(perthread_density); - memory->destroy(particle_ekx); - memory->destroy(particle_eky); - memory->destroy(particle_ekz); - - memory->destroy(rho_lookup); - memory->destroy(drho_lookup); - memory->destroy(rho6_lookup); - memory->destroy(drho6_lookup); -} - - - -/* ---------------------------------------------------------------------- - called once before run -------------------------------------------------------------------------- */ - - -void PPPMDispIntel::init() -{ - - PPPMDisp::init(); - int ifix = modify->find_fix("package_intel"); - if (ifix < 0) - error->all(FLERR, - "The 'package intel' command is required for /intel styles"); - fix = static_cast<FixIntel *>(modify->fix[ifix]); - - #ifdef _LMP_INTEL_OFFLOAD - _use_base = 0; - if (fix->offload_balance() != 0.0) { - _use_base = 1; - return; - } - #endif - - fix->kspace_init_check(); - - _use_lrt = fix->lrt(); - if (_use_lrt) - error->all(FLERR, - "LRT mode is currently not supported for pppm/disp/intel"); - - - // For vectorization, we need some padding in the end - // The first thread computes on the global density - if ((comm->nthreads > 1) && !_use_lrt) { - memory->destroy(perthread_density); - memory->create(perthread_density, comm->nthreads-1, - ngrid + INTEL_P3M_ALIGNED_MAXORDER, - "pppmdispintel:perthread_density"); - } - - _use_table = fix->pppm_table(); - if (_use_table) { - rho_points = 5000; - memory->destroy(rho_lookup); - memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmdispintel:rho_lookup"); - memory->destroy(rho6_lookup); - memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmdispintel:rho6_lookup"); - - if(differentiation_flag == 1) { - memory->destroy(drho_lookup); - memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmdispintel:drho_lookup"); - memory->destroy(drho6_lookup); - memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmdispintel:drho6_lookup"); - } - precompute_rho(); - } - if (order > INTEL_P3M_MAXORDER) - error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); -} - -/* ---------------------------------------------------------------------- - compute the PPPMDispIntel long-range force, energy, virial -------------------------------------------------------------------------- */ - -void PPPMDispIntel::compute(int eflag, int vflag) -{ - #ifdef _LMP_INTEL_OFFLOAD - if (_use_base) { - PPPMDisp::compute(eflag, vflag); - return; - } - #endif - int i; - // convert atoms from box to lamda coords - - if (eflag || vflag) ev_setup(eflag,vflag); - else evflag = evflag_atom = eflag_global = vflag_global = - eflag_atom = vflag_atom = 0; - - if (evflag_atom && !peratom_allocate_flag) { - allocate_peratom(); - if (function[0]) { - cg_peratom->ghost_notify(); - cg_peratom->setup(); - } - if (function[1] + function[2] + function[3]) { - cg_peratom_6->ghost_notify(); - cg_peratom_6->setup(); - } - peratom_allocate_flag = 1; - } - if (triclinic == 0) boxlo = domain->boxlo; - else { - boxlo = domain->boxlo_lamda; - domain->x2lamda(atom->nlocal); - } - // extend size of per-atom arrays if necessary - - if (atom->nmax > nmax) { - - if (function[0]) memory->destroy(part2grid); - if (function[1] + function[2] + function[3]) memory->destroy(part2grid_6); - if (differentiation_flag == 1) { - memory->destroy(particle_ekx); - memory->destroy(particle_eky); - memory->destroy(particle_ekz); - if (function[2] == 1){ - memory->destroy(particle_ekx0); - 
memory->destroy(particle_eky0); - memory->destroy(particle_ekz0); - memory->destroy(particle_ekx1); - memory->destroy(particle_eky1); - memory->destroy(particle_ekz1); - memory->destroy(particle_ekx2); - memory->destroy(particle_eky2); - memory->destroy(particle_ekz2); - memory->destroy(particle_ekx3); - memory->destroy(particle_eky3); - memory->destroy(particle_ekz3); - memory->destroy(particle_ekx4); - memory->destroy(particle_eky4); - memory->destroy(particle_ekz4); - memory->destroy(particle_ekx5); - memory->destroy(particle_eky5); - memory->destroy(particle_ekz5); - memory->destroy(particle_ekx6); - memory->destroy(particle_eky6); - memory->destroy(particle_ekz6); - } - - } - nmax = atom->nmax; - if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid"); - if (function[1] + function[2] + function[3]) - memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6"); - if (differentiation_flag == 1) { - memory->create(particle_ekx, nmax, "pppmdispintel:pekx"); - memory->create(particle_eky, nmax, "pppmdispintel:peky"); - memory->create(particle_ekz, nmax, "pppmdispintel:pekz"); - if (function[2] == 1){ - memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0"); - memory->create(particle_eky0, nmax, "pppmdispintel:peky0"); - memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0"); - memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1"); - memory->create(particle_eky1, nmax, "pppmdispintel:peky1"); - memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1"); - memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2"); - memory->create(particle_eky2, nmax, "pppmdispintel:peky2"); - memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2"); - memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3"); - memory->create(particle_eky3, nmax, "pppmdispintel:peky3"); - memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3"); - memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4"); - memory->create(particle_eky4, nmax, "pppmdispintel:peky4"); - memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4"); - memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5"); - memory->create(particle_eky5, nmax, "pppmdispintel:peky5"); - memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5"); - memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6"); - memory->create(particle_eky6, nmax, "pppmdispintel:peky6"); - memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6"); - } - } - } - energy = 0.0; - energy_1 = 0.0; - energy_6 = 0.0; - if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0; - - // find grid points for all my particles - // distribute partcles' charges/dispersion coefficients on the grid - // communication between processors and remapping two fft - // Solution of poissons equation in k-space and backtransformation - // communication between processors - // calculation of forces - - if (function[0]) { - - //perform calculations for coulomb interactions only - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid, - nupper, nlower, nxlo_out, nylo_out, nzlo_out, - nxhi_out, nyhi_out, nzhi_out, - fix->get_mixed_buffers()); - make_rho_c<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid, - nupper, nlower, nxlo_out, nylo_out, - nzlo_out, nxhi_out, nyhi_out, nzhi_out, - fix->get_double_buffers()); - 
make_rho_c<double,double>(fix->get_double_buffers()); - } else { - particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid, - nupper, nlower, nxlo_out, nylo_out, nzlo_out, - nxhi_out, nyhi_out, nzhi_out, - fix->get_single_buffers()); - make_rho_c<float,float>(fix->get_single_buffers()); - } - - cg->reverse_comm(this,REVERSE_RHO); - - brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, - density_brick, density_fft, work1,remap); - - if (differentiation_flag == 1) { - poisson_ad(work1, work2, density_fft, fft1, fft2, - nx_pppm, ny_pppm, nz_pppm, nfft, - nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, - nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, - energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, - v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); - - cg->forward_comm(this,FORWARD_AD); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_c_ad<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_c_ad<double,double>(fix->get_double_buffers()); - } else { - fieldforce_c_ad<float,float>(fix->get_single_buffers()); - } - - if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); - - } else { - poisson_ik(work1, work2, density_fft, fft1, fft2, - nx_pppm, ny_pppm, nz_pppm, nfft, - nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, - nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, - energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2, - vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2, - u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, - v5_brick); - - cg->forward_comm(this, FORWARD_IK); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_c_ik<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_c_ik<double,double>(fix->get_double_buffers()); - } else { - fieldforce_c_ik<float,float>(fix->get_single_buffers()); - } - - if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); - } - if (evflag_atom) fieldforce_c_peratom(); - } - - if (function[1]) { - //perfrom calculations for geometric mixing - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_mixed_buffers()); - make_rho_g<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_double_buffers()); - make_rho_g<double,double>(fix->get_double_buffers()); - } else { - particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_single_buffers()); - make_rho_g<float,float>(fix->get_single_buffers()); - } - - - cg_6->reverse_comm(this, REVERSE_RHO_G); - - brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, - density_brick_g, density_fft_g, work1_6,remap_6); - - if (differentiation_flag == 1) { - - poisson_ad(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, - nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, - nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, - nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, - 
nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, - virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, - v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); - - cg_6->forward_comm(this,FORWARD_AD_G); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ad<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ad<double,double>(fix->get_double_buffers()); - } else { - fieldforce_g_ad<float,float>(fix->get_single_buffers()); - } - - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); - - } else { - poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, - nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, - nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, - nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, - nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6, - fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, - vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, - v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); - - cg_6->forward_comm(this,FORWARD_IK_G); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_g_ik<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_g_ik<double,double>(fix->get_double_buffers()); - } else { - fieldforce_g_ik<float,float>(fix->get_single_buffers()); - } - - - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); - } - if (evflag_atom) fieldforce_g_peratom(); - } - - if (function[2]) { - //perform calculations for arithmetic mixing - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, - nxlo_out_6, nylo_out_6, nzlo_out_6, - nxhi_out_6, nyhi_out_6, nzhi_out_6, - fix->get_mixed_buffers()); - make_rho_a<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_double_buffers()); - make_rho_a<double,double>(fix->get_double_buffers()); - } else { - particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_single_buffers()); - make_rho_a<float,float>(fix->get_single_buffers()); - } - - cg_6->reverse_comm(this, REVERSE_RHO_A); - - brick2fft_a(); - - if ( differentiation_flag == 1) { - - poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, - nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, - nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, - nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, - nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, - u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, - v3_brick_a3, v4_brick_a3, v5_brick_a3); - poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, - v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, - v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, - v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); - poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, v0_brick_a1, - v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, - v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, - v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); - 
poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, - v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, - v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, - v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); - - cg_6->forward_comm(this, FORWARD_AD_A); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_a_ad<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_a_ad<double,double>(fix->get_double_buffers()); - } else { - fieldforce_a_ad<float,float>(fix->get_single_buffers()); - } - - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); - - } else { - - poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, - nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, - nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, - nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, - nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, - fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, - virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, - v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); - poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, - vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, - vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, - v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, - u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, - v3_brick_a6, v4_brick_a6, v5_brick_a6); - poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, - vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, - vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, - v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, - u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, - v3_brick_a5, v4_brick_a5, v5_brick_a5); - poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, - vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, - vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, - v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, - u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, - v3_brick_a4, v4_brick_a4, v5_brick_a4); - - cg_6->forward_comm(this, FORWARD_IK_A); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_a_ik<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_a_ik<double,double>(fix->get_double_buffers()); - } else { - fieldforce_a_ik<float,float>(fix->get_single_buffers()); - } - - if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); - } - if (evflag_atom) fieldforce_a_peratom(); - } - - if (function[3]) { - //perform calculations if no mixing rule applies - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_mixed_buffers()); - make_rho_none<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - fix->get_double_buffers()); - make_rho_none<double,double>(fix->get_double_buffers()); - } else { - particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, - part2grid_6, nupper_6, nlower_6, nxlo_out_6, - nylo_out_6, nzlo_out_6, nxhi_out_6, - nyhi_out_6, nzhi_out_6, - 
fix->get_single_buffers()); - make_rho_none<float,float>(fix->get_single_buffers()); - } - - cg_6->reverse_comm(this, REVERSE_RHO_NONE); - - brick2fft_none(); - - if (differentiation_flag == 1) { - - int n = 0; - for (int k = 0; k<nsplit_alloc/2; k++) { - poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1], - u_brick_none[n],u_brick_none[n+1], - v0_brick_none, v1_brick_none, v2_brick_none, - v3_brick_none, v4_brick_none, v5_brick_none); - n += 2; - } - - cg_6->forward_comm(this,FORWARD_AD_NONE); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ad<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ad<double,double>(fix->get_double_buffers()); - } else { - fieldforce_none_ad<float,float>(fix->get_single_buffers()); - } - - if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); - - } else { - int n = 0; - for (int k = 0; k<nsplit_alloc/2; k++) { - - poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1], - vdx_brick_none[n], vdy_brick_none[n], - vdz_brick_none[n], vdx_brick_none[n+1], - vdy_brick_none[n+1], vdz_brick_none[n+1], - u_brick_none, v0_brick_none, v1_brick_none, - v2_brick_none, v3_brick_none, v4_brick_none, - v5_brick_none); - n += 2; - } - - cg_6->forward_comm(this,FORWARD_IK_NONE); - - if (fix->precision() == FixIntel::PREC_MODE_MIXED) { - fieldforce_none_ik<float,double>(fix->get_mixed_buffers()); - } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { - fieldforce_none_ik<double,double>(fix->get_double_buffers()); - } else { - fieldforce_none_ik<float,float>(fix->get_single_buffers()); - } - - if (evflag_atom) - cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); - } - if (evflag_atom) fieldforce_none_peratom(); - } - - // update qsum and qsqsum, if atom count has changed and energy needed - - if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) { - qsum_qsq(); - natoms_original = atom->natoms; - } - - // sum energy across procs and add in volume-dependent term - - const double qscale = force->qqrd2e * scale; - if (eflag_global) { - double energy_all; - MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); - energy_1 = energy_all; - MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); - energy_6 = energy_all; - - energy_1 *= 0.5*volume; - energy_6 *= 0.5*volume; - - energy_1 -= g_ewald*qsqsum/MY_PIS + - MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); - energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij + - 1.0/12.0*pow(g_ewald_6,6)*csum; - energy_1 *= qscale; - } - - // sum virial across procs - - if (vflag_global) { - double virial_all[6]; - MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world); - for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; - MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world); - for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i]; - if (function[1]+function[2]+function[3]){ - double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij; - virial[0] -= a; - virial[1] -= a; - virial[2] -= a; - } - } - - if (eflag_atom) { - if (function[0]) { - double *q = atom->q; - for (i = 0; i < atom->nlocal; i++) { - eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]* - qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction - } - } - if (function[1] + function[2] + function[3]) { - int tmp; - for (i = 0; i < atom->nlocal; i++) { - tmp = atom->type[i]; - eatom[i] += - 
MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] + - 1.0/12.0*pow(g_ewald_6,6)*cii[tmp]; - } - } - } - - if (vflag_atom) { - if (function[1] + function[2] + function[3]) { - int tmp; - for (i = 0; i < atom->nlocal; i++) { - tmp = atom->type[i]; - //dispersion self virial correction - for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)* - pow(g_ewald_6,3)*csumi[tmp]; - } - } - } - - - // 2d slab correction - - if (slabflag) slabcorr(eflag); - if (function[0]) energy += energy_1; - if (function[1] + function[2] + function[3]) energy += energy_6; - - // convert atoms back from lamda to box coords - - if (triclinic) domain->lamda2x(atom->nlocal); -} - - -/* ---------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - find center grid pt for each of my particles - check that full stencil for the particle will fit in my 3d brick - store central grid pt indices in part2grid array -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t> -void PPPMDispIntel::particle_map(double delx, double dely, double delz, - double sft, int** p2g, int nup, int nlow, - int nxlo, int nylo, int nzlo, - int nxhi, int nyhi, int nzhi, - IntelBuffers<flt_t,acc_t> *buffers) -{ - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) - error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); - - int flag = 0; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\ - nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt) - #endif - { - double **x = atom->x; - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delx; - const flt_t yi = dely; - const flt_t zi = delz; - const flt_t fshift = sft; - - - int iifrom, iito, tid; - IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd reduction(+:flag) - #endif - for (int i = iifrom; i < iito; i++) { - - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // current particle coord can be outside global and local box - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET; - int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET; - int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET; - - p2g[i][0] = nx; - p2g[i][1] = ny; - p2g[i][2] = nz; - - // check that entire stencil around nx,ny,nz will fit in my 3d brick - - if (nx+nlow < nxlo || nx+nup > nxhi || - ny+nlow < nylo || ny+nup > nyhi || - nz+nlow < nzlo || nz+nup > nzhi) - flag = 1; - } - } - - if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp"); -} - -/* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = charge "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) -{ - // clear 3d density array - - FFT_SCALAR * _noalias 
global_density = - &(density_brick[nzlo_out][nylo_out][nxlo_out]); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - //double *q = atom->q; - //double **x = atom->x; - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, nlocal, global_density) if(!_use_lrt) - #endif - { - double *q = atom->q; - double **x = atom->x; - - const int nix = nxhi_out - nxlo_out + 1; - const int niy = nyhi_out - nylo_out + 1; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshift = shift; - const flt_t fshiftone = shiftone; - const flt_t fdelvolinv = delvolinv; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : - perthread_density[tid - 1]; - // clear 3d density array - memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); - - for (int i = ifrom; i < ito; i++) { - - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - - int nysum = nlower + ny - nylo_out; - int nxsum = nlower + nx - nxlo_out; - int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho_lookup[idx][k]; - rho[1][k] = rho_lookup[idy][k]; - rho[2][k] = rho_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; - - for (int l = order-1; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; - } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; - } - } - - FFT_SCALAR z0 = fdelvolinv * q[i]; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order; n++) { - int mz = n*nix*niy + nzsum; - FFT_SCALAR y0 = z0*rho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order; m++) { - int mzy = m*nix + mz; - FFT_SCALAR x0 = y0*rho[1][m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mzyx = l + mzy; - my_density[mzyx] += x0*rho[0][l]; - } - } - } - } - } - - // reduce all the perthread_densities into global_density - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, global_density) if(!_use_lrt) - #endif - { - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - for(int j = 1; j < nthr; j++) { - global_density[i] += perthread_density[j-1][i]; - } - } - } -} - -/* 
---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = dispersion "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid --- geometric mixing -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) -{ - // clear 3d density array - - FFT_SCALAR * _noalias global_density = - &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, nlocal, global_density) if(!_use_lrt) - #endif - { - int type; - double **x = atom->x; - - const int nix = nxhi_out_6 - nxlo_out_6 + 1; - const int niy = nyhi_out_6 - nylo_out_6 + 1; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshift = shift_6; - const flt_t fshiftone = shiftone_6; - const flt_t fdelvolinv = delvolinv_6; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : - perthread_density[tid - 1]; - - // clear 3d density array - memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); - - for (int i = ifrom; i < ito; i++) { - - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nysum = nlower_6 + ny - nylo_out_6; - int nxsum = nlower_6 + nx - nxlo_out_6; - int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; - - for (int l = order_6-1; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - } - } - - type = atom->type[i]; - FFT_SCALAR z0 = fdelvolinv * B[type]; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n*nix*niy + nzsum; - FFT_SCALAR y0 = z0*rho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int mzy = m*nix + mz; - FFT_SCALAR x0 = y0*rho[1][m]; - #if defined(LMP_SIMD_COMPILER) 
- #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mzyx = l + mzy; - my_density[mzyx] += x0*rho[0][l]; - } - } - } - } - } - - // reduce all the perthread_densities into global_density - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, global_density) if(!_use_lrt) - #endif - { - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - for(int j = 1; j < nthr; j++) { - global_density[i] += perthread_density[j-1][i]; - } - } - } - -} - -/* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = dispersion "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid --- arithmetic mixing -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) -{ - // clear 3d density array - - memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, - ngrid_6*sizeof(FFT_SCALAR)); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - int nlocal = atom->nlocal; - - double **x = atom->x; - - const int nix = nxhi_out_6 - nxlo_out_6 + 1; - const int niy = nyhi_out_6 - nylo_out_6 + 1; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshift = shift_6; - const flt_t fshiftone = shiftone_6; - const flt_t fdelvolinv = delvolinv_6; - - for (int i = 0; i < nlocal; i++) { - - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= 
nupper_6; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; - - for (int l = order_6-1; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - } - } - - const int type = atom->type[i]; - FFT_SCALAR z0 = fdelvolinv; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n + nzsum; - FFT_SCALAR y0 = z0*rho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m + nysum; - FFT_SCALAR x0 = y0*rho[1][m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l + nxsum; - FFT_SCALAR w = x0*rho[0][l]; - density_brick_a0[mz][my][mx] += w*B[7*type]; - density_brick_a1[mz][my][mx] += w*B[7*type+1]; - density_brick_a2[mz][my][mx] += w*B[7*type+2]; - density_brick_a3[mz][my][mx] += w*B[7*type+3]; - density_brick_a4[mz][my][mx] += w*B[7*type+4]; - density_brick_a5[mz][my][mx] += w*B[7*type+5]; - density_brick_a6[mz][my][mx] += w*B[7*type+6]; - } - } - } - } -} - -/* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = dispersion "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid --- case when mixing rules don't apply -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) -{ - - FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, nlocal, global_density) if(!_use_lrt) - #endif - { - int type; - double **x = atom->x; - - const int nix = nxhi_out_6 - nxlo_out_6 + 1; - const int niy = nyhi_out_6 - nylo_out_6 + 1; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshift = shift_6; - const flt_t fshiftone = shiftone_6; - const flt_t fdelvolinv = delvolinv_6; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - FFT_SCALAR * _noalias my_density = tid == 0 ? 
global_density : - perthread_density[tid - 1]; - // clear 3d density array - memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); - - for (int i = ifrom; i < ito; i++) { - - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nysum = nlower_6 + ny - nylo_out_6; - int nxsum = nlower_6 + nx - nxlo_out_6; - int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1,r2,r3; - r1 = r2 = r3 = ZEROF; - - for (int l = order_6-1; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - } - } - - type = atom->type[i]; - FFT_SCALAR z0 = fdelvolinv; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n*nix*niy + nzsum; - FFT_SCALAR y0 = z0*rho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int mzy = m*nix + mz; - FFT_SCALAR x0 = y0*rho[1][m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mzyx = l + mzy; - FFT_SCALAR w0 = x0*rho[0][l]; - for(int k = 0; k < nsplit; k++) - my_density[mzyx + k*ngrid_6] += x0*rho[0][l]; - } - } - } - } - } - - // reduce all the perthread_densities into global_density - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nthr, global_density) if(!_use_lrt) - #endif - { - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - for(int j = 1; j < nthr; j++) { - global_density[i] += perthread_density[j-1][i]; - } - } - } - -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles - for ik scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of E-field on particle - - //double *q = atom->q; - //double **x = atom->x; - //double **f = atom->f; - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double *q = atom->q; - double 
**x = atom->x; - double **f = atom->f; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = qqrd2e * scale; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - - int nxsum = nx + nlower; - int nysum = ny + nlower; - int nzsum = nz + nlower;; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho0[k] = rho_lookup[idx][k]; - rho1[k] = rho_lookup[idy][k]; - rho2[k] = rho_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1 = rho_coeff[order-1][k]; - FFT_SCALAR r2 = rho_coeff[order-1][k]; - FFT_SCALAR r3 = rho_coeff[order-1][k]; - for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1*dx; - r2 = rho_coeff[l][k] + r2*dy; - r3 = rho_coeff[l][k] + r3*dz; - } - - rho0[k-nlower] = r1; - rho1[k-nlower] = r2; - rho2[k-nlower] = r3; - } - } - - _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order; n++) { - int mz = n+nzsum; - FFT_SCALAR z0 = rho2[n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order; m++) { - int my = m+nysum; - FFT_SCALAR y0 = z0*rho1[m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l+nxsum; - FFT_SCALAR x0 = y0*rho0[l]; - ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; - eky_arr[l] -= x0*vdy_brick[mz][my][mx]; - ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; - - } - } - } - - FFT_SCALAR ekx, eky, ekz; - ekx = eky = ekz = ZEROF; - - - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx += ekx_arr[l]; - eky += eky_arr[l]; - ekz += ekz_arr[l]; - } - - // convert E-field to force - - const flt_t qfactor = fqqrd2es * q[i]; - f[i][0] += qfactor*ekx; - f[i][1] += qfactor*eky; - if (slabflag != 2) f[i][2] += qfactor*ekz; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles - for ad scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" 
grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of E-field on particle - - //double *q = atom->q; - //double **x = atom->x; - //double **f = atom->f; - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; - FFT_SCALAR * _noalias const particle_eky = this->particle_eky; - FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double *prd; - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double *q = atom->q; - double **x = atom->x; - double **f = atom->f; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv; - const flt_t yi = delyinv; - const flt_t zi = delzinv; - const flt_t fshiftone = shiftone; - const flt_t fqqrd2es = qqrd2e * scale; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]*slab_volfactor; - - const flt_t hx_inv = nx_pppm/xprd; - const flt_t hy_inv = ny_pppm/yprd; - const flt_t hz_inv = nz_pppm/zprd; - - const flt_t fsf_coeff0 = sf_coeff[0]; - const flt_t fsf_coeff1 = sf_coeff[1]; - const flt_t fsf_coeff2 = sf_coeff[2]; - const flt_t fsf_coeff3 = sf_coeff[3]; - const flt_t fsf_coeff4 = sf_coeff[4]; - const flt_t fsf_coeff5 = sf_coeff[5]; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid[i][0]; - int ny = part2grid[i][1]; - int nz = part2grid[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - int nxsum = nx + nlower; - int nysum = ny + nlower; - int nzsum = nz + nlower; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho_lookup[idx][k]; - rho[1][k] = rho_lookup[idy][k]; - rho[2][k] = rho_lookup[idz][k]; - drho[0][k] = drho_lookup[idx][k]; - drho[1][k] = drho_lookup[idy][k]; - drho[2][k] = drho_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower; k <= nupper; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; - - r1 = rho_coeff[order-1][k]; - r2 = rho_coeff[order-1][k]; - r3 = rho_coeff[order-1][k]; - for (int l = order-2; l >= 0; l--) { - r1 = rho_coeff[l][k] + r1 * dx; - r2 = rho_coeff[l][k] + r2 * dy; - r3 = rho_coeff[l][k] + r3 * dz; - dr1 = drho_coeff[l][k] + dr1 * dx; - dr2 = drho_coeff[l][k] + dr2 * dy; - dr3 = drho_coeff[l][k] + dr3 * dz; - } - rho[0][k-nlower] = r1; - rho[1][k-nlower] = r2; - rho[2][k-nlower] = r3; - drho[0][k-nlower] = dr1; - drho[1][k-nlower] = dr2; - drho[2][k-nlower] = dr3; - } - } - _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - particle_ekx[i] = particle_eky[i] = 
particle_ekz[i] = ZEROF; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order; n++) { - int mz = n + nzsum; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order; m++) { - int my = m + nysum; - FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; - FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; - FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l + nxsum; - ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; - eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; - ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx]; - } - } - } - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ - particle_ekx[i] += ekx[l]; - particle_eky[i] += eky[l]; - particle_ekz[i] += ekz[l]; - } - } - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - particle_ekx[i] *= hx_inv; - particle_eky[i] *= hy_inv; - particle_ekz[i] *= hz_inv; - - // convert E-field to force - - const flt_t qfactor = fqqrd2es * q[i]; - const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; - - const flt_t s1 = x[i][0] * hx_inv; - const flt_t s2 = x[i][1] * hy_inv; - const flt_t s3 = x[i][2] * hz_inv; - flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); - sf += fsf_coeff1 * sin(ffour_pi * s1); - sf *= twoqsq; - f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf; - - sf = fsf_coeff2 * sin(ftwo_pi * s2); - sf += fsf_coeff3 * sin(ffour_pi * s2); - sf *= twoqsq; - f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf; - - sf = fsf_coeff4 * sin(ftwo_pi * s3); - sf += fsf_coeff5 * sin(ffour_pi * s3); - sf *= twoqsq; - - if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for geometric mixing rule -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double lj; - int type; - double **x = atom->x; - double **f = atom->f; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - 
FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho0[k] = rho6_lookup[idx][k]; - rho1[k] = rho6_lookup[idy][k]; - rho2[k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - - rho0[k-nlower_6] = r1; - rho1[k-nlower_6] = r2; - rho2[k-nlower_6] = r3; - } - } - - _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n+nzsum; - FFT_SCALAR z0 = rho2[n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m+nysum; - FFT_SCALAR y0 = z0*rho1[m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l+nxsum; - FFT_SCALAR x0 = y0*rho0[l]; - ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx]; - eky_arr[l] -= x0*vdy_brick_g[mz][my][mx]; - ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx]; - - } - } - } - - FFT_SCALAR ekx, eky, ekz; - ekx = eky = ekz = ZEROF; - - - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx += ekx_arr[l]; - eky += eky_arr[l]; - ekz += ekz_arr[l]; - } - - // convert E-field to force - - type = atom->type[i]; - lj = B[type]; - f[i][0] += lj*ekx; - f[i][1] += lj*eky; - if (slabflag != 2) f[i][2] += lj*ekz; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for geometric mixing rule for ad scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; - FFT_SCALAR * _noalias const particle_eky = this->particle_eky; - FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double *prd; - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double **x = atom->x; - double **f = atom->f; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - 
const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]*slab_volfactor; - - const flt_t hx_inv = nx_pppm_6/xprd; - const flt_t hy_inv = ny_pppm_6/yprd; - const flt_t hz_inv = nz_pppm_6/zprd; - - const flt_t fsf_coeff0 = sf_coeff_6[0]; - const flt_t fsf_coeff1 = sf_coeff_6[1]; - const flt_t fsf_coeff2 = sf_coeff_6[2]; - const flt_t fsf_coeff3 = sf_coeff_6[3]; - const flt_t fsf_coeff4 = sf_coeff_6[4]; - const flt_t fsf_coeff5 = sf_coeff_6[5]; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - drho[0][k] = drho6_lookup[idx][k]; - drho[1][k] = drho6_lookup[idy][k]; - drho[2][k] = drho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; - - r1 = rho_coeff_6[order_6-1][k]; - r2 = rho_coeff_6[order_6-1][k]; - r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1 * dx; - r2 = rho_coeff_6[l][k] + r2 * dy; - r3 = rho_coeff_6[l][k] + r3 * dz; - dr1 = drho_coeff_6[l][k] + dr1 * dx; - dr2 = drho_coeff_6[l][k] + dr2 * dy; - dr3 = drho_coeff_6[l][k] + dr3 * dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - drho[0][k-nlower_6] = dr1; - drho[1][k-nlower_6] = dr2; - drho[2][k-nlower_6] = dr3; - } - } - _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n + nzsum; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m + nysum; - FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; - FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; - FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l + nxsum; - ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx]; - eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx]; - ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx]; - } - } - } - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < 
INTEL_P3M_ALIGNED_MAXORDER; l++){ - particle_ekx[i] += ekx[l]; - particle_eky[i] += eky[l]; - particle_ekz[i] += ekz[l]; - } - } - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - particle_ekx[i] *= hx_inv; - particle_eky[i] *= hy_inv; - particle_ekz[i] *= hz_inv; - - // convert E-field to force - - const int type = atom->type[i]; - const flt_t lj = B[type]; - const flt_t twoljsq = 2.*lj*lj; - - const flt_t s1 = x[i][0] * hx_inv; - const flt_t s2 = x[i][1] * hy_inv; - const flt_t s3 = x[i][2] * hz_inv; - flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); - sf += fsf_coeff1 * sin(ffour_pi * s1); - sf *= twoljsq; - f[i][0] += lj * particle_ekx[i] - sf; - - sf = fsf_coeff2 * sin(ftwo_pi * s2); - sf += fsf_coeff3 * sin(ffour_pi * s2); - sf *= twoljsq; - f[i][1] += lj * particle_eky[i] - sf; - - sf = fsf_coeff4 * sin(ftwo_pi * s3); - sf += fsf_coeff5 * sin(ffour_pi * s3); - sf *= twoljsq; - - if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for arithmetic mixing rule and ik scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - double **x = atom->x; - double **f = atom->f; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho0[k] = rho6_lookup[idx][k]; - rho1[k] = rho6_lookup[idy][k]; - rho2[k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + 
r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - - rho0[k-nlower_6] = r1; - rho1[k-nlower_6] = r2; - rho2[k-nlower_6] = r3; - } - } - - _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n+nzsum; - FFT_SCALAR z0 = rho2[n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m+nysum; - FFT_SCALAR y0 = z0*rho1[m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l+nxsum; - FFT_SCALAR x0 = y0*rho0[l]; - ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx]; - eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx]; - ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx]; - ekx1_arr[l] -= x0*vdx_brick_a1[mz][my][mx]; - eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx]; - ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx]; - ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx]; - eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx]; - ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx]; - ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx]; - eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx]; - ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx]; - ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx]; - eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx]; - ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx]; - ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx]; - eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx]; - ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx]; - ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx]; - eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx]; - ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx]; - } - } - } - - FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; - FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; - FFT_SCALAR ekx6, eky6, ekz6; - ekx0 = eky0 = ekz0 = ZEROF; - ekx1 = eky1 = ekz1 = ZEROF; - ekx2 = eky2 = ekz2 = ZEROF; - ekx3 = eky3 = ekz3 = ZEROF; - ekx4 = eky4 = ekz4 = ZEROF; - ekx5 = eky5 = ekz5 = ZEROF; - ekx6 = eky6 = ekz6 = ZEROF; - - for (int l = 0; l < 
INTEL_P3M_ALIGNED_MAXORDER; l++) { - ekx0 += ekx0_arr[l]; - eky0 += eky0_arr[l]; - ekz0 += ekz0_arr[l]; - ekx1 += ekx1_arr[l]; - eky1 += eky1_arr[l]; - ekz1 += ekz1_arr[l]; - ekx2 += ekx2_arr[l]; - eky2 += eky2_arr[l]; - ekz2 += ekz2_arr[l]; - ekx3 += ekx3_arr[l]; - eky3 += eky3_arr[l]; - ekz3 += ekz3_arr[l]; - ekx4 += ekx4_arr[l]; - eky4 += eky4_arr[l]; - ekz4 += ekz4_arr[l]; - ekx5 += ekx5_arr[l]; - eky5 += eky5_arr[l]; - ekz5 += ekz5_arr[l]; - ekx6 += ekx6_arr[l]; - eky6 += eky6_arr[l]; - ekz6 += ekz6_arr[l]; - } - - // convert D-field to force - - const int type = atom->type[i]; - const FFT_SCALAR lj0 = B[7*type+6]; - const FFT_SCALAR lj1 = B[7*type+5]; - const FFT_SCALAR lj2 = B[7*type+4]; - const FFT_SCALAR lj3 = B[7*type+3]; - const FFT_SCALAR lj4 = B[7*type+2]; - const FFT_SCALAR lj5 = B[7*type+1]; - const FFT_SCALAR lj6 = B[7*type]; - - f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + - lj4*ekx4 + lj5*ekx5 + lj6*ekx6; - f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + - lj4*eky4 + lj5*eky5 + lj6*eky6; - if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + - lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for arithmetic mixing rule for the ad scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0; - FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0; - FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0; - FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1; - FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1; - FFT_SCALAR * _noalias const particle_ekz1 = this->particle_ekz1; - FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2; - FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2; - FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2; - FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3; - FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3; - FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3; - FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4; - FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4; - FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4; - FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5; - FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5; - FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5; - FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6; - FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6; - FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double *prd; - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double 
**x = atom->x; - double **f = atom->f; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]*slab_volfactor; - - const flt_t hx_inv = nx_pppm_6/xprd; - const flt_t hy_inv = ny_pppm_6/yprd; - const flt_t hz_inv = nz_pppm_6/zprd; - - const flt_t fsf_coeff0 = sf_coeff_6[0]; - const flt_t fsf_coeff1 = sf_coeff_6[1]; - const flt_t fsf_coeff2 = sf_coeff_6[2]; - const flt_t fsf_coeff3 = sf_coeff_6[3]; - const flt_t fsf_coeff4 = sf_coeff_6[4]; - const flt_t fsf_coeff5 = sf_coeff_6[5]; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - drho[0][k] = drho6_lookup[idx][k]; - drho[1][k] = drho6_lookup[idy][k]; - drho[2][k] = drho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; - - r1 = rho_coeff_6[order_6-1][k]; - r2 = rho_coeff_6[order_6-1][k]; - r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1 * dx; - r2 = rho_coeff_6[l][k] + r2 * dy; - r3 = rho_coeff_6[l][k] + r3 * dz; - dr1 = drho_coeff_6[l][k] + dr1 * dx; - dr2 = drho_coeff_6[l][k] + dr2 * dy; - dr3 = drho_coeff_6[l][k] + dr3 * dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - drho[0][k-nlower_6] = dr1; - drho[1][k-nlower_6] = dr2; - drho[2][k-nlower_6] = dr3; - } - } - _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = 
{0}; - _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF; - particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; - particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF; - particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF; - particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF; - particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF; - particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF; - - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n + nzsum; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m + nysum; - FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; - FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; - FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l + nxsum; - FFT_SCALAR x0 = drho[0][l] * ekx_p; - FFT_SCALAR y0 = rho[0][l] * eky_p; - FFT_SCALAR z0 = rho[0][l] * ekz_p; - - ekx0[l] += x0 * u_brick_a0[mz][my][mx]; - eky0[l] += y0 * u_brick_a0[mz][my][mx]; - ekz0[l] += z0 * u_brick_a0[mz][my][mx]; - ekx1[l] += x0 * u_brick_a1[mz][my][mx]; - eky1[l] += y0 * u_brick_a1[mz][my][mx]; - ekz1[l] += z0 * u_brick_a1[mz][my][mx]; - ekx2[l] += x0 * u_brick_a2[mz][my][mx]; - eky2[l] += y0 * u_brick_a2[mz][my][mx]; - ekz2[l] += z0 * u_brick_a2[mz][my][mx]; - ekx3[l] += x0 * u_brick_a3[mz][my][mx]; - eky3[l] += y0 * u_brick_a3[mz][my][mx]; - ekz3[l] += z0 * u_brick_a3[mz][my][mx]; - ekx4[l] += x0 * u_brick_a4[mz][my][mx]; - eky4[l] += y0 * u_brick_a4[mz][my][mx]; - ekz4[l] += z0 * u_brick_a4[mz][my][mx]; - ekx5[l] += x0 * u_brick_a5[mz][my][mx]; - eky5[l] += y0 * u_brick_a5[mz][my][mx]; - ekz5[l] += z0 * u_brick_a5[mz][my][mx]; - ekx6[l] += x0 * u_brick_a6[mz][my][mx]; - eky6[l] += y0 * u_brick_a6[mz][my][mx]; - ekz6[l] += z0 * u_brick_a6[mz][my][mx]; - } - } - } - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ - particle_ekx0[i] += ekx0[l]; - particle_eky0[i] += eky0[l]; - particle_ekz0[i] += ekz0[l]; - particle_ekx1[i] += ekx1[l]; - particle_eky1[i] += eky1[l]; - particle_ekz1[i] += ekz1[l]; - particle_ekx2[i] += ekx2[l]; - particle_eky2[i] += eky2[l]; - particle_ekz2[i] += ekz2[l]; - particle_ekx3[i] += ekx3[l]; - particle_eky3[i] += eky3[l]; - particle_ekz3[i] += ekz3[l]; - particle_ekx4[i] += ekx4[l]; - particle_eky4[i] += eky4[l]; - particle_ekz4[i] += ekz4[l]; - particle_ekx5[i] += ekx5[l]; - particle_eky5[i] += eky5[l]; - particle_ekz5[i] += ekz5[l]; - particle_ekx6[i] += ekx6[l]; - particle_eky6[i] += eky6[l]; - particle_ekz6[i] += ekz6[l]; - } - } - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int i = ifrom; i < ito; i++) { - particle_ekx0[i] *= hx_inv; - particle_eky0[i] *= hy_inv; - particle_ekz0[i] *= hz_inv; - particle_ekx1[i] *= hx_inv; - particle_eky1[i] *= 
hy_inv; - particle_ekz1[i] *= hz_inv; - particle_ekx2[i] *= hx_inv; - particle_eky2[i] *= hy_inv; - particle_ekz2[i] *= hz_inv; - particle_ekx3[i] *= hx_inv; - particle_eky3[i] *= hy_inv; - particle_ekz3[i] *= hz_inv; - particle_ekx4[i] *= hx_inv; - particle_eky4[i] *= hy_inv; - particle_ekz4[i] *= hz_inv; - particle_ekx5[i] *= hx_inv; - particle_eky5[i] *= hy_inv; - particle_ekz5[i] *= hz_inv; - particle_ekx6[i] *= hx_inv; - particle_eky6[i] *= hy_inv; - particle_ekz6[i] *= hz_inv; - - // convert D-field to force - - const int type = atom->type[i]; - const FFT_SCALAR lj0 = B[7*type+6]; - const FFT_SCALAR lj1 = B[7*type+5]; - const FFT_SCALAR lj2 = B[7*type+4]; - const FFT_SCALAR lj3 = B[7*type+3]; - const FFT_SCALAR lj4 = B[7*type+2]; - const FFT_SCALAR lj5 = B[7*type+1]; - const FFT_SCALAR lj6 = B[7*type]; - - const flt_t s1 = x[i][0] * hx_inv; - const flt_t s2 = x[i][1] * hy_inv; - const flt_t s3 = x[i][2] * hz_inv; - flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); - sf += fsf_coeff1 * sin(ffour_pi * s1); - sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] + - lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] + - lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf; - - sf = fsf_coeff2 * sin(ftwo_pi * s2); - sf += fsf_coeff3 * sin(ffour_pi * s2); - sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] + - lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] + - lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf; - - sf = fsf_coeff4 * sin(ftwo_pi * s3); - sf += fsf_coeff5 * sin(ffour_pi * s3); - sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; - if (slabflag != 2) - f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] + - lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] + - lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf; - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for no mixing rule and ik scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) -{ - - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double lj; - int type; - double **x = atom->x; - double **f = atom->f; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - - int nxsum = nx + nlower_6; - int nysum = ny + 
nlower_6; - int nzsum = nz + nlower_6; - - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho0[k] = rho6_lookup[idx][k]; - rho1[k] = rho6_lookup[idy][k]; - rho2[k] = rho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; - FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1*dx; - r2 = rho_coeff_6[l][k] + r2*dy; - r3 = rho_coeff_6[l][k] + r3*dz; - } - - rho0[k-nlower_6] = r1; - rho1[k-nlower_6] = r2; - rho2[k-nlower_6] = r3; - } - } - - - _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); - _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); - _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); - - for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { - ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF; - } - - for (int k = 0; k < nsplit; k++) { - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n+nzsum; - FFT_SCALAR z0 = rho2[n]; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m+nysum; - FFT_SCALAR y0 = z0*rho1[m]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l+nxsum; - FFT_SCALAR x0 = y0*rho0[l]; - ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= - x0*vdx_brick_none[k][mz][my][mx]; - eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= - x0*vdy_brick_none[k][mz][my][mx]; - ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= - x0*vdz_brick_none[k][mz][my][mx]; - } - } - } - } - - _alignvar(FFT_SCALAR ekx[nsplit], 64); - _alignvar(FFT_SCALAR eky[nsplit], 64); - _alignvar(FFT_SCALAR ekz[nsplit], 64); - for (int k = 0; k < nsplit; k++) { - ekx[k] = eky[k] = ekz[k] = ZEROF; - } - - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - for (int k = 0; k < nsplit; k++) { - ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; - } - } - - // convert E-field to force - - type = atom->type[i]; - for (int k = 0; k < nsplit; k++) { - lj = B[nsplit*type + k]; - f[i][0] += lj*ekx[k]; - f[i][1] += lj*eky[k]; - if (slabflag != 2) f[i][2] += lj*ekz[k]; - } - } - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get dispersion field & force on my particles - for no mixing rule for the ad scheme -------------------------------------------------------------------------- */ - -template<class flt_t, class acc_t, int use_table> -void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) -{ - // loop over my charges, interpolate electric field from nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving 
stencil pt - // ek = 3 components of dispersion field on particle - - int nlocal = atom->nlocal; - int nthr = comm->nthreads; - - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(nlocal, nthr) if(!_use_lrt) - #endif - { - - double *prd; - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double **x = atom->x; - double **f = atom->f; - const flt_t ftwo_pi = MY_PI * 2.0; - const flt_t ffour_pi = MY_PI * 4.0; - - const flt_t lo0 = boxlo[0]; - const flt_t lo1 = boxlo[1]; - const flt_t lo2 = boxlo[2]; - const flt_t xi = delxinv_6; - const flt_t yi = delyinv_6; - const flt_t zi = delzinv_6; - const flt_t fshiftone = shiftone_6; - - const double xprd = prd[0]; - const double yprd = prd[1]; - const double zprd = prd[2]*slab_volfactor; - - const flt_t hx_inv = nx_pppm_6/xprd; - const flt_t hy_inv = ny_pppm_6/yprd; - const flt_t hz_inv = nz_pppm_6/zprd; - - const flt_t fsf_coeff0 = sf_coeff_6[0]; - const flt_t fsf_coeff1 = sf_coeff_6[1]; - const flt_t fsf_coeff2 = sf_coeff_6[2]; - const flt_t fsf_coeff3 = sf_coeff_6[3]; - const flt_t fsf_coeff4 = sf_coeff_6[4]; - const flt_t fsf_coeff5 = sf_coeff_6[5]; - - int ifrom, ito, tid; - IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); - - _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - - for (int i = ifrom; i < ito; i++) { - int nx = part2grid_6[i][0]; - int ny = part2grid_6[i][1]; - int nz = part2grid_6[i][2]; - FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; - FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; - FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; - - int nxsum = nx + nlower_6; - int nysum = ny + nlower_6; - int nzsum = nz + nlower_6; - - if (use_table) { - dx = dx*half_rho_scale + half_rho_scale_plus; - int idx = dx; - dy = dy*half_rho_scale + half_rho_scale_plus; - int idy = dy; - dz = dz*half_rho_scale + half_rho_scale_plus; - int idz = dz; - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho[0][k] = rho6_lookup[idx][k]; - rho[1][k] = rho6_lookup[idy][k]; - rho[2][k] = rho6_lookup[idz][k]; - drho[0][k] = drho6_lookup[idx][k]; - drho[1][k] = drho6_lookup[idy][k]; - drho[2][k] = drho6_lookup[idz][k]; - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k = nlower_6; k <= nupper_6; k++) { - FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; - dr1 = dr2 = dr3 = ZEROF; - - r1 = rho_coeff_6[order_6-1][k]; - r2 = rho_coeff_6[order_6-1][k]; - r3 = rho_coeff_6[order_6-1][k]; - for (int l = order_6-2; l >= 0; l--) { - r1 = rho_coeff_6[l][k] + r1 * dx; - r2 = rho_coeff_6[l][k] + r2 * dy; - r3 = rho_coeff_6[l][k] + r3 * dz; - dr1 = drho_coeff_6[l][k] + dr1 * dx; - dr2 = drho_coeff_6[l][k] + dr2 * dy; - dr3 = drho_coeff_6[l][k] + dr3 * dz; - } - rho[0][k-nlower_6] = r1; - rho[1][k-nlower_6] = r2; - rho[2][k-nlower_6] = r3; - drho[0][k-nlower_6] = dr1; - drho[1][k-nlower_6] = dr2; - drho[2][k-nlower_6] = dr3; - } - } - _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); - _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); - _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); - - for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { - ekx[k]=eky[k]=ekz[k]=ZEROF; - } - - for (int k = 0; k < nsplit; k++) { - particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int n = 0; n < order_6; n++) { - int mz = n + nzsum; - #if 
defined(LMP_SIMD_COMPILER) - #pragma loop_count=7 - #endif - for (int m = 0; m < order_6; m++) { - int my = m + nysum; - FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; - FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; - FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { - int mx = l + nxsum; - ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * - u_brick_none[k][mz][my][mx]; - eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * eky_p * - u_brick_none[k][mz][my][mx]; - ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * ekz_p * - u_brick_none[k][mz][my][mx]; - } - } - } - } - - _alignvar(FFT_SCALAR ekx_tot[nsplit], 64); - _alignvar(FFT_SCALAR eky_tot[nsplit], 64); - _alignvar(FFT_SCALAR ekz_tot[nsplit], 64); - for (int k = 0; k < nsplit; k++) { - ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF; - } - - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ - for (int k = 0; k < nsplit; k++) { - ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l]; - eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l]; - ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l]; - } - } - - for (int k = 0; k < nsplit; k++) { - ekx_tot[k] *= hx_inv; - eky_tot[k] *= hy_inv; - ekz_tot[k] *= hz_inv; - } - // convert D-field to force - - const int type = atom->type[i]; - - const flt_t s1 = x[i][0] * hx_inv; - const flt_t s2 = x[i][1] * hy_inv; - const flt_t s3 = x[i][2] * hz_inv; - flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1); - sf1 += fsf_coeff1 * sin(ffour_pi * s1); - - flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2); - sf2 += fsf_coeff3 * sin(ffour_pi * s2); - - flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3); - sf3 += fsf_coeff5 * sin(ffour_pi * s3); - for (int k = 0; k < nsplit; k++) { - const flt_t lj = B[nsplit*type + k]; - const flt_t twoljsq = lj*lj * B[k] * 2; - flt_t sf = sf1*twoljsq; - f[i][0] += lj * ekx_tot[k] - sf; - sf = sf2*twoljsq; - f[i][1] += lj * eky_tot[k] - sf; - sf = sf3*twoljsq; - if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf; - } - } - } -} - -/* ---------------------------------------------------------------------- - precompute rho coefficients as a lookup table to save time in make_rho - and fieldforce. Instead of doing this polynomial for every atom 6 times - per time step, precompute it for some number of points. -------------------------------------------------------------------------- */ - -void PPPMDispIntel::precompute_rho() -{ - - half_rho_scale = (rho_points - 1.)/2.; - half_rho_scale_plus = half_rho_scale + 0.5; - - for (int i = 0; i < rho_points; i++) { - FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k=nlower; k<=nupper;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order-1; l>=0; l--){ - r1 = rho_coeff[l][k] + r1*dx; - } - rho_lookup[i][k-nlower] = r1; - } - for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho_lookup[i][k] = 0; - } - if (differentiation_flag == 1) { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k=nlower; k<=nupper;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order-2; l>=0; l--){ - r1 = drho_coeff[l][k] + r1*dx; - } - drho_lookup[i][k-nlower] = r1; - } - for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - drho_lookup[i][k] = 0; - } - } - } - for (int i = 0; i < rho_points; i++) { - FFT_SCALAR dx = -1. 
+ 1./half_rho_scale * (FFT_SCALAR)i; - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for (int k=nlower_6; k<=nupper_6;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order_6-1; l>=0; l--){ - r1 = rho_coeff_6[l][k] + r1*dx; - } - rho6_lookup[i][k-nlower_6] = r1; - } - for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - rho6_lookup[i][k] = 0; - } - if (differentiation_flag == 1) { - #if defined(LMP_SIMD_COMPILER) - #pragma simd - #endif - for(int k=nlower_6; k<=nupper_6;k++){ - FFT_SCALAR r1 = ZEROF; - for(int l=order_6-2; l>=0; l--){ - r1 = drho_coeff_6[l][k] + r1*dx; - } - drho6_lookup[i][k-nlower_6] = r1; - } - for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { - drho6_lookup[i][k] = 0; - } - } - } -} - -/* ---------------------------------------------------------------------- - Returns 0 if Intel optimizations for PPPM ignored due to offload -------------------------------------------------------------------------- */ - -#ifdef _LMP_INTEL_OFFLOAD -int PPPMDispIntel::use_base() { - return _use_base; -} -#endif +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include <mpi.h> +#include <stdlib.h> +#include <math.h> +#include "pppm_disp_intel.h" +#include "atom.h" +#include "error.h" +#include "fft3d_wrap.h" +#include "gridcomm.h" +#include "math_const.h" +#include "math_special.h" +#include "memory.h" +#include "suffix.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; +enum{REVERSE_RHO, REVERSE_RHO_G, REVERSE_RHO_A, REVERSE_RHO_NONE}; +enum{FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM, + FORWARD_IK_G, FORWARD_AD_G, FORWARD_IK_PERATOM_G, FORWARD_AD_PERATOM_G, + FORWARD_IK_A, FORWARD_AD_A, FORWARD_IK_PERATOM_A, FORWARD_AD_PERATOM_A, + FORWARD_IK_NONE, FORWARD_AD_NONE, FORWARD_IK_PERATOM_NONE, + FORWARD_AD_PERATOM_NONE}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPMDispIntel::PPPMDispIntel(LAMMPS *lmp, int narg, char **arg) : + PPPMDisp(lmp, narg, arg) +{ + suffix_flag |= Suffix::INTEL; + + order = 7; + order_6 = 7; //sets default stencil sizes to 7 + + perthread_density = NULL; + particle_ekx = particle_eky = particle_ekz = NULL; + particle_ekx0 = particle_eky0 = particle_ekz0 = NULL; + particle_ekx1 = particle_eky1 = particle_ekz1 = NULL; + particle_ekx2 = particle_eky2 = particle_ekz2 = NULL; + particle_ekx3 = particle_eky3 = particle_ekz3 = NULL; + particle_ekx4 = particle_eky4 = particle_ekz4 = 
NULL; + particle_ekx5 = particle_eky5 = particle_ekz5 = NULL; + particle_ekx6 = particle_eky6 = particle_ekz6 = NULL; + + rho_lookup = drho_lookup = NULL; + rho6_lookup = drho6_lookup = NULL; + rho_points = 0; + + _use_table = _use_packing = _use_lrt = 0; +} + +PPPMDispIntel::~PPPMDispIntel() +{ + memory->destroy(perthread_density); + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + + memory->destroy(rho_lookup); + memory->destroy(drho_lookup); + memory->destroy(rho6_lookup); + memory->destroy(drho6_lookup); +} + + + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + + +void PPPMDispIntel::init() +{ + + PPPMDisp::init(); + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + fix = static_cast<FixIntel *>(modify->fix[ifix]); + + #ifdef _LMP_INTEL_OFFLOAD + _use_base = 0; + if (fix->offload_balance() != 0.0) { + _use_base = 1; + return; + } + #endif + + fix->kspace_init_check(); + + _use_lrt = fix->lrt(); + if (_use_lrt) + error->all(FLERR, + "LRT mode is currently not supported for pppm/disp/intel"); + + + // For vectorization, we need some padding in the end + // The first thread computes on the global density + if ((comm->nthreads > 1) && !_use_lrt) { + memory->destroy(perthread_density); + memory->create(perthread_density, comm->nthreads-1, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:perthread_density"); + } + + _use_table = fix->pppm_table(); + if (_use_table) { + rho_points = 5000; + memory->destroy(rho_lookup); + memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho_lookup"); + memory->destroy(rho6_lookup); + memory->create(rho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:rho6_lookup"); + + if(differentiation_flag == 1) { + memory->destroy(drho_lookup); + memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho_lookup"); + memory->destroy(drho6_lookup); + memory->create(drho6_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, + "pppmdispintel:drho6_lookup"); + } + precompute_rho(); + } + if (order > INTEL_P3M_MAXORDER) + error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); +} + +/* ---------------------------------------------------------------------- + compute the PPPMDispIntel long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMDispIntel::compute(int eflag, int vflag) +{ + #ifdef _LMP_INTEL_OFFLOAD + if (_use_base) { + PPPMDisp::compute(eflag, vflag); + return; + } + #endif + int i; + // convert atoms from box to lamda coords + + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = evflag_atom = eflag_global = vflag_global = + eflag_atom = vflag_atom = 0; + + if (evflag_atom && !peratom_allocate_flag) { + allocate_peratom(); + if (function[0]) { + cg_peratom->ghost_notify(); + cg_peratom->setup(); + } + if (function[1] + function[2] + function[3]) { + cg_peratom_6->ghost_notify(); + cg_peratom_6->setup(); + } + peratom_allocate_flag = 1; + } + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + + if (function[0]) memory->destroy(part2grid); + if (function[1] + function[2] + 
function[3]) memory->destroy(part2grid_6); + if (differentiation_flag == 1) { + memory->destroy(particle_ekx); + memory->destroy(particle_eky); + memory->destroy(particle_ekz); + if (function[2] == 1){ + memory->destroy(particle_ekx0); + memory->destroy(particle_eky0); + memory->destroy(particle_ekz0); + memory->destroy(particle_ekx1); + memory->destroy(particle_eky1); + memory->destroy(particle_ekz1); + memory->destroy(particle_ekx2); + memory->destroy(particle_eky2); + memory->destroy(particle_ekz2); + memory->destroy(particle_ekx3); + memory->destroy(particle_eky3); + memory->destroy(particle_ekz3); + memory->destroy(particle_ekx4); + memory->destroy(particle_eky4); + memory->destroy(particle_ekz4); + memory->destroy(particle_ekx5); + memory->destroy(particle_eky5); + memory->destroy(particle_ekz5); + memory->destroy(particle_ekx6); + memory->destroy(particle_eky6); + memory->destroy(particle_ekz6); + } + + } + nmax = atom->nmax; + if (function[0]) memory->create(part2grid,nmax,3,"pppm/disp:part2grid"); + if (function[1] + function[2] + function[3]) + memory->create(part2grid_6,nmax,3,"pppm/disp:part2grid_6"); + if (differentiation_flag == 1) { + memory->create(particle_ekx, nmax, "pppmdispintel:pekx"); + memory->create(particle_eky, nmax, "pppmdispintel:peky"); + memory->create(particle_ekz, nmax, "pppmdispintel:pekz"); + if (function[2] == 1){ + memory->create(particle_ekx0, nmax, "pppmdispintel:pekx0"); + memory->create(particle_eky0, nmax, "pppmdispintel:peky0"); + memory->create(particle_ekz0, nmax, "pppmdispintel:pekz0"); + memory->create(particle_ekx1, nmax, "pppmdispintel:pekx1"); + memory->create(particle_eky1, nmax, "pppmdispintel:peky1"); + memory->create(particle_ekz1, nmax, "pppmdispintel:pekz1"); + memory->create(particle_ekx2, nmax, "pppmdispintel:pekx2"); + memory->create(particle_eky2, nmax, "pppmdispintel:peky2"); + memory->create(particle_ekz2, nmax, "pppmdispintel:pekz2"); + memory->create(particle_ekx3, nmax, "pppmdispintel:pekx3"); + memory->create(particle_eky3, nmax, "pppmdispintel:peky3"); + memory->create(particle_ekz3, nmax, "pppmdispintel:pekz3"); + memory->create(particle_ekx4, nmax, "pppmdispintel:pekx4"); + memory->create(particle_eky4, nmax, "pppmdispintel:peky4"); + memory->create(particle_ekz4, nmax, "pppmdispintel:pekz4"); + memory->create(particle_ekx5, nmax, "pppmdispintel:pekx5"); + memory->create(particle_eky5, nmax, "pppmdispintel:peky5"); + memory->create(particle_ekz5, nmax, "pppmdispintel:pekz5"); + memory->create(particle_ekx6, nmax, "pppmdispintel:pekx6"); + memory->create(particle_eky6, nmax, "pppmdispintel:peky6"); + memory->create(particle_ekz6, nmax, "pppmdispintel:pekz6"); + } + } + } + energy = 0.0; + energy_1 = 0.0; + energy_6 = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial_6[i] = virial_1[i] = 0.0; + + // find grid points for all my particles + // distribute partcles' charges/dispersion coefficients on the grid + // communication between processors and remapping two fft + // Solution of poissons equation in k-space and backtransformation + // communication between processors + // calculation of forces + + if (function[0]) { + + //perform calculations for coulomb interactions only + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_mixed_buffers()); + make_rho_c<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + 
particle_map<double,double>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, + nzlo_out, nxhi_out, nyhi_out, nzhi_out, + fix->get_double_buffers()); + make_rho_c<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv, delyinv, delzinv, shift, part2grid, + nupper, nlower, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, + fix->get_single_buffers()); + make_rho_c<float,float>(fix->get_single_buffers()); + } + + cg->reverse_comm(this,REVERSE_RHO); + + brick2fft(nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + density_brick, density_fft, work1,remap); + + if (differentiation_flag == 1) { + poisson_ad(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, virial_1, vg,vg2, u_brick, v0_brick, + v1_brick, v2_brick, v3_brick, v4_brick, v5_brick); + + cg->forward_comm(this,FORWARD_AD); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_c_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom->forward_comm(this, FORWARD_AD_PERATOM); + + } else { + poisson_ik(work1, work2, density_fft, fft1, fft2, + nx_pppm, ny_pppm, nz_pppm, nfft, + nxlo_fft, nylo_fft, nzlo_fft, nxhi_fft, nyhi_fft, nzhi_fft, + nxlo_in, nylo_in, nzlo_in, nxhi_in, nyhi_in, nzhi_in, + energy_1, greensfn, fkx, fky, fkz,fkx2, fky2, fkz2, + vdx_brick, vdy_brick, vdz_brick, virial_1, vg,vg2, + u_brick, v0_brick, v1_brick, v2_brick, v3_brick, v4_brick, + v5_brick); + + cg->forward_comm(this, FORWARD_IK); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_c_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_c_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_c_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom->forward_comm(this, FORWARD_IK_PERATOM); + } + if (evflag_atom) fieldforce_c_peratom(); + } + + if (function[1]) { + //perfrom calculations for geometric mixing + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_g<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_g<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_g<float,float>(fix->get_single_buffers()); + } + + + cg_6->reverse_comm(this, REVERSE_RHO_G); + + brick2fft(nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, nzhi_in_6, + density_brick_g, density_fft_g, work1_6,remap_6); + + if (differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, 
density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, + nxlo_fft_6, nylo_fft_6, nzlo_fft_6, nxhi_fft_6, + nyhi_fft_6, nzhi_fft_6, nxlo_in_6, nylo_in_6, nzlo_in_6, + nxhi_in_6, nyhi_in_6, nzhi_in_6, energy_6, greensfn_6, + virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, v1_brick_g, + v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_AD_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_g_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_G); + + } else { + poisson_ik(work1_6, work2_6, density_fft_g, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6, + fkx2_6, fky2_6, fkz2_6, vdx_brick_g, vdy_brick_g, + vdz_brick_g, virial_6, vg_6, vg2_6, u_brick_g, v0_brick_g, + v1_brick_g, v2_brick_g, v3_brick_g, v4_brick_g, v5_brick_g); + + cg_6->forward_comm(this,FORWARD_IK_G); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_g_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_g_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_g_ik<float,float>(fix->get_single_buffers()); + } + + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_G); + } + if (evflag_atom) fieldforce_g_peratom(); + } + + if (function[2]) { + //perform calculations for arithmetic mixing + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, + nxlo_out_6, nylo_out_6, nzlo_out_6, + nxhi_out_6, nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_a<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_a<double,double>(fix->get_double_buffers()); + } else { + particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_a<float,float>(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_A); + + brick2fft_a(); + + if ( differentiation_flag == 1) { + + poisson_ad(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, virial_6, vg_6, vg2_6, + u_brick_a3, v0_brick_a3, v1_brick_a3, v2_brick_a3, + v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ad(density_fft_a0, density_fft_a6, u_brick_a0, v0_brick_a0, + v1_brick_a0, v2_brick_a0, v3_brick_a0, v4_brick_a0, + v5_brick_a0, u_brick_a6, v0_brick_a6, v1_brick_a6, + v2_brick_a6, v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ad(density_fft_a1, density_fft_a5, u_brick_a1, 
v0_brick_a1, + v1_brick_a1, v2_brick_a1, v3_brick_a1, v4_brick_a1, + v5_brick_a1, u_brick_a5, v0_brick_a5, v1_brick_a5, + v2_brick_a5, v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ad(density_fft_a2, density_fft_a4, u_brick_a2, v0_brick_a2, + v1_brick_a2, v2_brick_a2, v3_brick_a2, v4_brick_a2, + v5_brick_a2, u_brick_a4, v0_brick_a4, v1_brick_a4, + v2_brick_a4, v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_AD_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_a_ad<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_AD_PERATOM_A); + + } else { + + poisson_ik(work1_6, work2_6, density_fft_a3, fft1_6, fft2_6, + nx_pppm_6, ny_pppm_6, nz_pppm_6, nfft_6, nxlo_fft_6, + nylo_fft_6, nzlo_fft_6, nxhi_fft_6, nyhi_fft_6, nzhi_fft_6, + nxlo_in_6, nylo_in_6, nzlo_in_6, nxhi_in_6, nyhi_in_6, + nzhi_in_6, energy_6, greensfn_6, fkx_6, fky_6, fkz_6,fkx2_6, + fky2_6, fkz2_6, vdx_brick_a3, vdy_brick_a3, vdz_brick_a3, + virial_6, vg_6, vg2_6, u_brick_a3, v0_brick_a3, v1_brick_a3, + v2_brick_a3, v3_brick_a3, v4_brick_a3, v5_brick_a3); + poisson_2s_ik(density_fft_a0, density_fft_a6, vdx_brick_a0, + vdy_brick_a0, vdz_brick_a0, vdx_brick_a6, vdy_brick_a6, + vdz_brick_a6, u_brick_a0, v0_brick_a0, v1_brick_a0, + v2_brick_a0, v3_brick_a0, v4_brick_a0, v5_brick_a0, + u_brick_a6, v0_brick_a6, v1_brick_a6, v2_brick_a6, + v3_brick_a6, v4_brick_a6, v5_brick_a6); + poisson_2s_ik(density_fft_a1, density_fft_a5, vdx_brick_a1, + vdy_brick_a1, vdz_brick_a1, vdx_brick_a5, vdy_brick_a5, + vdz_brick_a5, u_brick_a1, v0_brick_a1, v1_brick_a1, + v2_brick_a1, v3_brick_a1, v4_brick_a1, v5_brick_a1, + u_brick_a5, v0_brick_a5, v1_brick_a5, v2_brick_a5, + v3_brick_a5, v4_brick_a5, v5_brick_a5); + poisson_2s_ik(density_fft_a2, density_fft_a4, vdx_brick_a2, + vdy_brick_a2, vdz_brick_a2, vdx_brick_a4, vdy_brick_a4, + vdz_brick_a4, u_brick_a2, v0_brick_a2, v1_brick_a2, + v2_brick_a2, v3_brick_a2, v4_brick_a2, v5_brick_a2, + u_brick_a4, v0_brick_a4, v1_brick_a4, v2_brick_a4, + v3_brick_a4, v4_brick_a4, v5_brick_a4); + + cg_6->forward_comm(this, FORWARD_IK_A); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_a_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_a_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_a_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_A); + } + if (evflag_atom) fieldforce_a_peratom(); + } + + if (function[3]) { + //perform calculations if no mixing rule applies + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + particle_map<float,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_mixed_buffers()); + make_rho_none<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + particle_map<double,double>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_double_buffers()); + make_rho_none<double,double>(fix->get_double_buffers()); + } else { + 
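// The three poisson_2s_* calls above appear to exploit the standard trick of
// transforming two real-valued densities with one complex FFT: the paired
// grids (a0,a6), (a1,a5) and (a2,a4) are presumably packed into the real and
// imaginary parts of a single work array, so the seven arithmetic-mixing terms
// cost four Poisson solves instead of seven.  Schematically (hypothetical
// packing, not the actual poisson_2s_ik/poisson_2s_ad code):
//
//   work[2*i]   = rho_A[i];   // real part carries density A
//   work[2*i+1] = rho_B[i];   // imaginary part carries density B
//   // one forward FFT, multiply by the Green's function, one inverse FFT,
//   // then field A is recovered from the even slots and field B from the odd.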
particle_map<float,float>(delxinv_6, delyinv_6, delzinv_6, shift_6, + part2grid_6, nupper_6, nlower_6, nxlo_out_6, + nylo_out_6, nzlo_out_6, nxhi_out_6, + nyhi_out_6, nzhi_out_6, + fix->get_single_buffers()); + make_rho_none<float,float>(fix->get_single_buffers()); + } + + cg_6->reverse_comm(this, REVERSE_RHO_NONE); + + brick2fft_none(); + + if (differentiation_flag == 1) { + + int n = 0; + for (int k = 0; k<nsplit_alloc/2; k++) { + poisson_none_ad(n,n+1,density_fft_none[n],density_fft_none[n+1], + u_brick_none[n],u_brick_none[n+1], + v0_brick_none, v1_brick_none, v2_brick_none, + v3_brick_none, v4_brick_none, v5_brick_none); + n += 2; + } + + cg_6->forward_comm(this,FORWARD_AD_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ad<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ad<double,double>(fix->get_double_buffers()); + } else { + fieldforce_none_ad<float,float>(fix->get_single_buffers()); + } + + if (vflag_atom) cg_peratom_6->forward_comm(this,FORWARD_AD_PERATOM_NONE); + + } else { + int n = 0; + for (int k = 0; k<nsplit_alloc/2; k++) { + + poisson_none_ik(n,n+1,density_fft_none[n], density_fft_none[n+1], + vdx_brick_none[n], vdy_brick_none[n], + vdz_brick_none[n], vdx_brick_none[n+1], + vdy_brick_none[n+1], vdz_brick_none[n+1], + u_brick_none, v0_brick_none, v1_brick_none, + v2_brick_none, v3_brick_none, v4_brick_none, + v5_brick_none); + n += 2; + } + + cg_6->forward_comm(this,FORWARD_IK_NONE); + + if (fix->precision() == FixIntel::PREC_MODE_MIXED) { + fieldforce_none_ik<float,double>(fix->get_mixed_buffers()); + } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + fieldforce_none_ik<double,double>(fix->get_double_buffers()); + } else { + fieldforce_none_ik<float,float>(fix->get_single_buffers()); + } + + if (evflag_atom) + cg_peratom_6->forward_comm(this, FORWARD_IK_PERATOM_NONE); + } + if (evflag_atom) fieldforce_none_peratom(); + } + + // update qsum and qsqsum, if atom count has changed and energy needed + + if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // sum energy across procs and add in volume-dependent term + + const double qscale = force->qqrd2e * scale; + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy_1,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_1 = energy_all; + MPI_Allreduce(&energy_6,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy_6 = energy_all; + + energy_1 *= 0.5*volume; + energy_6 *= 0.5*volume; + + energy_1 -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy_6 += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij + + 1.0/12.0*pow(g_ewald_6,6)*csum; + energy_1 *= qscale; + } + + // sum virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial_1,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + MPI_Allreduce(virial_6,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] += 0.5*volume*virial_all[i]; + if (function[1]+function[2]+function[3]){ + double a = MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumij; + virial[0] -= a; + virial[1] -= a; + virial[2] -= a; + } + } + + if (eflag_atom) { + if (function[0]) { + double *q = atom->q; + for (i = 0; i < atom->nlocal; i++) { + eatom[i] -= qscale*g_ewald*q[i]*q[i]/MY_PIS + qscale*MY_PI2*q[i]* + qsum / (g_ewald*g_ewald*volume); //coulomb self energy correction + 
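// In equation form, the global corrections applied to energy_1 above are the
// usual Ewald self-energy and neutralizing-background terms (with
// MY_PIS = sqrt(pi) and MY_PI2 = pi/2):
//
//   E_self       = - g_ewald / sqrt(pi) * sum_i q_i^2
//   E_background = - pi / (2 g_ewald^2 V) * (sum_i q_i)^2
//
// and the per-atom loop here applies the same two corrections atom by atom,
// scaled by qscale = qqrd2e * scale.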
} + } + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + eatom[i] += - MY_PI*MY_PIS/(6*volume)*pow(g_ewald_6,3)*csumi[tmp] + + 1.0/12.0*pow(g_ewald_6,6)*cii[tmp]; + } + } + } + + if (vflag_atom) { + if (function[1] + function[2] + function[3]) { + int tmp; + for (i = 0; i < atom->nlocal; i++) { + tmp = atom->type[i]; + //dispersion self virial correction + for (int n = 0; n < 3; n++) vatom[i][n] -= MY_PI*MY_PIS/(6*volume)* + pow(g_ewald_6,3)*csumi[tmp]; + } + } + } + + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + if (function[0]) energy += energy_1; + if (function[1] + function[2] + function[3]) energy += energy_6; + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + + +/* ---------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t> +void PPPMDispIntel::particle_map(double delx, double dely, double delz, + double sft, int** p2g, int nup, int nlow, + int nxlo, int nylo, int nzlo, + int nxhi, int nyhi, int nzhi, + IntelBuffers<flt_t,acc_t> *buffers) +{ + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + int flag = 0; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr, delx, dely, delz, sft, p2g, nup, nlow, nxlo,\ + nylo, nzlo, nxhi, nyhi, nzhi) reduction(+:flag) if(!_use_lrt) + #endif + { + double **x = atom->x; + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delx; + const flt_t yi = dely; + const flt_t zi = delz; + const flt_t fshift = sft; + + + int iifrom, iito, tid; + IP_PRE_omp_range_id_align(iifrom, iito, tid, nlocal, nthr, sizeof(ATOM_T)); + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma simd reduction(+:flag) + #endif + for (int i = iifrom; i < iito; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + int nx = static_cast<int> ((x[i][0]-lo0)*xi+fshift) - OFFSET; + int ny = static_cast<int> ((x[i][1]-lo1)*yi+fshift) - OFFSET; + int nz = static_cast<int> ((x[i][2]-lo2)*zi+fshift) - OFFSET; + + p2g[i][0] = nx; + p2g[i][1] = ny; + p2g[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlow < nxlo || nx+nup > nxhi || + ny+nlow < nylo || ny+nup > nyhi || + nz+nlow < nzlo || nz+nup > nzhi) + flag = 1; + } + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPMDisp"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, 
int use_table> +void PPPMDispIntel::make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick[nzlo_out][nylo_out][nxlo_out]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + //double *q = atom->q; + //double **x = atom->x; + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + double *q = atom->q; + double **x = atom->x; + + const int nix = nxhi_out - nxlo_out + 1; + const int niy = nyhi_out - nylo_out + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshift = shift; + const flt_t fshiftone = shiftone; + const flt_t fdelvolinv = delvolinv; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nysum = nlower + ny - nylo_out; + int nxsum = nlower + nx - nxlo_out; + int nzsum = (nlower + nz - nzlo_out)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + } + } + + FFT_SCALAR z0 = fdelvolinv * q[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int 
i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- geometric mixing +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) +{ + // clear 3d density array + + FFT_SCALAR * _noalias global_density = + &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? 
global_density : + perthread_density[tid - 1]; + + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv * B[type]; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + my_density[mzyx] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- arithmetic mixing +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) +{ + // clear 3d density array + + memset(&(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, 
+ ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + memset(&(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]),0, + ngrid_6*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + for (int i = 0; i < nlocal; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + const int type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR w = x0*rho[0][l]; + density_brick_a0[mz][my][mx] += w*B[7*type]; + density_brick_a1[mz][my][mx] += w*B[7*type+1]; + density_brick_a2[mz][my][mx] += w*B[7*type+2]; + density_brick_a3[mz][my][mx] += w*B[7*type+3]; + density_brick_a4[mz][my][mx] += w*B[7*type+4]; + density_brick_a5[mz][my][mx] += w*B[7*type+5]; + density_brick_a6[mz][my][mx] += w*B[7*type+6]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = dispersion "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid --- case when mixing rules don't apply +------------------------------------------------------------------------- */ + 
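/* ----------------------------------------------------------------------
   illustrative sketch (hypothetical helper, not called anywhere in this
   file): every make_rho_* and fieldforce_* routine computes its per-axis
   stencil weights the same way -- for each of the `order` stencil points k
   the assignment polynomial in the fractional offset dx is evaluated by
   Horner's rule from a coefficient table laid out like rho_coeff[l][k]
------------------------------------------------------------------------- */

static inline void stencil_weights(int order, int klow, int khigh, double dx,
                                   double const *const *coeff, double *w)
{
  for (int k = klow; k <= khigh; k++) {
    double r = 0.0;
    for (int l = order - 1; l >= 0; l--)   // Horner's rule: r = sum_l coeff[l][k]*dx^l
      r = coeff[l][k] + r * dx;
    w[k - klow] = r;                       // weight of stencil point k along this axis
  }
}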
+template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) +{ + + FFT_SCALAR * _noalias global_density = &(density_brick_none[0][nzlo_out_6][nylo_out_6][nxlo_out_6]); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, nlocal, global_density) if(!_use_lrt) + #endif + { + int type; + double **x = atom->x; + + const int nix = nxhi_out_6 - nxlo_out_6 + 1; + const int niy = nyhi_out_6 - nylo_out_6 + 1; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshift = shift_6; + const flt_t fshiftone = shiftone_6; + const flt_t fdelvolinv = delvolinv_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + FFT_SCALAR * _noalias my_density = tid == 0 ? global_density : + perthread_density[tid - 1]; + // clear 3d density array + memset(my_density, 0, ngrid_6 * sizeof(FFT_SCALAR)); + + for (int i = ifrom; i < ito; i++) { + + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nysum = nlower_6 + ny - nylo_out_6; + int nxsum = nlower_6 + nx - nxlo_out_6; + int nzsum = (nlower_6 + nz - nzlo_out_6)*nix*niy + nysum*nix + nxsum; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3; + r1 = r2 = r3 = ZEROF; + + for (int l = order_6-1; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + } + } + + type = atom->type[i]; + FFT_SCALAR z0 = fdelvolinv; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n*nix*niy + nzsum; + FFT_SCALAR y0 = z0*rho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int mzy = m*nix + mz; + FFT_SCALAR x0 = y0*rho[1][m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mzyx = l + mzy; + FFT_SCALAR w0 = x0*rho[0][l]; + for(int k = 0; k < nsplit; k++) + my_density[mzyx + k*ngrid_6] += x0*rho[0][l]; + } + } + } + } + } + + // reduce all the perthread_densities into global_density + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nthr, global_density) if(!_use_lrt) + #endif + { + int ifrom, ito, tid; + 
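// This parallel region performs the second half of the per-thread reduction
// used by all make_rho_* variants: during spreading, thread 0 writes directly
// into the global density brick while every other thread t accumulates into
// its private copy perthread_density[t-1]; the loop below then folds the
// private copies back into global_density, parallelized over grid points so
// that each thread owns a disjoint [ifrom,ito) slice and no atomics are
// needed -- effectively
//
//   global_density[i] += sum over t > 0 of perthread_density[t-1][i]
//
// with the index range here covering all nsplit interleaved grids
// (ngrid_6 * nsplit points).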
IP_PRE_omp_range_id(ifrom, ito, tid, ngrid_6*nsplit, nthr); + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + for(int j = 1; j < nthr; j++) { + global_density[i] += perthread_density[j-1][i]; + } + } + } + +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower;; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho_lookup[idx][k]; + rho1[k] = rho_lookup[idy][k]; + rho2[k] = rho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1 = rho_coeff[order-1][k]; + FFT_SCALAR r2 = rho_coeff[order-1][k]; + FFT_SCALAR r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + + rho0[k-nlower] = r1; + rho1[k-nlower] = r2; + rho2[k-nlower] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if 
defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + //double *q = atom->q; + //double **x = atom->x; + //double **f = atom->f; + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv; + const flt_t yi = delyinv; + const flt_t zi = delzinv; + const flt_t fshiftone = shiftone; + const flt_t fqqrd2es = qqrd2e * scale; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm/xprd; + const flt_t hy_inv = ny_pppm/yprd; + const flt_t hz_inv = nz_pppm/zprd; + + const flt_t fsf_coeff0 = sf_coeff[0]; + const flt_t fsf_coeff1 = sf_coeff[1]; + const flt_t fsf_coeff2 = sf_coeff[2]; + const flt_t fsf_coeff3 = sf_coeff[3]; + const flt_t fsf_coeff4 = sf_coeff[4]; + const flt_t fsf_coeff5 = sf_coeff[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid[i][0]; + int ny = part2grid[i][1]; + int nz = part2grid[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower; + int nysum = ny + nlower; + int nzsum = nz + nlower; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if 
defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho_lookup[idx][k]; + rho[1][k] = rho_lookup[idy][k]; + rho[2][k] = rho_lookup[idz][k]; + drho[0][k] = drho_lookup[idx][k]; + drho[1][k] = drho_lookup[idy][k]; + drho[2][k] = drho_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower; k <= nupper; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff[order-1][k]; + r2 = rho_coeff[order-1][k]; + r3 = rho_coeff[order-1][k]; + for (int l = order-2; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1 * dx; + r2 = rho_coeff[l][k] + r2 * dy; + r3 = rho_coeff[l][k] + r3 * dz; + dr1 = drho_coeff[l][k] + dr1 * dx; + dr2 = drho_coeff[l][k] + dr2 * dy; + dr3 = drho_coeff[l][k] + dr3 * dz; + } + rho[0][k-nlower] = r1; + rho[1][k-nlower] = r2; + rho[2][k-nlower] = r3; + drho[0][k-nlower] = dr1; + drho[1][k-nlower] = dr2; + drho[2][k-nlower] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const flt_t qfactor = fqqrd2es * q[i]; + const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoqsq; + f[i][0] += qfactor * particle_ekx[i] - fqqrd2es * sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoqsq; + f[i][1] += qfactor * particle_eky[i] - fqqrd2es * sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoqsq; + + if (slabflag != 2) f[i][2] += qfactor * particle_ekz[i] - fqqrd2es * sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for geometric mixing rule +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + 
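// Relation between the _ik and _ad interpolation variants in this file: the
// ik routines (like this one) interpolate the three precomputed field bricks
// vdx/vdy/vdz with the assignment weights rho, while the ad ("analytic
// differentiation") routines interpolate the single potential brick u_brick
// and obtain the field by differentiating the assignment function itself.
// Schematically, per axis and up to sign conventions,
//
//   ik:  E_x(i)  =  sum_stencil  rho_x * rho_y * rho_z * Ex_grid
//   ad:  E_x(i)  ~  (1/h_x) * sum_stencil  drho_x * rho_y * rho_z * u_grid
//
// which is why only the ad routines apply the sin()-based self-force
// correction assembled from the sf_coeff terms before adding the force.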
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[l] -= x0*vdx_brick_g[mz][my][mx]; + eky_arr[l] -= x0*vdy_brick_g[mz][my][mx]; + ekz_arr[l] -= x0*vdz_brick_g[mz][my][mx]; + + } + } + } + + FFT_SCALAR ekx, eky, ekz; + ekx = eky = ekz = ZEROF; + + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx += ekx_arr[l]; + eky += eky_arr[l]; + ekz += ekz_arr[l]; + } + + // convert E-field to force + + type = atom->type[i]; + lj = B[type]; + f[i][0] += lj*ekx; + f[i][1] += lj*eky; + if (slabflag != 2) f[i][2] += lj*ekz; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion 
field & force on my particles + for geometric mixing rule for ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx = this->particle_ekx; + FFT_SCALAR * _noalias const particle_eky = this->particle_eky; + FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + 
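// dr1/dr2/dr3 are evaluated with the same Horner recurrence as r1/r2/r3, but
// from drho_coeff_6, i.e. the coefficients of the derivative of the assignment
// polynomial; they become the differentiated weights drho[][] that the ad
// interpolation below combines with u_brick_g.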
} + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[l] += drho[0][l] * ekx_p * u_brick_g[mz][my][mx]; + eky[l] += rho[0][l] * eky_p * u_brick_g[mz][my][mx]; + ekz[l] += rho[0][l] * ekz_p * u_brick_g[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx[i] *= hx_inv; + particle_eky[i] *= hy_inv; + particle_ekz[i] *= hz_inv; + + // convert E-field to force + + const int type = atom->type[i]; + const flt_t lj = B[type]; + const flt_t twoljsq = 2.*lj*lj; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= twoljsq; + f[i][0] += lj * particle_ekx[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= twoljsq; + f[i][1] += lj * particle_eky[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= twoljsq; + + if (slabflag != 2) f[i][2] += lj * particle_ekz[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t 
rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + _alignvar(FFT_SCALAR ekx0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz6_arr[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx0_arr[l] -= x0*vdx_brick_a0[mz][my][mx]; + eky0_arr[l] -= x0*vdy_brick_a0[mz][my][mx]; + ekz0_arr[l] -= x0*vdz_brick_a0[mz][my][mx]; + ekx1_arr[l] -= 
x0*vdx_brick_a1[mz][my][mx]; + eky1_arr[l] -= x0*vdy_brick_a1[mz][my][mx]; + ekz1_arr[l] -= x0*vdz_brick_a1[mz][my][mx]; + ekx2_arr[l] -= x0*vdx_brick_a2[mz][my][mx]; + eky2_arr[l] -= x0*vdy_brick_a2[mz][my][mx]; + ekz2_arr[l] -= x0*vdz_brick_a2[mz][my][mx]; + ekx3_arr[l] -= x0*vdx_brick_a3[mz][my][mx]; + eky3_arr[l] -= x0*vdy_brick_a3[mz][my][mx]; + ekz3_arr[l] -= x0*vdz_brick_a3[mz][my][mx]; + ekx4_arr[l] -= x0*vdx_brick_a4[mz][my][mx]; + eky4_arr[l] -= x0*vdy_brick_a4[mz][my][mx]; + ekz4_arr[l] -= x0*vdz_brick_a4[mz][my][mx]; + ekx5_arr[l] -= x0*vdx_brick_a5[mz][my][mx]; + eky5_arr[l] -= x0*vdy_brick_a5[mz][my][mx]; + ekz5_arr[l] -= x0*vdz_brick_a5[mz][my][mx]; + ekx6_arr[l] -= x0*vdx_brick_a6[mz][my][mx]; + eky6_arr[l] -= x0*vdy_brick_a6[mz][my][mx]; + ekz6_arr[l] -= x0*vdz_brick_a6[mz][my][mx]; + } + } + } + + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + ekx0 += ekx0_arr[l]; + eky0 += eky0_arr[l]; + ekz0 += ekz0_arr[l]; + ekx1 += ekx1_arr[l]; + eky1 += eky1_arr[l]; + ekz1 += ekz1_arr[l]; + ekx2 += ekx2_arr[l]; + eky2 += eky2_arr[l]; + ekz2 += ekz2_arr[l]; + ekx3 += ekx3_arr[l]; + eky3 += eky3_arr[l]; + ekz3 += ekz3_arr[l]; + ekx4 += ekx4_arr[l]; + eky4 += eky4_arr[l]; + ekz4 += ekz4_arr[l]; + ekx5 += ekx5_arr[l]; + eky5 += eky5_arr[l]; + ekz5 += ekz5_arr[l]; + ekx6 += ekx6_arr[l]; + eky6 += eky6_arr[l]; + ekz6 += ekz6_arr[l]; + } + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + + lj4*ekx4 + lj5*ekx5 + lj6*ekx6; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + + lj4*eky4 + lj5*eky5 + lj6*eky6; + if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for arithmetic mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + FFT_SCALAR * _noalias const particle_ekx0 = this->particle_ekx0; + FFT_SCALAR * _noalias const particle_eky0 = this->particle_eky0; + FFT_SCALAR * _noalias const particle_ekz0 = this->particle_ekz0; + FFT_SCALAR * _noalias const particle_ekx1 = this->particle_ekx1; + FFT_SCALAR * _noalias const particle_eky1 = this->particle_eky1; + FFT_SCALAR * _noalias const particle_ekz1 = 
this->particle_ekz1; + FFT_SCALAR * _noalias const particle_ekx2 = this->particle_ekx2; + FFT_SCALAR * _noalias const particle_eky2 = this->particle_eky2; + FFT_SCALAR * _noalias const particle_ekz2 = this->particle_ekz2; + FFT_SCALAR * _noalias const particle_ekx3 = this->particle_ekx3; + FFT_SCALAR * _noalias const particle_eky3 = this->particle_eky3; + FFT_SCALAR * _noalias const particle_ekz3 = this->particle_ekz3; + FFT_SCALAR * _noalias const particle_ekx4 = this->particle_ekx4; + FFT_SCALAR * _noalias const particle_eky4 = this->particle_eky4; + FFT_SCALAR * _noalias const particle_ekz4 = this->particle_ekz4; + FFT_SCALAR * _noalias const particle_ekx5 = this->particle_ekx5; + FFT_SCALAR * _noalias const particle_eky5 = this->particle_eky5; + FFT_SCALAR * _noalias const particle_ekz5 = this->particle_ekz5; + FFT_SCALAR * _noalias const particle_ekx6 = this->particle_ekx6; + FFT_SCALAR * _noalias const particle_eky6 = this->particle_eky6; + FFT_SCALAR * _noalias const particle_ekz6 = this->particle_ekz6; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = 
rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz1[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz2[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz3[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz4[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz5[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekx6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR eky6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(FFT_SCALAR ekz6[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + particle_ekx0[i] = particle_eky0[i] = particle_ekz0[i] = ZEROF; + particle_ekx1[i] = particle_eky1[i] = particle_ekz1[i] = ZEROF; + particle_ekx2[i] = particle_eky2[i] = particle_ekz2[i] = ZEROF; + particle_ekx3[i] = particle_eky3[i] = particle_ekz3[i] = ZEROF; + particle_ekx4[i] = particle_eky4[i] = particle_ekz4[i] = ZEROF; + particle_ekx5[i] = particle_eky5[i] = particle_ekz5[i] = ZEROF; + particle_ekx6[i] = particle_eky6[i] = particle_ekz6[i] = ZEROF; + + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + FFT_SCALAR x0 = drho[0][l] * ekx_p; + FFT_SCALAR y0 = rho[0][l] * eky_p; + FFT_SCALAR z0 = rho[0][l] * ekz_p; + + ekx0[l] += x0 * u_brick_a0[mz][my][mx]; + eky0[l] += y0 * u_brick_a0[mz][my][mx]; + ekz0[l] += z0 * u_brick_a0[mz][my][mx]; + ekx1[l] += x0 * u_brick_a1[mz][my][mx]; + eky1[l] += y0 * u_brick_a1[mz][my][mx]; + ekz1[l] += z0 * u_brick_a1[mz][my][mx]; + ekx2[l] += x0 * u_brick_a2[mz][my][mx]; + eky2[l] += y0 * u_brick_a2[mz][my][mx]; + ekz2[l] += z0 * u_brick_a2[mz][my][mx]; + ekx3[l] += x0 * u_brick_a3[mz][my][mx]; + eky3[l] += y0 * u_brick_a3[mz][my][mx]; + ekz3[l] += z0 * u_brick_a3[mz][my][mx]; + ekx4[l] += x0 * u_brick_a4[mz][my][mx]; + eky4[l] += y0 * u_brick_a4[mz][my][mx]; + ekz4[l] += z0 * u_brick_a4[mz][my][mx]; + ekx5[l] += x0 * 
u_brick_a5[mz][my][mx]; + eky5[l] += y0 * u_brick_a5[mz][my][mx]; + ekz5[l] += z0 * u_brick_a5[mz][my][mx]; + ekx6[l] += x0 * u_brick_a6[mz][my][mx]; + eky6[l] += y0 * u_brick_a6[mz][my][mx]; + ekz6[l] += z0 * u_brick_a6[mz][my][mx]; + } + } + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + particle_ekx0[i] += ekx0[l]; + particle_eky0[i] += eky0[l]; + particle_ekz0[i] += ekz0[l]; + particle_ekx1[i] += ekx1[l]; + particle_eky1[i] += eky1[l]; + particle_ekz1[i] += ekz1[l]; + particle_ekx2[i] += ekx2[l]; + particle_eky2[i] += eky2[l]; + particle_ekz2[i] += ekz2[l]; + particle_ekx3[i] += ekx3[l]; + particle_eky3[i] += eky3[l]; + particle_ekz3[i] += ekz3[l]; + particle_ekx4[i] += ekx4[l]; + particle_eky4[i] += eky4[l]; + particle_ekz4[i] += ekz4[l]; + particle_ekx5[i] += ekx5[l]; + particle_eky5[i] += eky5[l]; + particle_ekz5[i] += ekz5[l]; + particle_ekx6[i] += ekx6[l]; + particle_eky6[i] += eky6[l]; + particle_ekz6[i] += ekz6[l]; + } + } + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int i = ifrom; i < ito; i++) { + particle_ekx0[i] *= hx_inv; + particle_eky0[i] *= hy_inv; + particle_ekz0[i] *= hz_inv; + particle_ekx1[i] *= hx_inv; + particle_eky1[i] *= hy_inv; + particle_ekz1[i] *= hz_inv; + particle_ekx2[i] *= hx_inv; + particle_eky2[i] *= hy_inv; + particle_ekz2[i] *= hz_inv; + particle_ekx3[i] *= hx_inv; + particle_eky3[i] *= hy_inv; + particle_ekz3[i] *= hz_inv; + particle_ekx4[i] *= hx_inv; + particle_eky4[i] *= hy_inv; + particle_ekz4[i] *= hz_inv; + particle_ekx5[i] *= hx_inv; + particle_eky5[i] *= hy_inv; + particle_ekz5[i] *= hz_inv; + particle_ekx6[i] *= hx_inv; + particle_eky6[i] *= hy_inv; + particle_ekz6[i] *= hz_inv; + + // convert D-field to force + + const int type = atom->type[i]; + const FFT_SCALAR lj0 = B[7*type+6]; + const FFT_SCALAR lj1 = B[7*type+5]; + const FFT_SCALAR lj2 = B[7*type+4]; + const FFT_SCALAR lj3 = B[7*type+3]; + const FFT_SCALAR lj4 = B[7*type+2]; + const FFT_SCALAR lj5 = B[7*type+1]; + const FFT_SCALAR lj6 = B[7*type]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); + sf += fsf_coeff1 * sin(ffour_pi * s1); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][0] += lj0*particle_ekx0[i] + lj1*particle_ekx1[i] + + lj2*particle_ekx2[i] + lj3*particle_ekx3[i] + lj4*particle_ekx4[i] + + lj5*particle_ekx5[i] + lj6*particle_ekx6[i] - sf; + + sf = fsf_coeff2 * sin(ftwo_pi * s2); + sf += fsf_coeff3 * sin(ffour_pi * s2); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][1] += lj0*particle_eky0[i] + lj1*particle_eky1[i] + + lj2*particle_eky2[i] + lj3*particle_eky3[i] + lj4*particle_eky4[i] + + lj5*particle_eky5[i] + lj6*particle_eky6[i] - sf; + + sf = fsf_coeff4 * sin(ftwo_pi * s3); + sf += fsf_coeff5 * sin(ffour_pi * s3); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + if (slabflag != 2) + f[i][2] += lj0*particle_ekz0[i] + lj1*particle_ekz1[i] + + lj2*particle_ekz2[i] + lj3*particle_ekz3[i] + lj4*particle_ekz4[i] + + lj5*particle_ekz5[i] + lj6*particle_ekz6[i] - sf; + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule and ik scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void 
PPPMDispIntel::fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) +{ + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double lj; + int type; + double **x = atom->x; + double **f = atom->f; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho0[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t rho1[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + _alignvar(flt_t rho2[INTEL_P3M_ALIGNED_MAXORDER] , 64)= {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho0[k] = rho6_lookup[idx][k]; + rho1[k] = rho6_lookup[idy][k]; + rho2[k] = rho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r2 = rho_coeff_6[order_6-1][k]; + FFT_SCALAR r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1*dx; + r2 = rho_coeff_6[l][k] + r2*dy; + r3 = rho_coeff_6[l][k] + r3*dz; + } + + rho0[k-nlower_6] = r1; + rho1[k-nlower_6] = r2; + rho2[k-nlower_6] = r3; + } + } + + + _alignvar(FFT_SCALAR ekx_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR eky_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + _alignvar(FFT_SCALAR ekz_arr[nsplit*INTEL_P3M_ALIGNED_MAXORDER],64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx_arr[k] = eky_arr[k] = ekz_arr[k] = ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n+nzsum; + FFT_SCALAR z0 = rho2[n]; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m+nysum; + FFT_SCALAR y0 = z0*rho1[m]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l+nxsum; + FFT_SCALAR x0 = y0*rho0[l]; + ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdx_brick_none[k][mz][my][mx]; + eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdy_brick_none[k][mz][my][mx]; + ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l] -= + x0*vdz_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx[nsplit], 
64); + _alignvar(FFT_SCALAR eky[nsplit], 64); + _alignvar(FFT_SCALAR ekz[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx[k] = eky[k] = ekz[k] = ZEROF; + } + + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + for (int k = 0; k < nsplit; k++) { + ekx[k] += ekx_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + eky[k] += eky_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + ekz[k] += ekz_arr[k*INTEL_P3M_ALIGNED_MAXORDER + l]; + } + } + + // convert E-field to force + + type = atom->type[i]; + for (int k = 0; k < nsplit; k++) { + lj = B[nsplit*type + k]; + f[i][0] += lj*ekx[k]; + f[i][1] += lj*eky[k]; + if (slabflag != 2) f[i][2] += lj*ekz[k]; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for no mixing rule for the ad scheme +------------------------------------------------------------------------- */ + +template<class flt_t, class acc_t, int use_table> +void PPPMDispIntel::fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of dispersion field on particle + + int nlocal = atom->nlocal; + int nthr = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(nlocal, nthr) if(!_use_lrt) + #endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double **x = atom->x; + double **f = atom->f; + const flt_t ftwo_pi = MY_PI * 2.0; + const flt_t ffour_pi = MY_PI * 4.0; + + const flt_t lo0 = boxlo[0]; + const flt_t lo1 = boxlo[1]; + const flt_t lo2 = boxlo[2]; + const flt_t xi = delxinv_6; + const flt_t yi = delyinv_6; + const flt_t zi = delzinv_6; + const flt_t fshiftone = shiftone_6; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]*slab_volfactor; + + const flt_t hx_inv = nx_pppm_6/xprd; + const flt_t hy_inv = ny_pppm_6/yprd; + const flt_t hz_inv = nz_pppm_6/zprd; + + const flt_t fsf_coeff0 = sf_coeff_6[0]; + const flt_t fsf_coeff1 = sf_coeff_6[1]; + const flt_t fsf_coeff2 = sf_coeff_6[2]; + const flt_t fsf_coeff3 = sf_coeff_6[3]; + const flt_t fsf_coeff4 = sf_coeff_6[4]; + const flt_t fsf_coeff5 = sf_coeff_6[5]; + + int ifrom, ito, tid; + IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); + + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; + + for (int i = ifrom; i < ito; i++) { + int nx = part2grid_6[i][0]; + int ny = part2grid_6[i][1]; + int nz = part2grid_6[i][2]; + FFT_SCALAR dx = nx+fshiftone - (x[i][0]-lo0)*xi; + FFT_SCALAR dy = ny+fshiftone - (x[i][1]-lo1)*yi; + FFT_SCALAR dz = nz+fshiftone - (x[i][2]-lo2)*zi; + + int nxsum = nx + nlower_6; + int nysum = ny + nlower_6; + int nzsum = nz + nlower_6; + + if (use_table) { + dx = dx*half_rho_scale + half_rho_scale_plus; + int idx = dx; + dy = dy*half_rho_scale + half_rho_scale_plus; + int idy = dy; + dz = dz*half_rho_scale + half_rho_scale_plus; + int idz = dz; + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho[0][k] = rho6_lookup[idx][k]; + rho[1][k] = rho6_lookup[idy][k]; + rho[2][k] = rho6_lookup[idz][k]; + drho[0][k] = drho6_lookup[idx][k]; + drho[1][k] = drho6_lookup[idy][k]; + 
drho[2][k] = drho6_lookup[idz][k]; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k = nlower_6; k <= nupper_6; k++) { + FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; + dr1 = dr2 = dr3 = ZEROF; + + r1 = rho_coeff_6[order_6-1][k]; + r2 = rho_coeff_6[order_6-1][k]; + r3 = rho_coeff_6[order_6-1][k]; + for (int l = order_6-2; l >= 0; l--) { + r1 = rho_coeff_6[l][k] + r1 * dx; + r2 = rho_coeff_6[l][k] + r2 * dy; + r3 = rho_coeff_6[l][k] + r3 * dz; + dr1 = drho_coeff_6[l][k] + dr1 * dx; + dr2 = drho_coeff_6[l][k] + dr2 * dy; + dr3 = drho_coeff_6[l][k] + dr3 * dz; + } + rho[0][k-nlower_6] = r1; + rho[1][k-nlower_6] = r2; + rho[2][k-nlower_6] = r3; + drho[0][k-nlower_6] = dr1; + drho[1][k-nlower_6] = dr2; + drho[2][k-nlower_6] = dr3; + } + } + _alignvar(FFT_SCALAR ekx[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR eky[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + _alignvar(FFT_SCALAR ekz[nsplit*INTEL_P3M_ALIGNED_MAXORDER], 64); + + for (int k = 0; k < nsplit*INTEL_P3M_ALIGNED_MAXORDER; k++) { + ekx[k]=eky[k]=ekz[k]=ZEROF; + } + + for (int k = 0; k < nsplit; k++) { + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int n = 0; n < order_6; n++) { + int mz = n + nzsum; + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count=7 + #endif + for (int m = 0; m < order_6; m++) { + int my = m + nysum; + FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; + FFT_SCALAR eky_p = drho[1][m] * rho[2][n]; + FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { + int mx = l + nxsum; + ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l] += drho[0][l] * ekx_p * + u_brick_none[k][mz][my][mx]; + eky[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * eky_p * + u_brick_none[k][mz][my][mx]; + ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l] += rho[0][l] * ekz_p * + u_brick_none[k][mz][my][mx]; + } + } + } + } + + _alignvar(FFT_SCALAR ekx_tot[nsplit], 64); + _alignvar(FFT_SCALAR eky_tot[nsplit], 64); + _alignvar(FFT_SCALAR ekz_tot[nsplit], 64); + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] = eky_tot[k] = ekz_tot[k] = ZEROF; + } + + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] += ekx[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + eky_tot[k] += eky[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + ekz_tot[k] += ekz[k*INTEL_P3M_ALIGNED_MAXORDER+l]; + } + } + + for (int k = 0; k < nsplit; k++) { + ekx_tot[k] *= hx_inv; + eky_tot[k] *= hy_inv; + ekz_tot[k] *= hz_inv; + } + // convert D-field to force + + const int type = atom->type[i]; + + const flt_t s1 = x[i][0] * hx_inv; + const flt_t s2 = x[i][1] * hy_inv; + const flt_t s3 = x[i][2] * hz_inv; + flt_t sf1 = fsf_coeff0 * sin(ftwo_pi * s1); + sf1 += fsf_coeff1 * sin(ffour_pi * s1); + + flt_t sf2 = fsf_coeff2 * sin(ftwo_pi * s2); + sf2 += fsf_coeff3 * sin(ffour_pi * s2); + + flt_t sf3 = fsf_coeff4 * sin(ftwo_pi * s3); + sf3 += fsf_coeff5 * sin(ffour_pi * s3); + for (int k = 0; k < nsplit; k++) { + const flt_t lj = B[nsplit*type + k]; + const flt_t twoljsq = lj*lj * B[k] * 2; + flt_t sf = sf1*twoljsq; + f[i][0] += lj * ekx_tot[k] - sf; + sf = sf2*twoljsq; + f[i][1] += lj * eky_tot[k] - sf; + sf = sf3*twoljsq; + if (slabflag != 2) f[i][2] += lj * ekz_tot[k] - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + precompute rho coefficients as a lookup table to save time 
in make_rho + and fieldforce. Instead of doing this polynomial for every atom 6 times + per time step, precompute it for some number of points. +------------------------------------------------------------------------- */ + +void PPPMDispIntel::precompute_rho() +{ + + half_rho_scale = (rho_points - 1.)/2.; + half_rho_scale_plus = half_rho_scale + 0.5; + + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-1; l>=0; l--){ + r1 = rho_coeff[l][k] + r1*dx; + } + rho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower; k<=nupper;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order-2; l>=0; l--){ + r1 = drho_coeff[l][k] + r1*dx; + } + drho_lookup[i][k-nlower] = r1; + } + for (int k = nupper-nlower+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho_lookup[i][k] = 0; + } + } + } + for (int i = 0; i < rho_points; i++) { + FFT_SCALAR dx = -1. + 1./half_rho_scale * (FFT_SCALAR)i; + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for (int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-1; l>=0; l--){ + r1 = rho_coeff_6[l][k] + r1*dx; + } + rho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + rho6_lookup[i][k] = 0; + } + if (differentiation_flag == 1) { + #if defined(LMP_SIMD_COMPILER) + #pragma simd + #endif + for(int k=nlower_6; k<=nupper_6;k++){ + FFT_SCALAR r1 = ZEROF; + for(int l=order_6-2; l>=0; l--){ + r1 = drho_coeff_6[l][k] + r1*dx; + } + drho6_lookup[i][k-nlower_6] = r1; + } + for (int k = nupper_6-nlower_6+1; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { + drho6_lookup[i][k] = 0; + } + } + } +} + +/* ---------------------------------------------------------------------- + Returns 0 if Intel optimizations for PPPM ignored due to offload +------------------------------------------------------------------------- */ + +#ifdef _LMP_INTEL_OFFLOAD +int PPPMDispIntel::use_base() { + return _use_base; +} +#endif diff --git a/src/USER-INTEL/pppm_disp_intel.h b/src/USER-INTEL/pppm_disp_intel.h index 166152004e..65c43dd486 100644 --- a/src/USER-INTEL/pppm_disp_intel.h +++ b/src/USER-INTEL/pppm_disp_intel.h @@ -1,238 +1,238 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. 
-------------------------------------------------------------------------- */ - -/* ---------------------------------------------------------------------- - Contributing authors: William McDoniel (RWTH Aachen University) -------------------------------------------------------------------------- */ - -#ifdef KSPACE_CLASS - -KSpaceStyle(pppm/disp/intel,PPPMDispIntel) - -#else - -#ifndef LMP_PPPMINTEL_DISP_H -#define LMP_PPPMINTEL_DISP_H - -#include "pppm_disp.h" -#include "fix_intel.h" - -namespace LAMMPS_NS { - - class PPPMDispIntel : public PPPMDisp { - public: - PPPMDispIntel(class LAMMPS *, int, char **); - virtual ~PPPMDispIntel(); - virtual void init(); - virtual void compute(int, int); - - #ifdef _LMP_INTEL_OFFLOAD - int use_base(); - #endif - - protected: - FixIntel *fix; - - int _use_lrt; - FFT_SCALAR **perthread_density; - FFT_SCALAR *particle_ekx; - FFT_SCALAR *particle_eky; - FFT_SCALAR *particle_ekz; - FFT_SCALAR *particle_ekx0; - FFT_SCALAR *particle_eky0; - FFT_SCALAR *particle_ekz0; - FFT_SCALAR *particle_ekx1; - FFT_SCALAR *particle_eky1; - FFT_SCALAR *particle_ekz1; - FFT_SCALAR *particle_ekx2; - FFT_SCALAR *particle_eky2; - FFT_SCALAR *particle_ekz2; - FFT_SCALAR *particle_ekx3; - FFT_SCALAR *particle_eky3; - FFT_SCALAR *particle_ekz3; - FFT_SCALAR *particle_ekx4; - FFT_SCALAR *particle_eky4; - FFT_SCALAR *particle_ekz4; - FFT_SCALAR *particle_ekx5; - FFT_SCALAR *particle_eky5; - FFT_SCALAR *particle_ekz5; - FFT_SCALAR *particle_ekx6; - FFT_SCALAR *particle_eky6; - FFT_SCALAR *particle_ekz6; - - - - int _use_table; - int rho_points; - FFT_SCALAR **rho_lookup; - FFT_SCALAR **rho6_lookup; - FFT_SCALAR **drho_lookup; - FFT_SCALAR **drho6_lookup; - FFT_SCALAR half_rho_scale, half_rho_scale_plus; - - int _use_packing; - - - #ifdef _LMP_INTEL_OFFLOAD - int _use_base; - #endif - - template<class flt_t, class acc_t> - void particle_map(double, double, double, - double, int **, int, int, - int, int, int, - int, int, int, - IntelBuffers<flt_t,acc_t> *buffers); - - template<class flt_t, class acc_t, int use_table> - void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - make_rho_c<flt_t,acc_t,1>(buffers); - } else { - make_rho_c<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - make_rho_g<flt_t,acc_t,1>(buffers); - } else { - make_rho_g<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - make_rho_a<flt_t,acc_t,1>(buffers); - } else { - make_rho_a<flt_t,acc_t,0>(buffers); - } - } - - - template<class flt_t, class acc_t, int use_table> - void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - make_rho_none<flt_t,acc_t,1>(buffers); - } else { - make_rho_none<flt_t,acc_t,0>(buffers); - } - } - - - template<class flt_t, class acc_t, int use_table> - void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - 
fieldforce_c_ik<flt_t,acc_t,1>(buffers); - } else { - fieldforce_c_ik<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_c_ad<flt_t,acc_t,1>(buffers); - } else { - fieldforce_c_ad<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_g_ik<flt_t,acc_t,1>(buffers); - } else { - fieldforce_g_ik<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_g_ad<flt_t,acc_t,1>(buffers); - } else { - fieldforce_g_ad<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_a_ik<flt_t,acc_t,1>(buffers); - } else { - fieldforce_a_ik<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_a_ad<flt_t,acc_t,1>(buffers); - } else { - fieldforce_a_ad<flt_t,acc_t,0>(buffers); - } - } - template<class flt_t, class acc_t, int use_table> - void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_none_ik<flt_t,acc_t,1>(buffers); - } else { - fieldforce_none_ik<flt_t,acc_t,0>(buffers); - } - } - - template<class flt_t, class acc_t, int use_table> - void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers); - template<class flt_t, class acc_t> - void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) { - if (_use_table == 1) { - fieldforce_none_ad<flt_t,acc_t,1>(buffers); - } else { - fieldforce_none_ad<flt_t,acc_t,0>(buffers); - } - } - - void precompute_rho(); - - }; - -} -#endif -#endif - - +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: William McDoniel (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/disp/intel,PPPMDispIntel) + +#else + +#ifndef LMP_PPPMINTEL_DISP_H +#define LMP_PPPMINTEL_DISP_H + +#include "pppm_disp.h" +#include "fix_intel.h" + +namespace LAMMPS_NS { + + class PPPMDispIntel : public PPPMDisp { + public: + PPPMDispIntel(class LAMMPS *, int, char **); + virtual ~PPPMDispIntel(); + virtual void init(); + virtual void compute(int, int); + + #ifdef _LMP_INTEL_OFFLOAD + int use_base(); + #endif + + protected: + FixIntel *fix; + + int _use_lrt; + FFT_SCALAR **perthread_density; + FFT_SCALAR *particle_ekx; + FFT_SCALAR *particle_eky; + FFT_SCALAR *particle_ekz; + FFT_SCALAR *particle_ekx0; + FFT_SCALAR *particle_eky0; + FFT_SCALAR *particle_ekz0; + FFT_SCALAR *particle_ekx1; + FFT_SCALAR *particle_eky1; + FFT_SCALAR *particle_ekz1; + FFT_SCALAR *particle_ekx2; + FFT_SCALAR *particle_eky2; + FFT_SCALAR *particle_ekz2; + FFT_SCALAR *particle_ekx3; + FFT_SCALAR *particle_eky3; + FFT_SCALAR *particle_ekz3; + FFT_SCALAR *particle_ekx4; + FFT_SCALAR *particle_eky4; + FFT_SCALAR *particle_ekz4; + FFT_SCALAR *particle_ekx5; + FFT_SCALAR *particle_eky5; + FFT_SCALAR *particle_ekz5; + FFT_SCALAR *particle_ekx6; + FFT_SCALAR *particle_eky6; + FFT_SCALAR *particle_ekz6; + + + + int _use_table; + int rho_points; + FFT_SCALAR **rho_lookup; + FFT_SCALAR **rho6_lookup; + FFT_SCALAR **drho_lookup; + FFT_SCALAR **drho6_lookup; + FFT_SCALAR half_rho_scale, half_rho_scale_plus; + + int _use_packing; + + + #ifdef _LMP_INTEL_OFFLOAD + int _use_base; + #endif + + template<class flt_t, class acc_t> + void particle_map(double, double, double, + double, int **, int, int, + int, int, int, + int, int, int, + IntelBuffers<flt_t,acc_t> *buffers); + + template<class flt_t, class acc_t, int use_table> + void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_c(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_c<flt_t,acc_t,1>(buffers); + } else { + make_rho_c<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_g(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_g<flt_t,acc_t,1>(buffers); + } else { + make_rho_g<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_a(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_a<flt_t,acc_t,1>(buffers); + } else { + make_rho_a<flt_t,acc_t,0>(buffers); + } + } + + + template<class flt_t, class acc_t, int use_table> + void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void make_rho_none(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + make_rho_none<flt_t,acc_t,1>(buffers); + } else { + make_rho_none<flt_t,acc_t,0>(buffers); + } + } + + + template<class flt_t, class acc_t, int use_table> + void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_c_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + 
fieldforce_c_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_c_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_c_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_c_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_c_ad<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_g_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_g_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_g_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_g_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_g_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_g_ad<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_a_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_a_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_a_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_a_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_a_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_a_ad<flt_t,acc_t,0>(buffers); + } + } + template<class flt_t, class acc_t, int use_table> + void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_none_ik(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_none_ik<flt_t,acc_t,1>(buffers); + } else { + fieldforce_none_ik<flt_t,acc_t,0>(buffers); + } + } + + template<class flt_t, class acc_t, int use_table> + void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers); + template<class flt_t, class acc_t> + void fieldforce_none_ad(IntelBuffers<flt_t,acc_t> *buffers) { + if (_use_table == 1) { + fieldforce_none_ad<flt_t,acc_t,1>(buffers); + } else { + fieldforce_none_ad<flt_t,acc_t,0>(buffers); + } + } + + void precompute_rho(); + + }; + +} +#endif +#endif + + diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index 42bdec46ee..8416b6f3a3 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -14,7 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) - Markus Hoehnerbach (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -62,10 +62,10 @@ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) perthread_density = NULL; particle_ekx = particle_eky = particle_ekz = NULL; - + rho_lookup = drho_lookup = NULL; rho_points = 0; - + vdxy_brick = vdz0_brick = NULL; work3 = NULL; cg_pack = NULL; @@ -120,20 +120,20 @@ void PPPMIntel::init() if ((comm->nthreads > 1) && !_use_lrt) { memory->destroy(perthread_density); memory->create(perthread_density, comm->nthreads-1, - ngrid + INTEL_P3M_ALIGNED_MAXORDER, + ngrid + INTEL_P3M_ALIGNED_MAXORDER, "pppmintel:perthread_density"); } - + _use_table = fix->pppm_table(); if (_use_table) { rho_points = 5000; memory->destroy(rho_lookup); memory->create(rho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmintel:rho_lookup"); + "pppmintel:rho_lookup"); if(differentiation_flag == 1) { memory->destroy(drho_lookup); memory->create(drho_lookup, rho_points, INTEL_P3M_ALIGNED_MAXORDER, - "pppmintel:drho_lookup"); + "pppmintel:drho_lookup"); } precompute_rho(); } @@ -141,7 +141,7 @@ void PPPMIntel::init() if (order > INTEL_P3M_MAXORDER) error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); - _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) + _use_packing = (order == 7) && (INTEL_VECTOR_WIDTH == 16) && (sizeof(FFT_SCALAR) == sizeof(float)) && (differentiation_flag == 0); if (_use_packing) { @@ -149,13 +149,13 @@ void PPPMIntel::init() memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); memory->destroy3d_offset(vdxy_brick, nzlo_out, nylo_out, 2*nxlo_out); - memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, - nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, - "pppmintel:vdxy_brick"); + memory->create3d_offset(vdxy_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdxy_brick"); memory->destroy3d_offset(vdz0_brick, nzlo_out, nylo_out, 2*nxlo_out); - memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, - nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, - "pppmintel:vdz0_brick"); + memory->create3d_offset(vdz0_brick, nzlo_out, nzhi_out+2, + nylo_out, nyhi_out, 2*nxlo_out, 2*nxhi_out+1, + "pppmintel:vdz0_brick"); memory->destroy(work3); memory->create(work3, 2*nfft_both, "pppmintel:work3"); @@ -163,10 +163,10 @@ void PPPMIntel::init() delete cg_pack; int (*procneigh)[2] = comm->procneigh; cg_pack = new GridComm(lmp,world,2,0, 2*nxlo_in,2*nxhi_in+1,nylo_in, - nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1, - nylo_out,nyhi_out,nzlo_out,nzhi_out, - procneigh[0][0],procneigh[0][1],procneigh[1][0], - procneigh[1][1],procneigh[2][0],procneigh[2][1]); + nyhi_in,nzlo_in,nzhi_in, 2*nxlo_out,2*nxhi_out+1, + nylo_out,nyhi_out,nzlo_out,nzhi_out, + procneigh[0][0],procneigh[0][1],procneigh[1][0], + procneigh[1][1],procneigh[2][0],procneigh[2][1]); cg_pack->ghost_notify(); cg_pack->setup(); @@ -484,7 +484,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) { const int nix = nxhi_out - nxlo_out + 1; const int niy = nyhi_out - nylo_out + 1; - + const flt_t lo0 = boxlo[0]; const flt_t lo1 = boxlo[1]; const flt_t lo2 = boxlo[2]; @@ -503,7 +503,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) memset(my_density, 0, ngrid * sizeof(FFT_SCALAR)); for (int i = ifrom; i < ito; i++) { - + int nx = part2grid[i][0]; int ny = part2grid[i][1]; int nz = part2grid[i][2]; @@ -515,9 +515,9 @@ void 
PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; - + _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - + if (use_table) { dx = dx*half_rho_scale + half_rho_scale_plus; int idx = dx; @@ -527,7 +527,7 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) int idz = dz; #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; rho[1][k] = rho_lookup[idy][k]; @@ -536,11 +536,11 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) } else { #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; r1 = r2 = r3 = ZEROF; - + for (int l = order-1; l >= 0; l--) { r1 = rho_coeff[l][k] + r1*dx; r2 = rho_coeff[l][k] + r2*dy; @@ -551,24 +551,24 @@ void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers) rho[2][k-nlower] = r3; } } - + FFT_SCALAR z0 = fdelvolinv * q[i]; #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int n = 0; n < order; n++) { int mz = n*nix*niy + nzsum; FFT_SCALAR y0 = z0*rho[2][n]; #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int m = 0; m < order; m++) { int mzy = m*nix + mz; FFT_SCALAR x0 = y0*rho[1][m]; #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mzyx = l + mzy; my_density[mzyx] += x0*rho[0][l]; @@ -709,21 +709,21 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int n = 0; n < order; n++) { int mz = n+nzsum; FFT_SCALAR z0 = rho2[n]; #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int m = 0; m < order; m++) { int my = m+nysum; FFT_SCALAR y0 = z0*rho1[m]; #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for (int l = 0; l < (use_packing ? 
2 : 1) * - INTEL_P3M_ALIGNED_MAXORDER; l++) { + INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l+nxsum; FFT_SCALAR x0 = y0*rho0[l]; if (use_packing) { @@ -824,13 +824,13 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) const flt_t fsf_coeff3 = sf_coeff[3]; const flt_t fsf_coeff4 = sf_coeff[4]; const flt_t fsf_coeff5 = sf_coeff[5]; - + int ifrom, ito, tid; IP_PRE_omp_range_id(ifrom, ito, tid, nlocal, nthr); _alignvar(flt_t rho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(flt_t drho[3][INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - + for (int i = ifrom; i < ito; i++) { int nx = part2grid[i][0]; int ny = part2grid[i][1]; @@ -838,11 +838,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; - + int nxsum = nx + nlower; int nysum = ny + nlower; int nzsum = nz + nlower; - + if (use_table) { dx = dx*half_rho_scale + half_rho_scale_plus; int idx = dx; @@ -852,7 +852,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) int idz = dz; #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for(int k = 0; k < INTEL_P3M_ALIGNED_MAXORDER; k++) { rho[0][k] = rho_lookup[idx][k]; rho[1][k] = rho_lookup[idy][k]; @@ -864,11 +864,11 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) } else { #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; dr1 = dr2 = dr3 = ZEROF; - + r1 = rho_coeff[order-1][k]; r2 = rho_coeff[order-1][k]; r3 = rho_coeff[order-1][k]; @@ -888,21 +888,21 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) drho[2][k-nlower] = dr3; } } - + _alignvar(FFT_SCALAR ekx[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(FFT_SCALAR eky[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; _alignvar(FFT_SCALAR ekz[INTEL_P3M_ALIGNED_MAXORDER], 64) = {0}; - + particle_ekx[i] = particle_eky[i] = particle_ekz[i] = ZEROF; - + #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int n = 0; n < order; n++) { int mz = n + nzsum; #if defined(LMP_SIMD_COMPILER) #pragma loop_count=7 - #endif + #endif for (int m = 0; m < order; m++) { int my = m + nysum; FFT_SCALAR ekx_p = rho[1][m] * rho[2][n]; @@ -910,7 +910,7 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) FFT_SCALAR ekz_p = rho[1][m] * drho[2][n]; #if defined(LMP_SIMD_COMPILER) #pragma simd - #endif + #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++) { int mx = l + nxsum; ekx[l] += drho[0][l] * ekx_p * u_brick[mz][my][mx]; @@ -919,17 +919,17 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) } } } - + #if defined(LMP_SIMD_COMPILER) #pragma simd #endif for (int l = 0; l < INTEL_P3M_ALIGNED_MAXORDER; l++){ - particle_ekx[i] += ekx[l]; - particle_eky[i] += eky[l]; - particle_ekz[i] += ekz[l]; + particle_ekx[i] += ekx[l]; + particle_eky[i] += eky[l]; + particle_ekz[i] += ekz[l]; } } - + #if defined(LMP_SIMD_COMPILER) #pragma simd #endif @@ -937,12 +937,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) particle_ekx[i] *= hx_inv; particle_eky[i] *= hy_inv; particle_ekz[i] *= hz_inv; - + // convert E-field to force - + const flt_t qfactor = fqqrd2es * q[i]; const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; - + const flt_t s1 = x[i].x * hx_inv; const flt_t s2 = x[i].y * hy_inv; const flt_t s3 = x[i].z * hz_inv; @@ -950,16 +950,16 @@ void 
PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) sf += fsf_coeff1 * sin(ffour_pi * s1); sf *= twoqsq; f[i].x += qfactor * particle_ekx[i] - fqqrd2es * sf; - + sf = fsf_coeff2 * sin(ftwo_pi * s2); sf += fsf_coeff3 * sin(ffour_pi * s2); sf *= twoqsq; f[i].y += qfactor * particle_eky[i] - fqqrd2es * sf; - + sf = fsf_coeff4 * sin(ftwo_pi * s3); sf += fsf_coeff5 * sin(ffour_pi * s3); sf *= twoqsq; - + if (slabflag != 2) f[i].z += qfactor * particle_ekz[i] - fqqrd2es * sf; } } @@ -1000,7 +1000,7 @@ void PPPMIntel::poisson_ik_intel() n = 0; for (i = 0; i < nfft; i++) { eng = s2 * greensfn[i] * (work1[n]*work1[n] + - work1[n+1]*work1[n+1]); + work1[n+1]*work1[n+1]); for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; if (eflag_global) energy += eng; n += 2; @@ -1069,10 +1069,10 @@ void PPPMIntel::poisson_ik_intel() for (j = nylo_in; j <= nyhi_in; j++) for (i = nxlo_in; i <= nxhi_in; i++) { vdxy_brick[k][j][2*i] = work2[n]; - vdxy_brick[k][j][2*i+1] = work3[n]; + vdxy_brick[k][j][2*i+1] = work3[n]; n += 2; } - + // z direction gradient n = 0; @@ -1091,7 +1091,7 @@ void PPPMIntel::poisson_ik_intel() for (j = nylo_in; j <= nyhi_in; j++) for (i = nxlo_in; i <= nxhi_in; i++) { vdz0_brick[k][j][2*i] = work2[n]; - vdz0_brick[k][j][2*i+1] = 0.; + vdz0_brick[k][j][2*i+1] = 0.; n += 2; } } @@ -1202,7 +1202,7 @@ double PPPMIntel::memory_usage() } } if (_use_packing) { - bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + bytes += 2 * (nzhi_out + 2 - nzlo_out + 1) * (nyhi_out - nylo_out + 1) * (2 * nxhi_out + 1 - 2 * nxlo_out + 1) * sizeof(FFT_SCALAR); bytes -= 3 * (nxhi_out - nxlo_out + 1) * (nyhi_out - nylo_out + 1) * (nzhi_out - nzlo_out + 1) * sizeof(FFT_SCALAR); @@ -1228,7 +1228,7 @@ void PPPMIntel::pack_buffers() { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, - packthreads, + packthreads, sizeof(IntelBuffers<float,double>::atom_t)); if (fix->precision() == FixIntel::PREC_MODE_MIXED) fix->get_mixed_buffers()->thr_pack(ifrom,ito,1); diff --git a/src/USER-INTEL/pppm_intel.h b/src/USER-INTEL/pppm_intel.h index d48a6b709e..e152486b29 100644 --- a/src/USER-INTEL/pppm_intel.h +++ b/src/USER-INTEL/pppm_intel.h @@ -14,7 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: William McDoniel (RWTH Aachen University) Rodrigo Canales (RWTH Aachen University) - Markus Hoehnerbach (RWTH Aachen University) + Markus Hoehnerbach (RWTH Aachen University) W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ @@ -77,7 +77,7 @@ class PPPMIntel : public PPPM { template<class flt_t, class acc_t> void test_function(IntelBuffers<flt_t,acc_t> *buffers); - + void precompute_rho(); template<class flt_t, class acc_t> void particle_map(IntelBuffers<flt_t,acc_t> *buffers); diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index b44870e9b0..81f4586143 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -51,7 +51,7 @@ VerletLRTIntel::VerletLRTIntel(LAMMPS *lmp, int narg, char **arg) : /* ---------------------------------------------------------------------- */ -VerletLRTIntel::~VerletLRTIntel() +VerletLRTIntel::~VerletLRTIntel() { #if defined(_LMP_INTEL_LRT_PTHREAD) pthread_mutex_destroy(&_kmutex); @@ -67,10 +67,10 @@ void VerletLRTIntel::init() Verlet::init(); _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0)); - + #ifdef LMP_INTEL_NOLRT - error->all(FLERR, - "LRT otion for Intel package disabled at compile time"); + error->all(FLERR, + "LRT otion for Intel package disabled at compile time"); #endif } @@ -83,7 +83,7 @@ void VerletLRTIntel::setup(int flag) if (_intel_kspace == 0) { Verlet::setup(flag); return; - } + } #ifdef _LMP_INTEL_OFFLOAD if (_intel_kspace->use_base()) { @@ -154,15 +154,15 @@ void VerletLRTIntel::setup(int flag) _intel_kspace->setup(); #if defined(_LMP_INTEL_LRT_PTHREAD) - pthread_create(&_kspace_thread, &_kspace_attr, - &VerletLRTIntel::k_launch_loop, this); + pthread_create(&_kspace_thread, &_kspace_attr, + &VerletLRTIntel::k_launch_loop, this); #elif defined(_LMP_INTEL_LRT_11) std::thread kspace_thread; - if (kspace_compute_flag) - _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, + if (kspace_compute_flag) + _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, vflag); }); - else - _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, + else + _kspace_thread=std::thread([=]{ _intel_kspace->compute_dummy(eflag, vflag); }); #endif @@ -297,8 +297,8 @@ void VerletLRTIntel::run(int n) pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) std::thread kspace_thread; - if (kspace_compute_flag) - kspace_thread=std::thread([=] { + if (kspace_compute_flag) + kspace_thread=std::thread([=] { _intel_kspace->compute_first(eflag, vflag); timer->stamp(Timer::KSPACE); } ); @@ -329,7 +329,7 @@ void VerletLRTIntel::run(int n) _kspace_done = 0; pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - if (kspace_compute_flag) + if (kspace_compute_flag) kspace_thread.join(); #endif @@ -367,7 +367,7 @@ void VerletLRTIntel::run(int n) } #if defined(_LMP_INTEL_LRT_PTHREAD) - if (run_cancelled) + if (run_cancelled) pthread_cancel(_kspace_thread); else { pthread_mutex_lock(&_kmutex); @@ -390,9 +390,9 @@ void * VerletLRTIntel::k_launch_loop(void *context) { VerletLRTIntel * const c = (VerletLRTIntel *)context; - if (c->kspace_compute_flag) + if (c->kspace_compute_flag) c->_intel_kspace->compute_first(c->eflag, c->vflag); - else + else c->_intel_kspace->compute_dummy(c->eflag, c->vflag); pthread_mutex_lock(&(c->_kmutex)); @@ -408,7 +408,7 @@ void * VerletLRTIntel::k_launch_loop(void *context) pthread_mutex_unlock(&(c->_kmutex)); for (int i = 0; i < n; i++) { - + if (c->kspace_compute_flag) { c->_intel_kspace->compute_first(c->eflag, c->vflag); c->timer->stamp(Timer::KSPACE); -- GitLab
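
Illustration (not part of the patch above): the use_table branches and precompute_rho() in the hunks above trade a per-atom Horner evaluation of the stencil-weight polynomials for a one-time table precomputation plus a lookup per atom. The standalone C++ sketch below shows that idea only; ORDER, RHO_POINTS, rho_coeff and the other names are assumptions chosen for this example, not the LAMMPS USER-INTEL identifiers, and the code is a minimal sketch rather than the package's implementation.

    /* Minimal sketch of the lookup-table idea, under assumed names and sizes.
       Not LAMMPS code: ORDER plays the role of the interpolation order,
       RHO_POINTS the table resolution, rho_coeff the weight polynomials. */
    #include <cstdio>
    #include <vector>

    constexpr int ORDER      = 7;     // assumed interpolation order
    constexpr int RHO_POINTS = 5000;  // assumed table resolution

    double rho_coeff[ORDER][ORDER];   // assumed polynomial coefficients

    // Direct path: Horner evaluation of all ORDER weights for an offset dx.
    void weights_direct(double dx, double *w) {
      for (int k = 0; k < ORDER; k++) {
        double r = rho_coeff[ORDER - 1][k];
        for (int l = ORDER - 2; l >= 0; l--) r = rho_coeff[l][k] + r * dx;
        w[k] = r;
      }
    }

    // Table path: sample dx over [-1, 1] once, then answer queries by lookup.
    std::vector<double> rho_lookup(RHO_POINTS * ORDER);
    const double half_scale = (RHO_POINTS - 1) / 2.0;

    void precompute_table() {
      for (int i = 0; i < RHO_POINTS; i++) {
        double dx = -1.0 + i / half_scale;            // table abscissa in [-1, 1]
        weights_direct(dx, &rho_lookup[i * (long)ORDER]);
      }
    }

    void weights_from_table(double dx, double *w) {
      int idx = (int)(dx * half_scale + half_scale + 0.5);   // nearest table row
      for (int k = 0; k < ORDER; k++) w[k] = rho_lookup[idx * (long)ORDER + k];
    }

    int main() {
      for (int l = 0; l < ORDER; l++)        // dummy coefficients so the
        for (int k = 0; k < ORDER; k++)      // example runs standalone
          rho_coeff[l][k] = 0.01 * (l + 1) * (k + 1);
      precompute_table();
      double a[ORDER], b[ORDER];
      weights_direct(0.25, a);
      weights_from_table(0.25, b);
      printf("direct %.6f  table %.6f\n", a[3], b[3]);   // agree up to table spacing
      return 0;
    }

The table answer differs from the direct answer only by the discretization of dx into RHO_POINTS rows, which is the accuracy/speed trade the precompute_rho() comment in the patch describes.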