From 2fe0eabc0988feda90ac85ef1331dcd83ad028ac Mon Sep 17 00:00:00 2001 From: Michael Brown <michael.w.brown@intel.com> Date: Fri, 22 Jun 2018 01:52:55 -0700 Subject: [PATCH] Intel Package: Bug fix when using LRT with minimize. Bug fix for virial calculation when using GNU compilers. Most of framework for adding hybrid support is now in place. --- src/USER-INTEL/fix_intel.h | 3 +- src/USER-INTEL/intel_preprocess.h | 53 +++- src/USER-INTEL/npair_full_bin_ghost_intel.cpp | 195 ++++++------ src/USER-INTEL/npair_intel.cpp | 287 ++++++++++-------- src/USER-INTEL/pair_buck_coul_cut_intel.cpp | 10 +- src/USER-INTEL/pair_buck_coul_long_intel.cpp | 10 +- src/USER-INTEL/pair_buck_intel.cpp | 10 +- src/USER-INTEL/pair_dpd_intel.cpp | 9 +- src/USER-INTEL/pair_eam_intel.cpp | 29 +- src/USER-INTEL/pair_gayberne_intel.cpp | 28 +- .../pair_lj_charmm_coul_charmm_intel.cpp | 11 +- .../pair_lj_charmm_coul_long_intel.cpp | 11 +- .../pair_lj_cut_coul_long_intel.cpp | 10 +- src/USER-INTEL/pair_lj_cut_intel.cpp | 10 +- src/USER-INTEL/pair_sw_intel.cpp | 44 +-- src/USER-INTEL/pair_tersoff_intel.cpp | 15 +- src/USER-INTEL/verlet_lrt_intel.cpp | 2 +- src/USER-INTEL/verlet_lrt_intel.h | 22 +- 18 files changed, 450 insertions(+), 309 deletions(-) diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index d7093e79bb..79fb23ab3d 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -82,7 +82,8 @@ class FixIntel : public Fix { } inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; } inline int lrt() { - if (force->kspace_match("pppm/intel", 0)) return _lrt; + if (force->kspace_match("pppm/intel", 0) && update->whichflag == 1) + return _lrt; else return 0; } inline int pppm_table() { diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index 29cc45f755..2f1b02a49f 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -134,7 +134,21 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, #define INTEL_HTHREADS 2 #endif -#define IP_PRE_get_stride(stride, n, datasize, torque) \ +#if INTEL_DATA_ALIGN > 1 + +#define IP_PRE_edge_align(n, esize) \ + { \ + const int pad_mask = ~static_cast<int>(INTEL_DATA_ALIGN/esize-1); \ + n = (n + INTEL_DATA_ALIGN / esize - 1) & pad_mask; \ + } + +#else + +#define IP_PRE_edge_align(n, esize) \ + +#endif + +#define IP_PRE_get_stride(stride, n, datasize, torque) \ { \ int blength = n; \ if (torque) blength *= 2; \ @@ -303,7 +317,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, { \ tid = 0; \ ifrom = 0; \ - ip = 1; \ + ip = vecsize; \ ito = inum; \ } @@ -316,7 +330,7 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR, acc_t *f_scalar = &f_start[0].x; \ flt_t *x_scalar = &pos[minlocal].x; \ int f_stride4 = f_stride * 4; \ - _alignvar(acc_t ovv[INTEL_COMPILE_WIDTH],64); \ + _alignvar(acc_t ovv[16],64); \ int vwidth; \ if (sizeof(acc_t) == sizeof(double)) \ vwidth = INTEL_COMPILE_WIDTH/2; \ @@ -516,6 +530,22 @@ inline double MIC_Wtime() { return time; } +#define IP_PRE_neighbor_pad(jnum, offload) \ +{ \ + const int opad_mask = ~static_cast<int>(INTEL_MIC_NBOR_PAD * \ + sizeof(float) / \ + sizeof(flt_t) - 1); \ + const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD * \ + sizeof(float) / \ + sizeof(flt_t) - 1); \ + if (offload && INTEL_MIC_NBOR_PAD > 1) \ + jnum = (jnum + INTEL_MIC_NBOR_PAD * sizeof(float) / \ + sizeof(flt_t) - 1) & opad_mask; \ + else if (INTEL_NBOR_PAD > 1) \ + jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) / \ + sizeof(flt_t) - 1) & pad_mask; \ +} + #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ nlocal, nall) \ { \ @@ -644,6 +674,23 @@ inline double MIC_Wtime() { #else +#if INTEL_NBOR_PAD > 1 + +#define IP_PRE_neighbor_pad(jnum, offload) \ +{ \ + const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD * \ + sizeof(float) / \ + sizeof(flt_t) - 1); \ + jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) / \ + sizeof(flt_t) - 1) & pad_mask; \ +} + +#else + +#define IP_PRE_neighbor_pad(jnum, offload) + +#endif + #define MIC_Wtime MPI_Wtime #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \ nlocal, nall) diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 24d5076c34..ae961e84b5 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -112,7 +112,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, if (pend-pstart == 0) return; const int nall = atom->nlocal + atom->nghost; - int pad = 1; int nall_t = nall; const int aend = nall; @@ -207,6 +206,17 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, const int mbinz = this->mbinz; const int * const stencilxyz = &this->stencilxyz[0][0]; + int sb = 1; + if (special_flag[1] == 0) { + sb = 2; + if (special_flag[2] == 0) { + sb = 3; + if (special_flag[3] == 0) + sb = 4; + } + } + const int special_bound = sb; + #ifdef _LMP_INTEL_OFFLOAD const int * _noalias const binhead = this->binhead; const int * _noalias const bins = this->bins; @@ -230,7 +240,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ - in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny) \ + in(separate_buffers,aend,nlocal,molecular,ntypes,mbinx,mbiny,special_bound)\ in(mbinz,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ in(stencilxyz:length(3*nstencil)) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ @@ -287,8 +297,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, int e_ito = ito; const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - int which; - int pack_offset = maxnbors; int ct = (ifrom + tid * 2) * maxnbors; int *neighptr = firstneigh + ct; @@ -418,41 +426,109 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, } } // for u + if (molecular && i < nlocal) { + int alln = n; + n = 0; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #ifdef LMP_INTEL_NBOR_COMPAT + #pragma ivdep + #else + #pragma simd + #endif + #endif + for (int u = 0; u < alln; u++) { + int which; + int addme = 1; + int j = neighptr[u]; + if (need_ic && j < 0) { + which = 0; + j = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + if (which) { + j = j ^ (which << SBBITS); + if (which < special_bound) addme = 0; + } + #ifdef LMP_INTEL_NBOR_COMPAT + if (addme) neighptr2[n++] = j; + #else + neighptr2[n++] = j; + #endif + } + alln = n2; + n2 = maxnbors * 2; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #ifdef LMP_INTEL_NBOR_COMPAT + #pragma ivdep + #else + #pragma simd + #endif + #endif + for (int u = n2; u < alln; u++) { + int which; + int addme = 1; + int j = neighptr[u]; + if (need_ic && j < 0) { + which = 0; + j = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + if (which) { + j = j ^ (which << SBBITS); + if (which < special_bound) addme = 0; + } + #ifdef LMP_INTEL_NBOR_COMPAT + if (addme) neighptr2[n2++] = j; + #else + neighptr2[n2++] = j; + #endif + } + } + #ifndef _LMP_INTEL_OFFLOAD if (exclude) { int alln = n; n = maxnbors; - for (int u = pack_offset; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n++] = j; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = n; u < alln; u++) { + int addme = 1; + const int js = neighptr[u]; + const int j = js & NEIGHMASK; + const int jtype = x[j].w; + if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0; + if (addme) neighptr2[n++] = js; } alln = n2; n2 = maxnbors * 2; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif for (int u = n2; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n2++] = j; + int addme = 1; + const int js = neighptr[u]; + const int j = js & NEIGHMASK; + const int jtype = x[j].w; + if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0; + if (addme) neighptr2[n2++] = js; } } #endif + int ns = n - maxnbors; int alln = n; atombin[i] = ns; n = 0; for (int u = maxnbors; u < alln; u++) - neighptr[n++] = neighptr[u]; + neighptr[n++] = neighptr2[u]; ns += n2 - maxnbors * 2; for (int u = maxnbors * 2; u < n2; u++) - neighptr[n++] = neighptr[u]; + neighptr[n++] = neighptr2[u]; if (ns > maxnbors) *overflow = 1; ilist[i] = i; @@ -460,9 +536,7 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, numneigh[i] = ns; ct += ns; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = ct & (alignb - 1); - if (edge) ct += alignb - edge; + IP_PRE_edge_align(ct, sizeof(int)); neighptr = firstneigh + ct; if (ct + obound > list_size) { if (i < ito - 1) { @@ -477,84 +551,11 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, numneigh[i] = 0; #ifdef _LMP_INTEL_OFFLOAD - int ghost_offset = 0, nall_offset = e_nall; if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - #if __INTEL_COMPILER+0 > 1499 - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - int j = jlist[jj]; - if (need_ic && j < 0) j = -j - 1; - } - } - overflow[LMP_LOCAL_MIN] = 0; overflow[LMP_LOCAL_MAX] = nlocal - 1; overflow[LMP_GHOST_MIN] = nlocal; overflow[LMP_GHOST_MAX] = e_nall - 1; - - int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN]; - if (nghost < 0) nghost = 0; - if (offload) { - ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1; - nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost; - } else { - ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; - nall_offset = nlocal + nghost; - } - } // if separate_buffers - #endif - - if (molecular) { - int ito_m = ito; - if (ito >= nlocal) ito_m = nlocal; - for (int i = ifrom; i < ito_m; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } // for i - } // if molecular - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - int jj = 0; - #pragma vector aligned - #pragma simd - for (jj = 0; jj < jnum; jj++) { - if (jlist[jj] >= nlocal) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; - } - } - } } #endif } // end omp diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index d3d2745aee..355c8db199 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -62,19 +62,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (aend-astart == 0) return; const int nall = atom->nlocal + atom->nghost; - int pad = 1; int nall_t = nall; #ifdef _LMP_INTEL_OFFLOAD if (offload_noghost && offload) nall_t = atom->nlocal; - if (THREE == 0 && offload) { - if (INTEL_MIC_NBOR_PAD > 1) - pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t); - } else #endif - if (THREE == 0 && INTEL_NBOR_PAD > 1) - pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); - const int pad_width = pad; + const int pack_width = _fix->nbor_pack_width(); const ATOM_T * _noalias const x = buffers->get_x(); @@ -150,6 +143,17 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, int * _noalias const ncachetag = buffers->get_ncachetag(); const int ncache_stride = buffers->ncache_stride(); + int sb = 1; + if (special_flag[1] == 0) { + sb = 2; + if (special_flag[2] == 0) { + sb = 3; + if (special_flag[3] == 0) + sb = 4; + } + } + const int special_bound = sb; + #ifdef _LMP_INTEL_OFFLOAD const int * _noalias const binhead = this->binhead; const int * _noalias const bins = this->bins; @@ -172,9 +176,9 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, in(ncachex,ncachey,ncachez,ncachej:length(0) alloc_if(0) free_if(0)) \ in(ncachejtype,ncachetag:length(0) alloc_if(0) free_if(0)) \ in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ - in(pad_width,offload_end,separate_buffers,astart,aend,nlocal,molecular) \ + in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \ in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ - in(pack_width) \ + in(pack_width,special_bound) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(tag) @@ -226,19 +230,25 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ifrom += astart; ito += astart; int e_ito = ito; + #ifdef LMP_INTEL_3BODY_FAST if (THREE && ito == num) { int imod = ito & (pack_width - 1); if (imod) e_ito += pack_width - imod; } + #endif const int list_size = (e_ito + tid * 2 + 2) * maxnbors; - int which; - - int pack_offset = maxnbors; - if (THREE) pack_offset *= pack_width; + #ifdef LMP_INTEL_3BODY_FAST + const int pack_offset = maxnbors * pack_width; + const int obound = pack_offset + maxnbors * 2; + #else + const int pack_offset = 0; + const int obound = maxnbors * 3; + #endif int ct = (ifrom + tid * 2) * maxnbors; int *neighptr = firstneigh + ct; - const int obound = pack_offset + maxnbors * 2; + int *neighptr2; + if (THREE) neighptr2 = neighptr; const int toffs = tid * ncache_stride; flt_t * _noalias const tx = ncachex + toffs; @@ -256,10 +266,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, // loop over all atoms in other bins in stencil, store every pair int istart, icount, ncount, oldbin = -9999999, lane, max_chunk; + #ifdef LMP_INTEL_3BODY_FAST if (THREE) { lane = 0; max_chunk = 0; } + #endif for (int i = ifrom; i < ito; i++) { const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; @@ -298,9 +310,7 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (FULL == 0 || TRI == 1) { icount = 0; istart = ncount; - const int alignb = INTEL_DATA_ALIGN / sizeof(int); - int nedge = istart & (alignb - 1); - if (nedge) istart + (alignb - nedge); + IP_PRE_edge_align(istart, sizeof(int)); itx = tx + istart; ity = ty + istart; itz = tz + istart; @@ -383,11 +393,12 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, // ---------------------- Loop over other bins - int n2, *neighptr2; + int n2; if (THREE) { + #ifdef LMP_INTEL_3BODY_FAST n = pack_offset; + #endif n2 = pack_offset + maxnbors; - neighptr2 = neighptr; } #if defined(LMP_SIMD_COMPILER) #pragma vector aligned @@ -461,49 +472,133 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } } // for u + if (molecular) { + if (!THREE) neighptr2 = neighptr; + int alln = n; + + n = pack_offset; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #ifdef LMP_INTEL_NBOR_COMPAT + #pragma ivdep + #else + #pragma simd + #endif + #endif + for (int u = n; u < alln; u++) { + int which; + int addme = 1; + int j = neighptr[u]; + if (need_ic && j < 0) { + which = 0; + j = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + + if (which) { + j = j ^ (which << SBBITS); + if (which < special_bound) addme = 0; + } + #ifdef LMP_INTEL_NBOR_COMPAT + if (addme) neighptr2[n++] = j; + #else + neighptr2[n++]=j; + #endif + } + + if (THREE) { + alln = n2; + n2 = pack_offset + maxnbors; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #ifdef LMP_INTEL_NBOR_COMPAT + #pragma ivdep + #else + #pragma simd + #endif + #endif + for (int u = n2; u < alln; u++) { + int which; + int addme = 1; + int j = neighptr[u]; + if (need_ic && j < 0) { + which = 0; + j = -j - 1; + } else + ofind_special(which, special, nspecial, i, tag[j]); + if (which) { + j = j ^ (which << SBBITS); + if (which < special_bound) addme = 0; + } + #ifdef LMP_INTEL_NBOR_COMPAT + if (addme) neighptr2[n2++] = j; + #else + neighptr2[n2++]=j; + #endif + } + } + } + #ifndef _LMP_INTEL_OFFLOAD if (exclude) { + neighptr2 = neighptr; int alln = n; - if (THREE) n = pack_offset; - else n = 0; - for (int u = pack_offset; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n++] = j; + n = pack_offset; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = n; u < alln; u++) { + int addme = 1; + const int js = neighptr[u]; + const int j = js & NEIGHMASK; + const int jtype = x[j].w; + if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0; + if (addme) neighptr2[n++] = js; } if (THREE) { alln = n2; n2 = pack_offset + maxnbors; - for (int u = pack_offset + maxnbors; u < alln; u++) { - const int j = neighptr[u]; - int pj = j; - if (need_ic) - if (pj < 0) pj = -j - 1; - const int jtype = x[pj].w; - if (exclusion(i,pj,itype,jtype,mask,molecule)) continue; - neighptr[n2++] = j; + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int u = n2; u < alln; u++) { + int addme = 1; + const int js = neighptr[u]; + const int j = js & NEIGHMASK; + const int jtype = x[j].w; + if (exclusion(i,j,itype,jtype,mask,molecule)) addme = 0; + if (addme) neighptr2[n2++] = js; } } } #endif + int ns; if (THREE) { int alln = n; ns = n - pack_offset; atombin[i] = ns; + ns += n2 - pack_offset - maxnbors; + + #ifdef LMP_INTEL_3BODY_FAST n = lane; for (int u = pack_offset; u < alln; u++) { - neighptr[n] = neighptr[u]; + neighptr[n] = neighptr2[u]; n += pack_width; } - ns += n2 - pack_offset - maxnbors; + #endif + for (int u = pack_offset + maxnbors; u < n2; u++) { - neighptr[n] = neighptr[u]; + #ifdef LMP_INTEL_3BODY_FAST + neighptr[n] = neighptr2[u]; n += pack_width; + #else + neighptr[n++] = neighptr2[u]; + #endif } if (ns > maxnbors) *overflow = 1; } else @@ -512,34 +607,33 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ilist[i] = i; cnumneigh[i] = ct; if (THREE) { + #ifdef LMP_INTEL_3BODY_FAST cnumneigh[i] += lane; + #endif numneigh[i] = ns; } else { - int edge = n & (pad_width - 1); - if (edge) { - const int pad_end = n + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ - avg=INTEL_COMPILE_WIDTH/2 - #endif - for ( ; n < pad_end; n++) - neighptr[n] = e_nall; - } numneigh[i] = n; + int pad_end = n; + IP_PRE_neighbor_pad(pad_end, offload); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; n++) + neighptr[n] = e_nall; } + #ifdef LMP_INTEL_3BODY_FAST if (THREE) { if (ns > max_chunk) max_chunk = ns; lane++; if (lane == pack_width) { ct += max_chunk * pack_width; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = ct & (alignb - 1); - if (edge) ct += alignb - edge; + IP_PRE_edge_align(ct, sizeof(int)); neighptr = firstneigh + ct; + neighptr2 = neighptr; max_chunk = 0; - pack_offset = maxnbors * pack_width; lane = 0; if (ct + obound > list_size) { if (i < ito - 1) { @@ -548,12 +642,13 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, } } } - } else { + } else + #endif + { ct += n; - const int alignb = (INTEL_DATA_ALIGN / sizeof(int)); - const int edge = ct & (alignb - 1); - if (edge) ct += alignb - edge; + //IP_PRE_edge_align(ct, sizeof(int)); neighptr = firstneigh + ct; + if (THREE) neighptr2 = neighptr; if (ct + obound > list_size) { if (i < ito - 1) { *overflow = 1; @@ -573,14 +668,14 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + int jnum = numneigh[i]; + if (!THREE) IP_PRE_neighbor_pad(jnum, offload); #if __INTEL_COMPILER+0 > 1499 #pragma vector aligned #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin) #endif for (int jj = 0; jj < jnum; jj++) { - int j = jlist[jj]; - if (need_ic && j < 0) j = -j - 1; + const int j = jlist[jj] & NEIGHMASK; if (j < nlocal) { if (j < vlmin) vlmin = j; if (j > vlmax) vlmax = j; @@ -615,72 +710,20 @@ void NPairIntel::bin_newton(const int offload, NeighList *list, ghost_offset = overflow[LMP_GHOST_MIN] - nlocal; nall_offset = nlocal + nghost; } - } // if separate_buffers - #endif - if (molecular) { - for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - - if (THREE) { - const int trip = jnum * pack_width; - for (int jj = 0; jj < trip; jj+=pack_width) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } else { - #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned - #pragma simd - #endif - for (int jj = 0; jj < jnum; jj++) { - const int j = jlist[jj]; - if (need_ic && j < 0) { - which = 0; - jlist[jj] = -j - 1; - } else - ofind_special(which, special, nspecial, i, tag[j]); - #ifdef _LMP_INTEL_OFFLOAD - if (j >= nlocal) { - if (j == e_nall) - jlist[jj] = nall_offset; - else if (which) - jlist[jj] = (j-ghost_offset) ^ (which << SBBITS); - else jlist[jj]-=ghost_offset; - } else - #endif - if (which) jlist[jj] = j ^ (which << SBBITS); - } - } - } // for i - } // if molecular - #ifdef _LMP_INTEL_OFFLOAD - else if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { int * _noalias jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + int jnum = numneigh[i]; + if (!THREE) IP_PRE_neighbor_pad(jnum, offload); int jj = 0; #pragma vector aligned #pragma simd for (jj = 0; jj < jnum; jj++) { - if (jlist[jj] >= nlocal) { - if (jlist[jj] == e_nall) jlist[jj] = nall_offset; - else jlist[jj] -= ghost_offset; + const int which = jlist[jj] >> SBBITS & 3; + const int j = jlist[jj] & NEIGHMASK; + if (j >= nlocal) { + if (j == e_nall) jlist[jj] = nall_offset; + else jlist[jj] = (j - ghost_offset) ^ (which << SBBITS); } } } diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index 71aad95bc5..39fa9014db 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -142,6 +142,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -185,6 +186,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,qqrd2e,inum,nall,ntypes,vflag,eatom) \ in(f_stride,nlocal,minlocal,separate_flag,offload) \ @@ -221,15 +223,17 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, FORCE_T * _noalias const f = f_start + foff; if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index bba8a7b5e7..fe4d408a13 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -142,6 +142,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -209,6 +210,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ @@ -255,15 +257,17 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const flt_t * _noalias const rho_invi = rho_inv + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index f5dde26393..8ce3d121e0 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -133,6 +133,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -169,6 +170,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \ in(f_stride,nlocal,minlocal,separate_flag,offload) \ @@ -205,14 +207,16 @@ void PairBuckIntel::eval(const int offload, const int vflag, FORCE_T * _noalias const f = f_start + foff; if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp index cb8c06cacc..7c33ed21d3 100644 --- a/src/USER-INTEL/pair_dpd_intel.cpp +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -171,6 +171,7 @@ void PairDPDIntel::eval(const int offload, const int vflag, lmp_vt *v = (lmp_vt *)atom->v[0]; const flt_t dtinvsqrt = 1.0/sqrt(update->dt); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -237,7 +238,8 @@ void PairDPDIntel::eval(const int offload, const int vflag, gamma = param[3].gamma; sigma = param[3].sigma; } - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; int itype, ptr_off; const FC_PACKED1_T * _noalias parami; if (!ONETYPE) { @@ -246,8 +248,9 @@ void PairDPDIntel::eval(const int offload, const int vflag, parami = param + ptr_off; } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp index 94799cdba2..201277e68d 100644 --- a/src/USER-INTEL/pair_eam_intel.cpp +++ b/src/USER-INTEL/pair_eam_intel.cpp @@ -164,8 +164,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, memory->destroy(rho); memory->destroy(fp); nmax = atom->nmax; - int edge = (nmax * sizeof(acc_t)) % INTEL_DATA_ALIGN; - if (edge) nmax += (INTEL_DATA_ALIGN - edge) / sizeof(acc_t); + IP_PRE_edge_align(nmax, sizeof(acc_t)); if (NEWTON_PAIR) memory->create(rho,nmax*comm->nthreads,"pair:rho"); else @@ -192,6 +191,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -286,14 +286,16 @@ void PairEAMIntel::eval(const int offload, const int vflag, rhor_joff = rhor_ioff + _onetype * jstride; frho_ioff = fstride * _onetype; } - for (int i = iifrom; i < iito; ++i) { + for (int ii = iifrom; ii < iito; ++ii) { + const int i = ilist[ii]; int itype, rhor_ioff; if (!ONETYPE) { itype = x[i].w; rhor_ioff = istride * itype; } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; @@ -411,7 +413,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, #pragma vector aligned #pragma simd reduction(+:tevdwl) #endif - for (int i = iifrom; i < iito; ++i) { + for (int ii = iifrom; ii < iito; ++ii) { + const int i = ilist[ii]; int itype; if (!ONETYPE) itype = x[i].w; flt_t p = rho[i]*frdrho + (flt_t)1.0; @@ -457,7 +460,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, // compute forces on each atom // loop over neighbors of my atoms - for (int i = iifrom; i < iito; ++i) { + for (int ii = iifrom; ii < iito; ++ii) { + const int i = ilist[ii]; int itype, rhor_ioff; const flt_t * _noalias scale_fi; if (!ONETYPE) { @@ -465,8 +469,9 @@ void PairEAMIntel::eval(const int offload, const int vflag, rhor_ioff = istride * itype; scale_fi = scale_f + itype*ntypes; } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; @@ -761,14 +766,12 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, if (ntypes > 0) { _cop = cop; _nr = nr + 1; - int edge = (_nr * sizeof(flt_t)) % INTEL_DATA_ALIGN; - if (edge) _nr += (INTEL_DATA_ALIGN - edge) / sizeof(flt_t); + IP_PRE_edge_align(_nr, sizeof(flt_t)); memory->create(rhor_spline_f,ntypes*ntypes*_nr,"fc.rhor_spline_f"); memory->create(rhor_spline_e,ntypes*ntypes*_nr,"fc.rhor_spline_e"); memory->create(z2r_spline_t,ntypes*ntypes*_nr,"fc.z2r_spline_t"); _nrho = nrho + 1; - edge = (_nrho * sizeof(flt_t)) % INTEL_DATA_ALIGN; - if (edge) _nrho += (INTEL_DATA_ALIGN - edge) / sizeof(flt_t); + IP_PRE_edge_align(_nrho, sizeof(flt_t)); memory->create(frho_spline_f,ntypes*_nrho,"fc.frho_spline_f"); memory->create(frho_spline_e,ntypes*_nrho,"fc.frho_spline_e"); memory->create(scale_f,ntypes,ntypes,"fc.scale_f"); diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index 1f05ad0efc..30941a7594 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -226,7 +226,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } #endif - // const int * _noalias const ilist = list->ilist; + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -287,6 +287,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(quat:length(nall+1) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \ in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \ @@ -364,15 +365,16 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, int * _noalias const jlist_form = jlist_formi + tid * max_nbors; int ierror = 0; - for (int i = iifrom; i < iito; i += iip) { - // const int i = ilist[ii]; + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off; const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; @@ -428,15 +430,13 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, } else multiple_forms = true; } - const int edge = packed_j & (pad_width - 1); - if (edge) { - const int packed_end = packed_j + (pad_width - edge); - #if defined(LMP_SIMD_COMPILER) - #pragma loop_count min=1, max=15, avg=8 - #endif - for ( ; packed_j < packed_end; packed_j++) - jlist_form[packed_j] = nall; - } + int packed_end = packed_j; + IP_PRE_neighbor_pad(packed_end, offload); + #if defined(LMP_SIMD_COMPILER) + #pragma loop_count min=1, max=15, avg=8 + #endif + for ( ; packed_j < packed_end; packed_j++) + jlist_form[packed_j] = nall; // ------------------------------------------------------------- diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp index 7485395bef..8e6395c388 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -136,6 +136,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -189,6 +190,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ @@ -238,16 +240,17 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; i += iip) { - // const int i = ilist[ii]; + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const flt_t * _noalias const cutsqi = cutsq + ptr_off; const LJ_T * _noalias const lji = lj + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index 25cca59714..a01a4688a5 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -140,6 +140,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -210,6 +211,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ @@ -258,16 +260,17 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; i += iip) { - // const int i = ilist[ii]; + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const flt_t * _noalias const cutsqi = cutsq + ptr_off; const LJ_T * _noalias const lji = lj + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index cb7381270d..ab0b5b3d55 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -139,6 +139,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -204,6 +205,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ @@ -250,15 +252,17 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, int * _noalias const tj = ccachei + toffs; int * _noalias const tjtype = ccachej + toffs; - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; const int itype = x[i].w; const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index b16f6230cc..c973639709 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -148,6 +148,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const firstneigh = buffers->firstneigh(list); @@ -207,7 +208,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag, lj4 = lj34[3].lj4; offset = ljc12o[3].offset; } - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; int itype, ptr_off; const FC_PACKED1_T * _noalias ljc12oi; const FC_PACKED2_T * _noalias lj34i; @@ -217,9 +219,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag, ljc12oi = ljc12o + ptr_off; lj34i = lj34 + ptr_off; } - - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + int jnum = numneigh[ii]; + IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5; diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index 421de91ee9..d1d52270be 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -42,7 +42,6 @@ #include "suffix.h" #ifdef LMP_USE_AVXCD -#define OUTER_CHUNK 1 #include "intel_simd.h" using namespace ip_simd; #endif @@ -185,6 +184,7 @@ void PairSWIntel::eval(const int offload, const int vflag, IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneighhalf = buffers->get_atombin(); const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); @@ -230,6 +230,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ @@ -298,7 +299,8 @@ void PairSWIntel::eval(const int offload, const int vflag, } } - for (int i = iifrom; i < iito; i += iip) { + for (int ii = iifrom; ii < iito; ii += iip) { + const int i = ilist[ii]; int itype, itype_offset; const flt_t xtmp = x[i].x; const flt_t ytmp = x[i].y; @@ -309,9 +311,9 @@ void PairSWIntel::eval(const int offload, const int vflag, itype_offset = itype * ntypes; } - const int * _noalias const jlist = firstneigh + cnumneigh[i]; - const int jnum = numneigh[i]; - const int jnumhalf = numneighhalf[i]; + const int * _noalias const jlist = firstneigh + cnumneigh[ii]; + const int jnum = numneigh[ii]; + const int jnumhalf = numneighhalf[ii]; acc_t fxtmp, fytmp, fztmp, fwtmp; acc_t sevdwl; @@ -346,9 +348,13 @@ void PairSWIntel::eval(const int offload, const int vflag, } } - int ejrem = ejnum & (pad_width - 1); - if (ejrem) ejrem = pad_width - ejrem; - const int ejnum_pad = ejnum + ejrem; + int ejnum_pad = ejnum; + IP_PRE_neighbor_pad(ejnum_pad, offload); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif for (int jj = ejnum; jj < ejnum_pad; jj++) { tdelx[jj] = (flt_t)0.0; tdely[jj] = (flt_t)0.0; @@ -594,6 +600,7 @@ void PairSWIntel::eval(const int offload, const int vflag, IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall); ATOM_T * _noalias const x = buffers->get_x(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneighhalf = buffers->get_atombin(); const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); @@ -641,6 +648,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ @@ -714,24 +722,24 @@ void PairSWIntel::eval(const int offload, const int vflag, } } - SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128, 144,160,176,192,208,224,240); - ilist = ilist + iifrom; acc_t * const dforce = &(f[0].x); for (int i = iifrom; i < iito; i += iip) { - SIMD_mask imask = ilist < iito; + SIMD_int ilistv = SIMD_load(ilist + i); + SIMD_int goffset = ilistv * 16; + SIMD_mask imask = ilistv < iito; SIMD_flt_t xtmp, ytmp, ztmp; SIMD_int itype, itype_offset; if (ONETYPE) - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp); + SIMD_atom_gather(imask, &(x[0].x), goffset, xtmp, ytmp, ztmp); else { - SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype); + SIMD_atom_gather(imask, &(x[0].x), goffset, xtmp, ytmp, ztmp, itype); itype_offset = itype * ntypes; } - #ifdef OUTER_CHUNK + #ifdef LMP_INTEL_3BODY_FAST const int* ng = firstneigh + cnumneigh[i] - swidth; #else SIMD_int ng = SIMD_load(cnumneigh + i); @@ -765,7 +773,7 @@ void PairSWIntel::eval(const int offload, const int vflag, for (int jj = 0; jj < jnum_max; jj++) { SIMD_mask jmask = jj < jnum; - #ifdef OUTER_CHUNK + #ifdef LMP_INTEL_3BODY_FAST ng += swidth; SIMD_int j = SIMD_load(ng); #else @@ -1025,15 +1033,15 @@ void PairSWIntel::eval(const int offload, const int vflag, } } // for jj second loop - SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp, + SIMD_iforce_update(imask, &(f[0].x), goffset, fxtmp, fytmp, fztmp, EFLAG, eatom, fwtmp); if (is_same<flt_t,acc_t>::value == 0) { imask = imask >> 8; - SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, + SIMD_int goffset2 = _mm512_shuffle_i32x4(goffset, goffset, 238); + SIMD_iforce_update(imask, &(f[0].x), goffset2, fxtmp2, fytmp2, fztmp2, EFLAG, eatom, fwtmp2); } if (EFLAG) oevdwl += SIMD_sum(sevdwl); - ilist = ilist + iip; } // for ii IP_PRE_fdotr_reduce_omp(1, nall, minlocal, nthreads, f_start, f_stride, diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index c772546928..584b371784 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -208,6 +208,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> const int * _noalias const cnumneigh, const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, + const int * _noalias const ilist, const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, @@ -271,6 +272,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, tagint * _noalias tag = this->atom->tag; flt_t * _noalias const q = buffers->get_q(offload); + const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; const int * _noalias const cnumneigh = buffers->cnumneigh(list); const int * _noalias const numneighhalf = buffers->get_atombin(); @@ -311,6 +313,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag, in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ + in(ilist:length(0) alloc_if(0) free_if(0)) \ in(overflow:length(0) alloc_if(0) free_if(0)) \ in(astart,nthreads,inum,nall,ntypes,vflag,eatom) \ in(f_stride,nlocal,minlocal,separate_flag,offload) \ @@ -349,8 +352,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag, { acc_t sevdwl; sevdwl = 0.; - #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, cnumneigh, \ - firstneigh, ntypes, x, c_inner, c_outer, f, &sevdwl + #define ARGS iito, iifrom, eatom, vflag, numneigh, numneighhalf, \ + cnumneigh, firstneigh, ntypes, x, ilist, c_inner, \ + c_outer, f, &sevdwl // Pick the variable i algorithm under specific conditions // do use scalar algorithm with very short vectors int VL = lmp_intel::vector_routines<flt_t,acc_t,lmp_intel::mode>::VL; @@ -1135,6 +1139,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( const int * _noalias const cnumneigh, const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, + const int * _noalias const ilist, const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, @@ -1155,12 +1160,12 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( for (ii = iifrom; ii < iito; ii++) { // Right now this loop is scalar, to allow for the compiler to do // its prefetching magic. - int i = ii; + int i = ilist[ii]; int w_i = x[i].w; flt_t x_i = x[i].x; flt_t y_i = x[i].y; flt_t z_i = x[i].z; - int jlist_off_i = cnumneigh[i]; + int jlist_off_i = cnumneigh[ii]; int jnum = numneigh[ii]; for (jj = 0; jj < jnum; jj++) { int j = firstneigh[jlist_off_i + jj] & NEIGHMASK; @@ -1173,7 +1178,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (rsq < cutsq) { is[compress_idx] = ii; js[compress_idx] = j; - if (jj < numneighhalf[i]) + if (jj < numneighhalf[ii]) repulsive_flag[compress_idx] = 1; compress_idx += 1; } diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index 962202e228..5dd0da1939 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -157,7 +157,7 @@ void VerletLRTIntel::setup(int flag) pthread_create(&_kspace_thread, &_kspace_attr, &VerletLRTIntel::k_launch_loop, this); #elif defined(_LMP_INTEL_LRT_11) - std::thread kspace_thread; + std::thread _kspace_thread; if (kspace_compute_flag) _kspace_thread=std::thread([=]{ _intel_kspace->compute_first(eflag, vflag); }); diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h index 7380cc6376..6c44ba7e57 100644 --- a/src/USER-INTEL/verlet_lrt_intel.h +++ b/src/USER-INTEL/verlet_lrt_intel.h @@ -24,14 +24,20 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel) #include "pppm_intel.h" #ifdef LMP_INTEL_USELRT -#ifdef LMP_INTEL_LRT11 -#define _LMP_INTEL_LRT_11 -#include <thread> -#else -#define _LMP_INTEL_LRT_PTHREAD -#include <pthread.h> -#endif - + #if defined(LMP_INTEL_LRT11) || defined(__APPLE__) + #if __cplusplus > 199711L + #define _LMP_INTEL_LRT_11 + #include <thread> + #else + #undef LMP_INTEL_USELRT + #ifdef LMP_INTEL_LRT11 + #error C++11 support required for LMP_INTEL_LRT11 define + #endif + #endif + #else + #define _LMP_INTEL_LRT_PTHREAD + #include <pthread.h> + #endif #endif namespace LAMMPS_NS { -- GitLab