diff --git a/doc/src/Speed_intel.txt b/doc/src/Speed_intel.txt index bf078fb57652b747c99b17d12d4f441618e58d24..d35a311f460f91b3a9732fa19c6ad1f3f3df3d4a 100644 --- a/doc/src/Speed_intel.txt +++ b/doc/src/Speed_intel.txt @@ -499,7 +499,7 @@ MPI task. When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles that require skip lists for neighbor builds cannot be offloaded. Using "hybrid/overlay"_pair_hybrid.html is allowed. Only one intel -accelerated style may be used with hybrid styles. +accelerated style may be used with hybrid styles when offloading. "Special_bonds"_special_bonds.html exclusion lists are not currently supported with offload, however, the same effect can often be accomplished by setting cutoffs for excluded atom types to 0. None of diff --git a/src/USER-INTEL/angle_harmonic_intel.cpp b/src/USER-INTEL/angle_harmonic_intel.cpp index 5777dad9479957be52f9496435a6fdf845d828ff..6d8901a5ee1141f563e954c8781cccbf96d0948c 100644 --- a/src/USER-INTEL/angle_harmonic_intel.cpp +++ b/src/USER-INTEL/angle_harmonic_intel.cpp @@ -314,7 +314,7 @@ void AngleHarmonicIntel::init_style() template <class flt_t, class acc_t> void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> * /*buffers*/) + IntelBuffers<flt_t,acc_t> * /*buffers*/) { const int bp1 = atom->nangletypes + 1; fc.set_ntypes(bp1,memory); diff --git a/src/USER-INTEL/bond_fene_intel.cpp b/src/USER-INTEL/bond_fene_intel.cpp index 9d54269c0563cb4134fdd278eb123ebc66ace8b4..bff3722a44a06ee657f19479ad7b28452567bb97 100644 --- a/src/USER-INTEL/bond_fene_intel.cpp +++ b/src/USER-INTEL/bond_fene_intel.cpp @@ -291,7 +291,7 @@ void BondFENEIntel::init_style() template <class flt_t, class acc_t> void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> * /*buffers*/) + IntelBuffers<flt_t,acc_t> * /*buffers*/) { const int bp1 = atom->nbondtypes + 1; fc.set_ntypes(bp1,memory); diff --git a/src/USER-INTEL/dihedral_opls_intel.cpp b/src/USER-INTEL/dihedral_opls_intel.cpp index ef95cc75bb1bf998418152d47e994edbeb35f976..7a60b62cae62fc64c5f9d4f1432bfe1cbbe90dd9 100644 --- a/src/USER-INTEL/dihedral_opls_intel.cpp +++ b/src/USER-INTEL/dihedral_opls_intel.cpp @@ -416,7 +416,7 @@ void DihedralOPLSIntel::init_style() template <class flt_t, class acc_t> void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> * /*buffers*/) + IntelBuffers<flt_t,acc_t> * /*buffers*/) { const int bp1 = atom->ndihedraltypes + 1; fc.set_ntypes(bp1,memory); diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 37212928dfdc5333a50c7fd3daa3b373c9bc70b6..e0865f043164a21a88e1aaee5115b72a6080c9ec 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -65,6 +65,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) _nbor_pack_width = 1; _three_body_neighbor = 0; + _hybrid_nonpair = 0; _precision_mode = PREC_MODE_MIXED; _offload_balance = -1.0; @@ -266,8 +267,7 @@ FixIntel::~FixIntel() double *time1 = off_watch_pair(); double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); - if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + if (_offload_balance != 0.0) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2,overflow:alloc_if(0) free_if(1)) } @@ -314,34 +314,63 @@ void FixIntel::init() int nstyles = 0; if (force->pair_match("hybrid", 1) != NULL) { + _pair_hybrid_flag = 1; PairHybrid *hybrid = (PairHybrid *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) nstyles++; + if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute) + error->all(FLERR, + "Intel package requires fdotr virial with newton on."); } else if (force->pair_match("hybrid/overlay", 1) != NULL) { + _pair_hybrid_flag = 1; PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) nstyles++; - else - force->pair->no_virial_fdotr_compute = 1; + if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute) + error->all(FLERR, + "Intel package requires fdotr virial with newton on."); + } else + _pair_hybrid_flag = 0; + + if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2; + else if (force->newton_pair == 0) _pair_hybrid_flag = 0; + + _pair_hybrid_zero = 0; + _zero_master = 0; + + if (_pair_hybrid_flag && _hybrid_nonpair) + if (_pair_hybrid_flag > 1 || force->newton_pair == 0) + _pair_hybrid_zero = 1; + _hybrid_nonpair = 0; + + #ifdef _LMP_INTEL_OFFLOAD + if (offload_balance() != 0.0) { + _pair_hybrid_zero = 0; + if (force->newton_pair == 0) _pair_hybrid_flag = 0; + if (nstyles > 1) + error->all(FLERR, + "Currently, cannot offload more than one intel style with hybrid."); } - if (nstyles > 1) - error->all(FLERR, - "Currently, cannot use more than one intel style with hybrid."); + #endif check_neighbor_intel(); + int off_mode = 0; if (_offload_balance != 0.0) off_mode = 1; if (_precision_mode == PREC_MODE_SINGLE) { _single_buffers->zero_ev(); _single_buffers->grow_ncache(off_mode,_nthreads); + _single_buffers->free_list_ptrs(); } else if (_precision_mode == PREC_MODE_MIXED) { _mixed_buffers->zero_ev(); _mixed_buffers->grow_ncache(off_mode,_nthreads); + _mixed_buffers->free_list_ptrs(); } else { _double_buffers->zero_ev(); _double_buffers->grow_ncache(off_mode,_nthreads); + _double_buffers->free_list_ptrs(); } _need_reduce = 0; @@ -349,7 +378,7 @@ void FixIntel::init() /* ---------------------------------------------------------------------- */ -void FixIntel::setup(int /*vflag*/) +void FixIntel::setup(int vflag) { if (neighbor->style != Neighbor::BIN) error->all(FLERR, @@ -395,8 +424,7 @@ void FixIntel::pair_init_check(const bool cdmessage) double *time1 = off_watch_pair(); double *time2 = off_watch_neighbor(); int *overflow = get_off_overflow_flag(); - if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL && - overflow != NULL) { + if (_offload_balance !=0.0) { #pragma offload_transfer target(mic:_cop) \ nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \ in(overflow:length(5) alloc_if(1) free_if(0)) @@ -419,6 +447,21 @@ void FixIntel::pair_init_check(const bool cdmessage) #endif } + #ifndef LMP_INTEL_NBOR_COMPAT + if (force->pair->manybody_flag && atom->molecular) { + int flag = 0; + if (atom->nbonds > 0 && force->special_lj[1] == 0.0 && + force->special_coul[1] == 0.0) flag = 1; + if (atom->nangles > 0 && force->special_lj[2] == 0.0 && + force->special_coul[2] == 0.0) flag = 1; + if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 && + force->special_coul[3] == 0.0) flag = 1; + if (flag) + error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond" + "exclusions with Intel"); + } + #endif + int need_tag = 0; if (atom->molecular) need_tag = 1; @@ -477,11 +520,13 @@ void FixIntel::bond_init_check() if (force->pair_match("/intel", 0) != NULL) intel_pair = 1; else if (force->pair_match("hybrid", 1) != NULL) { + _hybrid_nonpair = 1; PairHybrid *hybrid = (PairHybrid *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) intel_pair = 1; } else if (force->pair_match("hybrid/overlay", 1) != NULL) { + _hybrid_nonpair = 1; PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) @@ -501,11 +546,13 @@ void FixIntel::kspace_init_check() if (force->pair_match("/intel", 0) != NULL) intel_pair = 1; else if (force->pair_match("hybrid", 1) != NULL) { + _hybrid_nonpair = 1; PairHybrid *hybrid = (PairHybrid *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) intel_pair = 1; } else if (force->pair_match("hybrid/overlay", 1) != NULL) { + _hybrid_nonpair = 1; PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (strstr(hybrid->keywords[i], "/intel") != NULL) @@ -522,51 +569,60 @@ void FixIntel::check_neighbor_intel() { #ifdef _LMP_INTEL_OFFLOAD _full_host_list = 0; - #endif - const int nrequest = neighbor->nrequest; + const int nrequest = neighbor->nrequest; for (int i = 0; i < nrequest; ++i) { - #ifdef _LMP_INTEL_OFFLOAD if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) { _full_host_list = 1; _offload_noghost = 0; } - #endif + if (neighbor->requests[i]->skip && _offload_balance != 0.0) + error->all(FLERR, "Cannot yet use hybrid styles with Intel offload."); // avoid flagging a neighbor list as both USER-INTEL and USER-OMP if (neighbor->requests[i]->intel) neighbor->requests[i]->omp = 0; - - if (neighbor->requests[i]->skip) - error->all(FLERR, "Hybrid styles with Intel package are unsupported."); } + #else + // avoid flagging a neighbor list as both USER-INTEL and USER-OMP + const int nrequest = neighbor->nrequest; + for (int i = 0; i < nrequest; ++i) + if (neighbor->requests[i]->intel) + neighbor->requests[i]->omp = 0; + #endif } /* ---------------------------------------------------------------------- */ -void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/) +void FixIntel::_sync_main_arrays(const int prereverse) { + if (!prereverse) _zero_master = 1; + int done_this_step = prereverse; + if (_pair_hybrid_zero == 0) done_this_step = 1; if (_force_array_m != 0) { if (_need_reduce) { reduce_results(&_force_array_m[0].x); _need_reduce = 0; } add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0); - _force_array_m = 0; + if (done_this_step) _force_array_m = 0; + else _ev_array_d = 0; } else if (_force_array_d != 0) { if (_need_reduce) { reduce_results(&_force_array_d[0].x); _need_reduce = 0; } add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0); - _force_array_d = 0; + if (done_this_step) _force_array_d = 0; + else _ev_array_d = 0; } else if (_force_array_s != 0) { if (_need_reduce) { reduce_results(&_force_array_s[0].x); _need_reduce = 0; } add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0); - _force_array_s = 0; + if (done_this_step) _force_array_s = 0; + else _ev_array_s = 0; } #ifdef _LMP_INTEL_OFFLOAD @@ -576,6 +632,13 @@ void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/) /* ---------------------------------------------------------------------- */ +void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/) +{ + _sync_main_arrays(1); +} + +/* ---------------------------------------------------------------------- */ + template <class acc_t> void FixIntel::reduce_results(acc_t * _noalias const f_scalar) { @@ -657,7 +720,7 @@ template <class ft, class acc_t> void FixIntel::add_results(const ft * _noalias const f_in, const acc_t * _noalias const ev_global, const int eatom, const int vatom, - const int /*offload*/) { + const int offload) { start_watch(TIME_PACK); int f_length; #ifdef _LMP_INTEL_OFFLOAD diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h index e6e09be503dc0d69840e251c0abc20a103890a04..81dd0b5d975886d6c6a5444b6bf666543ffc9a56 100644 --- a/src/USER-INTEL/fix_intel.h +++ b/src/USER-INTEL/fix_intel.h @@ -74,11 +74,12 @@ class FixIntel : public Fix { inline int nbor_pack_width() const { return _nbor_pack_width; } inline void nbor_pack_width(const int w) { _nbor_pack_width = w; } inline int three_body_neighbor() { return _three_body_neighbor; } - inline void three_body_neighbor(const int /*i*/) { _three_body_neighbor = 1; } + inline void three_body_neighbor(const int i) { _three_body_neighbor = i; } inline int need_zero(const int tid) { if (_need_reduce == 0 && tid > 0) return 1; - return 0; + else if (_zero_master && tid == 0) { _zero_master = 0; return 1; } + else return 0; } inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; } inline int lrt() { @@ -100,7 +101,10 @@ class FixIntel : public Fix { IntelBuffers<double,double> *_double_buffers; int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor; - + int _pair_hybrid_flag; + // These should be removed in subsequent update w/ simpler hybrid arch + int _pair_hybrid_zero, _hybrid_nonpair, _zero_master; + public: inline int* get_overflow_flag() { return _overflow_flag; } inline int* get_off_overflow_flag() { return _off_overflow_flag; } @@ -210,6 +214,8 @@ class FixIntel : public Fix { _alignvar(double _stopwatch_offload_neighbor[1],64); _alignvar(double _stopwatch_offload_pair[1],64); + void _sync_main_arrays(const int prereverse); + template <class ft> void reduce_results(ft * _noalias const f_in); @@ -238,7 +244,7 @@ class FixIntel : public Fix { /* ---------------------------------------------------------------------- */ -void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall, +void FixIntel::get_buffern(const int offload, int &nlocal, int &nall, int &minlocal) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers) { @@ -273,7 +279,7 @@ void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall, /* ---------------------------------------------------------------------- */ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, - double *ev_in, const int /*offload*/, + double *ev_in, const int offload, const int eatom, const int vatom, const int rflag) { #ifdef _LMP_INTEL_OFFLOAD @@ -282,6 +288,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, _off_results_vatom = vatom; _off_force_array_d = f_in; _off_ev_array_d = ev_in; + if (_pair_hybrid_flag && force->pair->fdotr_is_set()) + _sync_main_arrays(1); return; } #endif @@ -296,12 +304,15 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in, if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + + if (_pair_hybrid_flag > 1 || + (_pair_hybrid_flag && force->pair->fdotr_is_set())) _sync_main_arrays(0); } /* ---------------------------------------------------------------------- */ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, - double *ev_in, const int /*offload*/, + double *ev_in, const int offload, const int eatom, const int vatom, const int rflag) { #ifdef _LMP_INTEL_OFFLOAD @@ -310,6 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, _off_results_vatom = vatom; _off_force_array_m = f_in; _off_ev_array_d = ev_in; + if (_pair_hybrid_flag && force->pair->fdotr_is_set()) + _sync_main_arrays(1); return; } #endif @@ -324,12 +337,16 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in, if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + + if (_pair_hybrid_flag > 1 || + (_pair_hybrid_flag && force->pair->fdotr_is_set())) + _sync_main_arrays(0); } /* ---------------------------------------------------------------------- */ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, - float *ev_in, const int /*offload*/, + float *ev_in, const int offload, const int eatom, const int vatom, const int rflag) { #ifdef _LMP_INTEL_OFFLOAD @@ -338,6 +355,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, _off_results_vatom = vatom; _off_force_array_s = f_in; _off_ev_array_s = ev_in; + if (_pair_hybrid_flag && force->pair->fdotr_is_set()) + _sync_main_arrays(1); return; } #endif @@ -352,6 +371,10 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in, if (_overflow_flag[LMP_OVERFLOW]) error->one(FLERR, "Neighbor list overflow, boost neigh_modify one"); + + if (_pair_hybrid_flag > 1 || + (_pair_hybrid_flag && force->pair->fdotr_is_set())) + _sync_main_arrays(0); } /* ---------------------------------------------------------------------- */ @@ -487,16 +510,16 @@ The compiler version used to build LAMMPS is not supported when using offload to a coprocessor. There could be performance or correctness issues. Please use 14.0.1.106 or 15.1.133 or later. -E: Currently, cannot use more than one intel style with hybrid. +E: Currently, cannot offload more than one intel style with hybrid. -Currently, hybrid pair styles can only use the intel suffix for one of the -pair styles. +Currently, when using offload, hybrid pair styles can only use the intel +suffix for one of the pair styles. -E: Cannot yet use hybrid styles with Intel package. +E: Cannot yet use hybrid styles with Intel offload. -The hybrid pair style configuration is not yet supported by the Intel -package. Support is limited to hybrid/overlay or a hybrid style that does -not require a skip list. +The hybrid pair style configuration is not yet supported when using offload +within the Intel package. Support is limited to hybrid/overlay or a hybrid +style that does not require a skip list. W: Leaving a core/node free can improve performance for offload @@ -538,4 +561,16 @@ E: Too few atoms for load balancing offload. When using offload to a coprocessor, each MPI task must have at least 2 atoms throughout the simulation. +E: Intel package requires fdotr virial with newton on. + +This error can occur with a hybrid pair style that mixes styles that are +incompatible with the newton pair setting turned on. Try turning the +newton pair setting off. + +E: Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond exclusions with Intel + +When using a manybody pair style, bonds/angles/dihedrals, and special_bond +exclusions, LAMMPS should be built with the above compile flag for compatible +results. + */ diff --git a/src/USER-INTEL/improper_cvff_intel.cpp b/src/USER-INTEL/improper_cvff_intel.cpp index c20c004657b1b53ce16c55dd6a41a87f37d708f4..03bd134b401f67e4aec8603bd858e7dc9716a597 100644 --- a/src/USER-INTEL/improper_cvff_intel.cpp +++ b/src/USER-INTEL/improper_cvff_intel.cpp @@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style() template <class flt_t, class acc_t> void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc, - IntelBuffers<flt_t,acc_t> * /*buffers*/) + IntelBuffers<flt_t,acc_t> * /*buffers*/) { const int bp1 = atom->nimpropertypes + 1; fc.set_ntypes(bp1,memory); diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp index 05176e1fd1e1bf4bc629dc6d8b40275b4bed81bb..b7026f90b762793124c42a4c474db8e09b3756a9 100644 --- a/src/USER-INTEL/intel_buffers.cpp +++ b/src/USER-INTEL/intel_buffers.cpp @@ -24,7 +24,9 @@ using namespace LAMMPS_NS; template <class flt_t, class acc_t> IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) : lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0), - _buf_size(0), _buf_local_size(0) { + _buf_size(0), _buf_local_size(0), _n_list_ptrs(1), _max_list_ptrs(4) { + _neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs]; + _neigh_list_ptrs[0].cnumneigh = 0; _list_alloc_atoms = 0; _ntypes = 0; _off_map_listlocal = 0; @@ -55,6 +57,7 @@ IntelBuffers<flt_t, acc_t>::~IntelBuffers() free_all_nbor_buffers(); free_ccache(); set_ntypes(0); + delete []_neigh_list_ptrs; } /* ---------------------------------------------------------------------- */ @@ -109,7 +112,7 @@ void IntelBuffers<flt_t, acc_t>::free_buffers() template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, const int nthreads, - const int /*offload_end*/) + const int offload_end) { free_buffers(); _buf_size = static_cast<double>(nall) * 1.1 + 1; @@ -186,11 +189,9 @@ void IntelBuffers<flt_t, acc_t>::free_nmax() const int * tag = _off_map_tag; const int * special = _off_map_special; const int * nspecial = _off_map_nspecial; - if (tag != 0 && special != 0 && nspecial !=0) { - #pragma offload_transfer target(mic:_cop) \ - nocopy(tag:alloc_if(0) free_if(1)) \ - nocopy(special,nspecial:alloc_if(0) free_if(1)) - } + #pragma offload_transfer target(mic:_cop) \ + nocopy(tag:alloc_if(0) free_if(1)) \ + nocopy(special,nspecial:alloc_if(0) free_if(1)) _off_map_nmax = 0; _host_nmax = 0; } @@ -200,7 +201,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax() /* ---------------------------------------------------------------------- */ template <class flt_t, class acc_t> -void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int /*offload_end*/) +void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int offload_end) { #ifdef _LMP_INTEL_OFFLOAD free_nmax(); @@ -243,46 +244,117 @@ template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::free_list_local() { if (_off_map_listlocal > 0) { - int * cnumneigh = _cnumneigh; + if (_neigh_list_ptrs[0].cnumneigh) { + int * cnumneigh = _neigh_list_ptrs[0].cnumneigh; + _neigh_list_ptrs[0].cnumneigh = 0; + #ifdef _LMP_INTEL_OFFLOAD + if (_off_map_ilist != NULL) { + #pragma offload_transfer target(mic:_cop) \ + nocopy(cnumneigh:alloc_if(0) free_if(1)) + } + #endif + lmp->memory->destroy(cnumneigh); + } + #ifdef _LMP_INTEL_OFFLOAD if (_off_map_ilist != NULL) { const int * ilist = _off_map_ilist; const int * numneigh = _off_map_numneigh; + const int ** firstneigh = (const int **)_off_map_firstneigh; _off_map_ilist = NULL; - if (numneigh != 0 && ilist != 0) { - #pragma offload_transfer target(mic:_cop) \ - nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1)) - } + #pragma offload_transfer target(mic:_cop) \ + nocopy(ilist,firstneigh,numneigh:alloc_if(0) free_if(1)) } #endif - lmp->memory->destroy(cnumneigh); _off_map_listlocal = 0; } } /* ---------------------------------------------------------------------- */ +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::free_list_ptrs() +{ + for (int list_num = 1; list_num < _n_list_ptrs; list_num++) { + if (_neigh_list_ptrs[list_num].size) { + lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh); + lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf); + } + _neigh_list_ptrs[list_num].size = 0; + _neigh_list_ptrs[list_num].list_ptr = 0; + } + _n_list_ptrs = 1; +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void IntelBuffers<flt_t, acc_t>::grow_data3(NeighList *list, + int *&numneighhalf, + int *&cnumneigh) +{ + const int size = list->get_maxlocal(); + int list_num; + for (list_num = 0; list_num < _n_list_ptrs; list_num++) + if (_neigh_list_ptrs[list_num].list_ptr == (void*)list) break; + if (list_num == _n_list_ptrs) { + if (_n_list_ptrs == _max_list_ptrs) { + _max_list_ptrs *= 2; + IntelNeighListPtrs *new_list = new IntelNeighListPtrs[_max_list_ptrs]; + for (int i = 0; i < _n_list_ptrs; i++) new_list[i] = _neigh_list_ptrs[i]; + delete []_neigh_list_ptrs; + _neigh_list_ptrs = new_list; + } + _neigh_list_ptrs[list_num].list_ptr = (void *)list; + _neigh_list_ptrs[list_num].size = 0; + _n_list_ptrs++; + } + if (size > _neigh_list_ptrs[list_num].size) { + if (_neigh_list_ptrs[list_num].size) { + lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh); + lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf); + } + lmp->memory->create(_neigh_list_ptrs[list_num].cnumneigh, size, + "_cnumneigh"); + lmp->memory->create(_neigh_list_ptrs[list_num].numneighhalf, size, + "_cnumneigh"); + _neigh_list_ptrs[list_num].size = size; + } + numneighhalf = _neigh_list_ptrs[list_num].numneighhalf; + cnumneigh = _neigh_list_ptrs[list_num].cnumneigh; +} + +/* ---------------------------------------------------------------------- */ + template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list, - const int /*offload_end*/) + const int three_body, + const int offload_end) { free_list_local(); int size = list->get_maxlocal(); - lmp->memory->create(_cnumneigh, size, "_cnumneigh"); _off_map_listlocal = size; + if (three_body) + lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, size, "_cnumneigh"); #ifdef _LMP_INTEL_OFFLOAD if (offload_end > 0) { + int tb_size = size; + if (three_body == 0) { + lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, 16, "_cnumneigh"); + tb_size = 16; + } + int ** firstneigh = list->firstneigh; int * numneigh = list->numneigh; int * ilist = list->ilist; - int * cnumneigh = _cnumneigh; - if (cnumneigh != 0) { - #pragma offload_transfer target(mic:_cop) \ - nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ - nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ - nocopy(cnumneigh:length(size) alloc_if(1) free_if(0)) - } + int * cnumneigh = _neigh_list_ptrs[0].cnumneigh; + #pragma offload_transfer target(mic:_cop) \ + nocopy(ilist:length(size) alloc_if(1) free_if(0)) \ + nocopy(firstneigh:length(size) alloc_if(1) free_if(0)) \ + nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \ + nocopy(cnumneigh:length(tb_size) alloc_if(1) free_if(0)) _off_map_ilist = ilist; + _off_map_firstneigh = firstneigh; _off_map_numneigh = numneigh; } #endif @@ -313,7 +385,7 @@ template <class flt_t, class acc_t> void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList * /*list*/, const int nlocal, const int nthreads, - const int /*offload_end*/, + const int offload_end, const int pack_width) { free_nbor_list(); @@ -382,7 +454,7 @@ void IntelBuffers<flt_t, acc_t>::free_ccache() /* ---------------------------------------------------------------------- */ template <class flt_t, class acc_t> -void IntelBuffers<flt_t, acc_t>::grow_ccache(const int /*off_flag*/, +void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag, const int nthreads, const int width) { @@ -481,7 +553,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache() /* ---------------------------------------------------------------------- */ template <class flt_t, class acc_t> -void IntelBuffers<flt_t, acc_t>::grow_ncache(const int /*off_flag*/, +void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag, const int nthreads) { const int nsize = get_max_nbors() * 3; @@ -576,12 +648,12 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes, if (_ntypes > 0) { #ifdef _LMP_INTEL_OFFLOAD flt_t * cutneighsqo = _cutneighsq[0]; - if (_off_threads > 0 && cutneighsqo != 0) { + if (_off_threads > 0) { #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighsqo:alloc_if(0) free_if(1)) } flt_t * cutneighghostsqo; - if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) { + if (_cutneighghostsq && _off_threads > 0) { cutneighghostsqo = _cutneighghostsq[0]; #pragma offload_transfer target(mic:_cop) \ nocopy(cutneighghostsqo:alloc_if(0) free_if(1)) @@ -637,6 +709,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads) tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int); tmem += _ntypes * _ntypes * sizeof(int); + tmem += _buf_local_size + (_n_list_ptrs - 1) * _buf_local_size * 2; + return tmem; } diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h index e8225a4a30f42f9d9012452fbf9bdfbebdce5195..276b87ec5f2a2d1ff86302a9d257bd1aec6ec7a9 100644 --- a/src/USER-INTEL/intel_buffers.h +++ b/src/USER-INTEL/intel_buffers.h @@ -33,12 +33,20 @@ namespace LAMMPS_NS { #define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t #define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t +struct IntelNeighListPtrs { + void * list_ptr; + int *cnumneigh; + int *numneighhalf; + int size; +}; + // May not need a separate force array for mixed/double template <class flt_t, class acc_t> class IntelBuffers { public: typedef struct { flt_t x,y,z; int w; } atom_t; typedef struct { flt_t w,i,j,k; } quat_t; + typedef struct { flt_t x,y; } vec2_t; typedef struct { flt_t x,y,z,w; } vec3_t; typedef struct { flt_t x,y,z,w; } vec4_t; typedef struct { acc_t x,y,z,w; } vec3_acc_t; @@ -62,8 +70,12 @@ class IntelBuffers { void free_buffers(); void free_nmax(); - inline void set_bininfo(int *atombin, int *binpacked) - { _atombin = atombin; _binpacked = binpacked; } + inline void set_bininfo(int *atombin, int *binpacked) { + _atombin = atombin; + _binpacked = binpacked; + _neigh_list_ptrs[0].numneighhalf = atombin; + } + inline void grow(const int nall, const int nlocal, const int nthreads, const int offload_end) { if (nall >= _buf_size || nlocal >= _buf_local_size) @@ -79,18 +91,22 @@ class IntelBuffers { free_nmax(); free_list_local(); free_ncache(); + free_list_ptrs(); } inline void grow_list(NeighList *list, const int nlocal, const int nthreads, - const int offload_end, const int pack_width=1) { - grow_list_local(list, offload_end); + const int three_body, const int offload_end, + const int pack_width=1) { + grow_list_local(list, three_body, offload_end); grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } void free_list_local(); - inline void grow_list_local(NeighList *list, const int offload_end) { + inline void grow_list_local(NeighList *list, const int three_body, + const int offload_end) { + _neigh_list_ptrs[0].list_ptr = (void *)list; if (list->get_maxlocal() > _off_map_listlocal) - _grow_list_local(list, offload_end); + _grow_list_local(list, three_body, offload_end); } void free_ccache(); @@ -133,26 +149,36 @@ class IntelBuffers { _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width); } - void set_ntypes(const int ntypes, const int use_ghost_cut = 0); + void set_ntypes(const int ntypes, const int use_ghost_cut = 1); - inline int * firstneigh(const NeighList * /*list*/) { return _list_alloc; } - inline int * cnumneigh(const NeighList * /*list*/) { return _cnumneigh; } + inline int * intel_list(const NeighList * /*list*/) { return _list_alloc; } inline int * get_atombin() { return _atombin; } inline int * get_binpacked() { return _binpacked; } + inline int * cnumneigh() { return _neigh_list_ptrs[0].cnumneigh; } + inline void get_list_data3(const NeighList *list, int *&numneighhalf, + int *&cnumneigh) { + for (int i = 0; i < _n_list_ptrs; i++) + if ((void *)list == _neigh_list_ptrs[i].list_ptr) { + numneighhalf = _neigh_list_ptrs[i].numneighhalf; + cnumneigh = _neigh_list_ptrs[i].cnumneigh; + } + } + void grow_data3(NeighList *list, int *&numneighhalf, int *&cnumneigh); + void free_list_ptrs(); - inline atom_t * get_x(const int /*offload*/ = 1) { + inline atom_t * get_x(const int offload = 1) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers && offload == 0) return _host_x; #endif return _x; } - inline flt_t * get_q(const int /*offload*/ = 1) { + inline flt_t * get_q(const int offload = 1) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers && offload == 0) return _host_q; #endif return _q; } - inline quat_t * get_quat(const int /*offload*/ = 1) { + inline quat_t * get_quat(const int offload = 1) { #ifdef _LMP_INTEL_OFFLOAD if (_separate_buffers && offload == 0) return _host_quat; #endif @@ -298,6 +324,9 @@ class IntelBuffers { int _list_alloc_atoms; int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked; + IntelNeighListPtrs *_neigh_list_ptrs; + int _n_list_ptrs, _max_list_ptrs; + flt_t **_cutneighsq, **_cutneighghostsq; int _ntypes; @@ -325,7 +354,7 @@ class IntelBuffers { int _off_map_nmax, _cop, _off_ccache, _off_ncache; int *_off_map_ilist; int *_off_map_special, *_off_map_nspecial, *_off_map_tag; - int *_off_map_numneigh; + int **_off_map_firstneigh, *_off_map_numneigh; bool _off_list_alloc; #endif @@ -336,7 +365,8 @@ class IntelBuffers { void _grow(const int nall, const int nlocal, const int nthreads, const int offload_end); void _grow_nmax(const int offload_end); - void _grow_list_local(NeighList *list, const int offload_end); + void _grow_list_local(NeighList *list, const int three_body, + const int offload_end); void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads, const int offload_end, const int pack_width); }; diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h index 2f1b02a49f48826c3cf8841b7643972ec9914354..178a20c6e13396c0e2b9c00ab3814f7aa6a6e8d5 100644 --- a/src/USER-INTEL/intel_preprocess.h +++ b/src/USER-INTEL/intel_preprocess.h @@ -20,6 +20,9 @@ #if (__INTEL_COMPILER_BUILD_DATE > 20160720) #define LMP_INTEL_USE_SIMDOFF #endif +#pragma warning (disable:3948) +#pragma warning (disable:3949) +#pragma warning (disable:13200) #endif #ifdef __INTEL_OFFLOAD @@ -606,7 +609,7 @@ inline double MIC_Wtime() { if (newton) \ f_stride = buffers->get_stride(nall); \ else \ - f_stride = buffers->get_stride(inum); \ + f_stride = buffers->get_stride(nlocal); \ } #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \ diff --git a/src/USER-INTEL/intel_simd.h b/src/USER-INTEL/intel_simd.h index e9d5d929953682276884274fe6d90285a88bf5eb..75fc9828b96f0f99cfcfac86ce384c9f535efe73 100644 --- a/src/USER-INTEL/intel_simd.h +++ b/src/USER-INTEL/intel_simd.h @@ -213,6 +213,12 @@ namespace ip_simd { _MM_SCALE_8); } + inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p, + const SIMD_int &i) { + return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p, + _MM_SCALE_4); + } + inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p, const SIMD_int &i) { return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p, diff --git a/src/USER-INTEL/nbin_intel.cpp b/src/USER-INTEL/nbin_intel.cpp index 789fa35b429448a5e50651b206d5baea7364dd63..8359587eb28c4ea46592db8022abb8b460209749 100644 --- a/src/USER-INTEL/nbin_intel.cpp +++ b/src/USER-INTEL/nbin_intel.cpp @@ -192,7 +192,6 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) { // ---------- Bin Atoms ------------- _fix->start_watch(TIME_HOST_NEIGHBOR); - //const ATOM_T * _noalias const x = buffers->get_x(); int * _noalias const atombin = this->_atombin; int * _noalias const binpacked = this->_binpacked; diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp index 74a04f0e7de9dd97197049c7ac9227d2b8ee2516..5149b26f2f82019149589a2c307f58fbcdbc82cd 100644 --- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp @@ -86,7 +86,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list, // only uses offload_end_neighbor to check whether we are doing offloading // at all, no need to correct this later - buffers->grow_list(list, nall, comm->nthreads, off_end, + buffers->grow_list(list, nall, comm->nthreads, 0, off_end, _fix->nbor_pack_width()); int need_ic = 0; @@ -106,7 +106,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list, /* ---------------------------------------------------------------------- */ template<class flt_t, class acc_t, int need_ic> -void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, +void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list, IntelBuffers<flt_t,acc_t> * buffers, const int pstart, const int pend) { if (pend-pstart == 0) return; @@ -116,7 +116,8 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, const int aend = nall; const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); + int * _noalias const intel_list = buffers->intel_list(list); + int ** _noalias const firstneigh = list->firstneigh; const int e_nall = nall_t; const int molecular = atom->molecular; @@ -140,7 +141,6 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, int * _noalias const ilist = list->ilist; int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); const int nstencil = this->nstencil; const int * _noalias const stencil = this->stencil; const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; @@ -228,7 +228,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ + in(intel_list:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ in(atombin:length(aend) alloc_if(0) free_if(0)) \ @@ -295,7 +295,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, int pack_offset = maxnbors; int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; + int *neighptr = intel_list + ct; const int obound = pack_offset + maxnbors * 2; const int toffs = tid * ncache_stride; @@ -528,12 +528,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, if (ns > maxnbors) *overflow = 1; ilist[i] = i; - cnumneigh[i] = ct; + firstneigh[i] = intel_list + ct; numneigh[i] = ns; ct += ns; IP_PRE_edge_align(ct, sizeof(int)); - neighptr = firstneigh + ct; + neighptr = intel_list + ct; if (ct + obound > list_size) { if (i < ito - 1) { *overflow = 1; @@ -564,13 +564,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, if (offload) { _fix->stop_watch(TIME_OFFLOAD_LATENCY); _fix->start_watch(TIME_HOST_NEIGHBOR); + firstneigh[0] = intel_list; for (int n = 0; n < aend; n++) { ilist[n] = n; numneigh[n] = 0; } } else { - for (int i = 0; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; if (separate_buffers) { _fix->start_watch(TIME_PACK); _fix->set_neighbor_host_sizes(); @@ -581,10 +580,5 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list, _fix->stop_watch(TIME_PACK); } } - #else - #pragma vector aligned - #pragma simd - for (int i = 0; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; #endif } diff --git a/src/USER-INTEL/npair_full_bin_intel.cpp b/src/USER-INTEL/npair_full_bin_intel.cpp index 60b912d796146d2c9a7f7a15d116afdc430c20ff..4ef84be75ff0cffadce6e5afb54408e58c65dd98 100644 --- a/src/USER-INTEL/npair_full_bin_intel.cpp +++ b/src/USER-INTEL/npair_full_bin_intel.cpp @@ -70,7 +70,8 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int offload_noghost = _fix->offload_noghost(); #endif - buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end, + buffers->grow_list(list, atom->nlocal, comm->nthreads, + _fix->three_body_neighbor(), off_end, _fix->nbor_pack_width()); int need_ic = 0; diff --git a/src/USER-INTEL/npair_half_bin_newton_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_intel.cpp index 8c024a46046827985db3f8047446ad06df61eba1..799ba2b57b67e26e88b8a9a959a23d5fbd4a8167 100644 --- a/src/USER-INTEL/npair_half_bin_newton_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_intel.cpp @@ -71,7 +71,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int offload_noghost = _fix->offload_noghost(); #endif - buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end); + buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end); int need_ic = 0; if (atom->molecular) diff --git a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp index 653a95139498dbe1d34b5c8d77ca706664e02c56..1f859cb5f1bc821202ceb80271d4e336c3772fd0 100644 --- a/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp +++ b/src/USER-INTEL/npair_half_bin_newton_tri_intel.cpp @@ -71,7 +71,7 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) { int offload_noghost = _fix->offload_noghost(); #endif - buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end); + buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end); int need_ic = 0; if (atom->molecular) diff --git a/src/USER-INTEL/npair_halffull_newtoff_intel.h b/src/USER-INTEL/npair_halffull_newtoff_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..c4c41a85a105fd453db73473dfbf69ccd80e99d8 --- /dev/null +++ b/src/USER-INTEL/npair_halffull_newtoff_intel.h @@ -0,0 +1,47 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +// Only used for hybrid to generate list for non-intel style. Use +// standard routines. + +#ifdef NPAIR_CLASS + +NPairStyle(halffull/newtoff/intel, + NPairHalffullNewtoff, + NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF | + NP_ORTHO | NP_TRI | NP_INTEL) + +NPairStyle(halffull/newtoff/skip/intel, + NPairHalffullNewtoff, + NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF | + NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL) + +NPairStyle(halffull/newtoff/ghost/intel, + NPairHalffullNewtoff, + NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF | + NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL) + +NPairStyle(halffull/newtoff/skip/ghost/intel, + NPairHalffullNewtoff, + NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF | + NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL) + +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/USER-INTEL/npair_halffull_newton_intel.cpp b/src/USER-INTEL/npair_halffull_newton_intel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..93a1a9792392a6ffd89f6fe8e9e41a0d65f836f7 --- /dev/null +++ b/src/USER-INTEL/npair_halffull_newton_intel.cpp @@ -0,0 +1,233 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "npair_halffull_newton_intel.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "modify.h" +#include "molecule.h" +#include "domain.h" +#include "my_page.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) { + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + _fix = static_cast<FixIntel *>(modify->fix[ifix]); +} + +/* ---------------------------------------------------------------------- + build half list from full list + pair stored once if i,j are both owned and i < j + if j is ghost, only store if j coords are "above and to the right" of i + works if full list is a skip list +------------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void NPairHalffullNewtonIntel::build_t(NeighList *list, + IntelBuffers<flt_t,acc_t> *buffers) +{ + const int inum_full = list->listfull->inum; + const int nlocal = atom->nlocal; + const int e_nall = nlocal + atom->nghost; + const ATOM_T * _noalias const x = buffers->get_x(); + int * _noalias const ilist = list->ilist; + int * _noalias const numneigh = list->numneigh; + int ** _noalias const firstneigh = list->firstneigh; + const int * _noalias const ilist_full = list->listfull->ilist; + const int * _noalias const numneigh_full = list->listfull->numneigh; + const int ** _noalias const firstneigh_full = + (const int ** const)list->listfull->firstneigh; + + #if defined(_OPENMP) + #pragma omp parallel + #endif + { + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads); + + // each thread has its own page allocator + MyPage<int> &ipage = list->ipage[tid]; + ipage.reset(); + + // loop over parent full list + for (int ii = ifrom; ii < ito; ii++) { + int n = 0; + int *neighptr = ipage.vget(); + + const int i = ilist_full[ii]; + const flt_t xtmp = x[i].x; + const flt_t ytmp = x[i].y; + const flt_t ztmp = x[i].z; + + // loop over full neighbor list + + const int * _noalias const jlist = firstneigh_full[i]; + const int jnum = numneigh_full[i]; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int joriginal = jlist[jj]; + const int j = joriginal & NEIGHMASK; + int addme = 1; + if (j < nlocal) { + if (i > j) addme = 0; + } else { + if (x[j].z < ztmp) addme = 0; + if (x[j].z == ztmp) { + if (x[j].y < ytmp) addme = 0; + if (x[j].y == ytmp && x[j].x < xtmp) addme = 0; + } + } + if (addme) + neighptr[n++] = joriginal; + } + + ilist[ii] = i; + firstneigh[i] = neighptr; + numneigh[i] = n; + + int pad_end = n; + IP_PRE_neighbor_pad(pad_end, 0); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; n++) + neighptr[n] = e_nall; + + ipage.vgot(n); + if (ipage.status()) + error->one(FLERR,"Neighbor list overflow, boost neigh_modify one"); + } + } + list->inum = inum_full; +} + +/* ---------------------------------------------------------------------- + build half list from full 3-body list + half list is already stored as first part of 3-body list +------------------------------------------------------------------------- */ + +template <class flt_t> +void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf) +{ + const int inum_full = list->listfull->inum; + const int e_nall = atom->nlocal + atom->nghost; + int * _noalias const ilist = list->ilist; + int * _noalias const numneigh = list->numneigh; + int ** _noalias const firstneigh = list->firstneigh; + const int * _noalias const ilist_full = list->listfull->ilist; + const int * _noalias const numneigh_full = numhalf; + const int ** _noalias const firstneigh_full = + (const int ** const)list->listfull->firstneigh; + + int packthreads = 1; + if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads; + + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads); + + // each thread has its own page allocator + MyPage<int> &ipage = list->ipage[tid]; + ipage.reset(); + + // loop over parent full list + for (int ii = ifrom; ii < ito; ii++) { + int n = 0; + int *neighptr = ipage.vget(); + + const int i = ilist_full[ii]; + + // loop over full neighbor list + + const int * _noalias const jlist = firstneigh_full[i]; + const int jnum = numneigh_full[ii]; + + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int joriginal = jlist[jj]; + neighptr[n++] = joriginal; + } + + ilist[ii] = i; + firstneigh[i] = neighptr; + numneigh[i] = n; + + int pad_end = n; + IP_PRE_neighbor_pad(pad_end, 0); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; n++) + neighptr[n] = e_nall; + + ipage.vgot(n); + if (ipage.status()) + error->one(FLERR,"Neighbor list overflow, boost neigh_modify one"); + } + } + list->inum = inum_full; +} + +/* ---------------------------------------------------------------------- */ + +void NPairHalffullNewtonIntel::build(NeighList *list) +{ + if (_fix->three_body_neighbor() == 0) { + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) + build_t(list, _fix->get_mixed_buffers()); + else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) + build_t(list, _fix->get_double_buffers()); + else + build_t(list, _fix->get_single_buffers()); + } else { + int *nhalf, *cnum; + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) { + _fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum); + build_t3<float>(list, nhalf); + } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + _fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum); + build_t3<double>(list, nhalf); + } else { + _fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum); + build_t3<float>(list, nhalf); + } + } +} diff --git a/src/USER-INTEL/npair_halffull_newton_intel.h b/src/USER-INTEL/npair_halffull_newton_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..b3069d80747845f3a8157d0bce4c0ac2e5cd7eab --- /dev/null +++ b/src/USER-INTEL/npair_halffull_newton_intel.h @@ -0,0 +1,72 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef NPAIR_CLASS + +NPairStyle(halffull/newton/intel, + NPairHalffullNewtonIntel, + NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI | + NP_ORTHO | NP_TRI| NP_INTEL) + +NPairStyle(halffull/newton/skip/intel, + NPairHalffullNewtonIntel, + NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI | + NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL) + +#else + +#ifndef LMP_NPAIR_HALFFULL_NEWTON_INTEL_H +#define LMP_NPAIR_HALFFULL_NEWTON_INTEL_H + +#include "npair.h" +#include "fix_intel.h" + +#if defined(_OPENMP) +#include <omp.h> +#endif + + +namespace LAMMPS_NS { + +class NPairHalffullNewtonIntel : public NPair { + public: + NPairHalffullNewtonIntel(class LAMMPS *); + ~NPairHalffullNewtonIntel() {} + void build(class NeighList *); + + protected: + FixIntel *_fix; + + template<class flt_t, class acc_t> + void build_t(NeighList *, IntelBuffers<flt_t,acc_t> *); + + template<class flt_t> + void build_t3(NeighList *, int *); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self explanatory. + +*/ diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp index 6d4529752ab38dd490680069d627c97a0de22446..a1c0785d4c4191312f769a577c2104e5060114ec 100644 --- a/src/USER-INTEL/npair_intel.cpp +++ b/src/USER-INTEL/npair_intel.cpp @@ -52,12 +52,61 @@ NPairIntel::~NPairIntel() { /* ---------------------------------------------------------------------- */ +void NPairIntel::copy_neighbor_info() +{ + NPair::copy_neighbor_info(); + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) + copy_cutsq_info(_fix->get_mixed_buffers()); + else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) + copy_cutsq_info(_fix->get_double_buffers()); + else + copy_cutsq_info(_fix->get_single_buffers()); +} + +/* ---------------------------------------------------------------------- */ + +template <class flt_t, class acc_t> +void NPairIntel::copy_cutsq_info(IntelBuffers<flt_t,acc_t> *buffers) { + int tp1 = atom->ntypes + 1; + int use_ghost_cut = 0; + if (cutneighghostsq) + use_ghost_cut = 1; + buffers->set_ntypes(tp1, use_ghost_cut); + + flt_t **cutneighsqb = buffers->get_cutneighsq(); + for (int i = 1; i <= atom->ntypes; i++) + for (int j = 1; j <= atom->ntypes; j++) + cutneighsqb[i][j] = cutneighsq[i][j]; + + flt_t **cutneighghostsqb; + if (use_ghost_cut) { + cutneighghostsqb = buffers->get_cutneighghostsq(); + for (int i = 1; i <= atom->ntypes; i++) + for (int j = 1; j <= atom->ntypes; j++) + cutneighghostsqb[i][j] = cutneighghostsq[i][j]; + } + + #ifdef _LMP_INTEL_OFFLOAD + if (_cop < 0) return; + int tp1sq = tp1 * tp1; + flt_t * ocutneighsq = cutneighsqb[0]; + #pragma offload_transfer target(mic:_cop) in(ocutneighsq: length(tp1sq)) + if (use_ghost_cut) { + flt_t * ocutneighghostsq = cutneighghostsqb[0]; + #pragma offload_transfer target(mic:_cop) \ + in(ocutneighghostsq: length(tp1sq)) + } + #endif +} + +/* ---------------------------------------------------------------------- */ + template <class flt_t, class acc_t, int offload_noghost, int need_ic, int FULL, int TRI, int THREE> -void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, +void NPairIntel::bin_newton(const int offload, NeighList *list, IntelBuffers<flt_t,acc_t> *buffers, const int astart, const int aend, - const int /*offload_end*/) { + const int offload_end) { if (aend-astart == 0) return; @@ -71,7 +120,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, const int pack_width = _fix->nbor_pack_width(); const ATOM_T * _noalias const x = buffers->get_x(); - int * _noalias const firstneigh = buffers->firstneigh(list); + int * _noalias const intel_list = buffers->intel_list(list); const int e_nall = nall_t; const int molecular = atom->molecular; @@ -94,8 +143,10 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, const tagint * _noalias const tag = atom->tag; int * _noalias const ilist = list->ilist; - int * _noalias numneigh = list->numneigh; - int * _noalias const cnumneigh = buffers->cnumneigh(list); + int ** _noalias const firstneigh = list->firstneigh; + int * _noalias const numneigh = list->numneigh; + int * _noalias const cnumneigh = buffers->cnumneigh(); + const int nstencil = this->nstencil; const int * _noalias const stencil = this->stencil; const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0]; @@ -168,6 +219,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \ in(cutneighsq:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ + in(intel_list:length(0) alloc_if(0) free_if(0)) \ in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ out(numneigh:length(0) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ @@ -178,7 +230,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \ in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \ in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \ - in(pack_width,special_bound) \ + in(pack_width,special_bound) \ out(overflow:length(5) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ signal(tag) @@ -212,7 +264,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, #if defined(_OPENMP) #pragma omp parallel default(none) \ - shared(numneigh, overflow, nstencilp, binstart, binend) + shared(overflow, nstencilp, binstart, binend) #endif { #ifdef _LMP_INTEL_OFFLOAD @@ -246,7 +298,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, const int obound = maxnbors * 3; #endif int ct = (ifrom + tid * 2) * maxnbors; - int *neighptr = firstneigh + ct; + int *neighptr = intel_list + ct; int *neighptr2; if (THREE) neighptr2 = neighptr; @@ -605,12 +657,24 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, if (n > maxnbors) *overflow = 1; ilist[i] = i; - cnumneigh[i] = ct; + firstneigh[i] = intel_list + ct; if (THREE) { + numneigh[i] = ns; + cnumneigh[i] = ct; #ifdef LMP_INTEL_3BODY_FAST cnumneigh[i] += lane; + #else + // Pad anyways just in case we have hybrid with 2-body and newton off + int pad_end = ns; + IP_PRE_neighbor_pad(pad_end, offload); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; ns < pad_end; ns++) + neighptr[n++] = e_nall; #endif - numneigh[i] = ns; } else { numneigh[i] = n; int pad_end = n; @@ -631,7 +695,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, if (lane == pack_width) { ct += max_chunk * pack_width; IP_PRE_edge_align(ct, sizeof(int)); - neighptr = firstneigh + ct; + neighptr = intel_list + ct; neighptr2 = neighptr; max_chunk = 0; lane = 0; @@ -647,7 +711,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, { ct += n; //IP_PRE_edge_align(ct, sizeof(int)); - neighptr = firstneigh + ct; + neighptr = intel_list + ct; if (THREE) neighptr2 = neighptr; if (ct + obound > list_size) { if (i < ito - 1) { @@ -667,7 +731,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, int ghost_offset = 0, nall_offset = e_nall; if (separate_buffers) { for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh[i]; int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); #if __INTEL_COMPILER+0 > 1499 @@ -712,7 +776,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, } for (int i = ifrom; i < ito; ++i) { - int * _noalias jlist = firstneigh + cnumneigh[i]; + int * _noalias jlist = firstneigh[i]; int jnum = numneigh[i]; if (!THREE) IP_PRE_neighbor_pad(jnum, offload); int jj = 0; @@ -739,13 +803,12 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, if (offload) { _fix->stop_watch(TIME_OFFLOAD_LATENCY); _fix->start_watch(TIME_HOST_NEIGHBOR); + firstneigh[0] = intel_list; for (int n = 0; n < aend; n++) { ilist[n] = n; numneigh[n] = 0; } } else { - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; if (separate_buffers) { _fix->start_watch(TIME_PACK); _fix->set_neighbor_host_sizes(); @@ -756,11 +819,6 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list, _fix->stop_watch(TIME_PACK); } } - #else - #pragma vector aligned - #pragma simd - for (int i = astart; i < aend; i++) - list->firstneigh[i] = firstneigh + cnumneigh[i]; #endif } diff --git a/src/USER-INTEL/npair_intel.h b/src/USER-INTEL/npair_intel.h index 55a529b2cb53a044b20a3a0f21f7ce9480ca6e89..e47687abea10424a5e052afdfd6b81b949e0574e 100644 --- a/src/USER-INTEL/npair_intel.h +++ b/src/USER-INTEL/npair_intel.h @@ -75,7 +75,8 @@ class NPairIntel : public NPair { public: NPairIntel(class LAMMPS *); ~NPairIntel(); - + virtual void copy_neighbor_info(); + #ifdef _LMP_INTEL_OFFLOAD void grow_stencil(); #endif @@ -83,6 +84,9 @@ class NPairIntel : public NPair { protected: FixIntel *_fix; + template <class flt_t, class acc_t> + void copy_cutsq_info(IntelBuffers<flt_t,acc_t> *); + template <class flt_t, class acc_t, int, int, int, int, int> void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *, const int, const int, const int offload_end = 0); diff --git a/src/USER-INTEL/npair_skip_intel.cpp b/src/USER-INTEL/npair_skip_intel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3d2463a0b978d883c66a4c364427668b1b7a1719 --- /dev/null +++ b/src/USER-INTEL/npair_skip_intel.cpp @@ -0,0 +1,230 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "npair_skip_intel.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "modify.h" +#include "molecule.h" +#include "neigh_request.h" +#include "domain.h" +#include "my_page.h" +#include "error.h" + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +NPairSkipIntel::NPairSkipIntel(LAMMPS *lmp) : NPair(lmp) { + int ifix = modify->find_fix("package_intel"); + if (ifix < 0) + error->all(FLERR, + "The 'package intel' command is required for /intel styles"); + _fix = static_cast<FixIntel *>(modify->fix[ifix]); + _inum_starts = new int[comm->nthreads]; + _inum_counts = new int[comm->nthreads]; + _full_props = 0; +} + +/* ---------------------------------------------------------------------- */ + +NPairSkipIntel::~NPairSkipIntel() { + delete []_inum_starts; + delete []_inum_counts; + if (_full_props) delete []_full_props; +} + +/* ---------------------------------------------------------------------- */ + +void NPairSkipIntel::copy_neighbor_info() +{ + NPair::copy_neighbor_info(); + if (_full_props) delete []_full_props; + _full_props = new int[neighbor->nlist]; + for (int i = 0; i < neighbor->nlist; i++) + _full_props[i] = neighbor->requests[i]->full; +} + +/* ---------------------------------------------------------------------- + build skip list for subset of types from parent list + works for half and full lists + works for owned (non-ghost) list, also for ghost list + iskip and ijskip flag which atom types and type pairs to skip + if ghost, also store neighbors of ghost atoms & set inum,gnum correctly +------------------------------------------------------------------------- */ + +template<class flt_t, int THREE> +void NPairSkipIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh, + int *numhalf_skip) +{ + const int nlocal = atom->nlocal; + const int e_nall = nlocal + atom->nghost; + const int * _noalias const type = atom->type; + int * _noalias const ilist = list->ilist; + int * _noalias const numneigh = list->numneigh; + int ** _noalias const firstneigh = (int ** const)list->firstneigh; + const int * _noalias const ilist_skip = list->listskip->ilist; + const int * _noalias const numneigh_skip = list->listskip->numneigh; + const int ** _noalias const firstneigh_skip = + (const int ** const)list->listskip->firstneigh; + const int * _noalias const iskip = list->iskip; + const int ** _noalias const ijskip = (const int ** const)list->ijskip; + + int num_skip = list->listskip->inum; + if (list->ghost) num_skip += list->listskip->gnum; + + int packthreads; + if (comm->nthreads > INTEL_HTHREADS && THREE==0) + packthreads = comm->nthreads; + else + packthreads = 1; + + #if defined(_OPENMP) + #pragma omp parallel if(packthreads > 1) + #endif + { + int tid, ifrom, ito; + IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads); + + // each thread has its own page allocator + MyPage<int> &ipage = list->ipage[tid]; + ipage.reset(); + + int my_inum = ifrom; + _inum_starts[tid] = ifrom; + + // loop over parent full list + for (int ii = ifrom; ii < ito; ii++) { + const int i = ilist_skip[ii]; + const int itype = type[i]; + if (iskip[itype]) continue; + + int n = 0; + int *neighptr = ipage.vget(); + + // loop over parent non-skip list + + const int * _noalias const jlist = firstneigh_skip[i]; + const int jnum = numneigh_skip[i]; + + if (THREE) { + const int jnumhalf = numhalf_skip[ii]; + for (int jj = 0; jj < jnumhalf; jj++) { + const int joriginal = jlist[jj]; + const int j = joriginal & NEIGHMASK; + if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal; + } + numhalf[my_inum] = n; + + for (int jj = jnumhalf; jj < jnum; jj++) { + const int joriginal = jlist[jj]; + const int j = joriginal & NEIGHMASK; + if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal; + } + } else { + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma ivdep + #endif + for (int jj = 0; jj < jnum; jj++) { + const int joriginal = jlist[jj]; + const int j = joriginal & NEIGHMASK; + if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal; + } + } + + ilist[my_inum++] = i; + firstneigh[i] = neighptr; + numneigh[i] = n; + + int pad_end = n; + IP_PRE_neighbor_pad(pad_end, 0); + #if defined(LMP_SIMD_COMPILER) + #pragma vector aligned + #pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \ + avg=INTEL_COMPILE_WIDTH/2 + #endif + for ( ; n < pad_end; n++) + neighptr[n] = e_nall; + + ipage.vgot(n); + if (ipage.status()) + error->one(FLERR,"Neighbor list overflow, boost neigh_modify one"); + } + + int last_inum = 0, loop_end; + _inum_counts[tid] = my_inum; + } + int inum = _inum_counts[0]; + for (int tid = 1; tid < packthreads; tid++) { + for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) { + if (THREE) numhalf[inum] = numhalf[i]; + ilist[inum++] = ilist[i]; + } + } + list->inum = inum; + + if (THREE && num_skip > 0) { + int * const list_start = firstneigh[ilist[0]]; + for (int ii = 0; ii < inum; ii++) { + int i = ilist[ii]; + cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start); + } + } + if (list->ghost) { + int num = 0; + int my_inum = list->inum; + for (int i = 0; i < my_inum; i++) + if (ilist[i] < nlocal) num++; + else break; + list->inum = num; + list->gnum = my_inum - num; + } +} + +/* ---------------------------------------------------------------------- */ + +void NPairSkipIntel::build(NeighList *list) +{ + if (_fix->three_body_neighbor()==0 || + _full_props[list->listskip->index] == 0) { + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) + build_t<float,0>(list, 0, 0, 0); + else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) + build_t<double,0>(list, 0, 0, 0); + else + build_t<float,0>(list, 0, 0, 0); + } else { + int *nhalf, *cnumneigh, *nhalf_skip, *u; + if (_fix->precision() == FixIntel::PREC_MODE_MIXED) { + _fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u); + _fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh); + build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip); + } else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) { + _fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u); + _fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh); + build_t<double,1>(list, nhalf, cnumneigh, nhalf_skip); + } else { + _fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u); + _fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh); + build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip); + } + } +} diff --git a/src/USER-INTEL/npair_skip_intel.h b/src/USER-INTEL/npair_skip_intel.h new file mode 100644 index 0000000000000000000000000000000000000000..e3277708d5c4188ebeef2197a18270ae561bacef --- /dev/null +++ b/src/USER-INTEL/npair_skip_intel.h @@ -0,0 +1,69 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef NPAIR_CLASS + +NPairStyle(skip/intel, + NPairSkipIntel, + NP_SKIP | NP_HALF | NP_FULL | + NP_NSQ | NP_BIN | NP_MULTI | + NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL) + +NPairStyle(skip/ghost/intel, + NPairSkipIntel, + NP_SKIP | NP_HALF | NP_FULL | + NP_NSQ | NP_BIN | NP_MULTI | + NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL) + +#else + +#ifndef LMP_NPAIR_SKIP_INTEL_H +#define LMP_NPAIR_SKIP_INTEL_H + +#include "npair.h" +#include "fix_intel.h" + +#if defined(_OPENMP) +#include <omp.h> +#endif + + +namespace LAMMPS_NS { + +class NPairSkipIntel : public NPair { + public: + NPairSkipIntel(class LAMMPS *); + ~NPairSkipIntel(); + virtual void copy_neighbor_info(); + void build(class NeighList *); + + protected: + FixIntel *_fix; + int *_inum_starts, *_inum_counts, *_full_props; + + template<class flt_t, int THREE> + void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: The 'package intel' command is required for /intel styles + +Self explanatory. + +*/ diff --git a/src/USER-INTEL/pair_airebo_intel.cpp b/src/USER-INTEL/pair_airebo_intel.cpp index 601144a3c03ae6a14f8cb2cb697cf19d7cdb020e..198f8798fa29e35e00429022cc37566edd6e88cb 100644 --- a/src/USER-INTEL/pair_airebo_intel.cpp +++ b/src/USER-INTEL/pair_airebo_intel.cpp @@ -97,6 +97,13 @@ struct LAMMPS_NS::PairAIREBOIntelParam { namespace { +struct NeighListLMPAIREBO { + int * num; /* num_all */ + int * num_half; /* num_all */ + int * offset; /* num_all */ + int ** entries; /* num_all * num_neighs_per_atom */ +}; + struct NeighListAIREBO { int * num; /* num_all */ int * num_half; /* num_all */ @@ -125,7 +132,7 @@ struct KernelArgsAIREBOT { int neigh_from_atom, neigh_to_atom; int rebuild_flag; flt_t skin; - struct NeighListAIREBO neigh_lmp; + struct NeighListLMPAIREBO neigh_lmp; struct NeighListAIREBO neigh_rebo; PairAIREBOIntelParam<flt_t,acc_t> params; struct AtomAIREBOT<flt_t> * x; /* num_all */ @@ -176,6 +183,11 @@ void PairAIREBOIntel::init_style() PairAIREBO::init_style(); neighbor->requests[neighbor->nrequest-1]->intel = 1; + const int nrequest = neighbor->nrequest; + for (int i = 0; i < nrequest; ++i) + if (neighbor->requests[i]->skip) + error->all(FLERR, "Cannot yet use airebo/intel with hybrid."); + int ifix = modify->find_fix("package_intel"); if (ifix < 0) error->all(FLERR, @@ -399,8 +411,7 @@ void PairAIREBOIntel::eval( ATOM_T * _noalias const x = buffers->get_x(offload); const int * _noalias const numneighhalf = buffers->get_atombin(); const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; tagint * const tag = atom->tag; const int ntypes = atom->ntypes + 1; @@ -441,7 +452,6 @@ void PairAIREBOIntel::eval( #pragma offload target(mic:_cop) if(offload) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(numneighhalf:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ @@ -472,10 +482,8 @@ void PairAIREBOIntel::eval( f_stride, x, 0/*q*/); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EVFLAG) { - oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; - } + if (EVFLAG) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) @@ -508,8 +516,7 @@ void PairAIREBOIntel::eval( args.skin = skin; args.neigh_lmp.num = const_cast<int*>(numneigh); args.neigh_lmp.num_half = const_cast<int*>(numneighhalf); - args.neigh_lmp.offset = const_cast<int*>(cnumneigh); - args.neigh_lmp.entries = const_cast<int*>(firstneigh); + args.neigh_lmp.entries = const_cast<int**>(firstneigh); args.neigh_rebo.num = REBO_numneigh; args.neigh_rebo.num_half = REBO_num_skin; args.neigh_rebo.offset = REBO_cnumneigh; @@ -543,18 +550,14 @@ void PairAIREBOIntel::eval( IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); if (EVFLAG) { - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - ev_global[2] = ov0; - ev_global[3] = ov1; - ev_global[4] = ov2; - ev_global[5] = ov3; - ev_global[6] = ov4; - ev_global[7] = ov5; - } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; + ev_global[2] = ov0; + ev_global[3] = ov1; + ev_global[4] = ov2; + ev_global[5] = ov3; + ev_global[6] = ov4; + ev_global[7] = ov5; } #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD) *timer_compute = MIC_Wtime() - *timer_compute; @@ -578,28 +581,20 @@ template<class flt_t, class acc_t> void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) { int tp1 = atom->ntypes + 1; - buffers->set_ntypes(tp1,1); - flt_t **cutneighsq = buffers->get_cutneighsq(); - flt_t **cutneighghostsq = buffers->get_cutneighghostsq(); - // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - cut = cutghost[i][j] + neighbor->skin; - cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } #ifdef _LMP_INTEL_OFFLOAD if (_cop < 0) return; - flt_t * ocutneighsq = cutneighsq[0]; size_t VL = 512 / 8 / sizeof(flt_t); int ntypes = tp1; int tp1sq = tp1 * tp1; @@ -607,7 +602,6 @@ void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) { // it might not be freed if this method is called more than once int * map = this->map; #pragma offload_transfer target(mic:_cop) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \ in(map: length(tp1) alloc_if(1) free_if(0)) #endif @@ -1580,7 +1574,7 @@ void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) { ka->nC[i] = 0; ka->nH[i] = 0; for (int j = 0; j < ka->neigh_lmp.num[i]; j++) { - int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j]; + int ji = ka->neigh_lmp.entries[i][j]; flt_t delx = ka->x[i].x - ka->x[ji].x; flt_t dely = ka->x[i].y - ka->x[ji].y; flt_t delz = ka->x[i].z - ka->x[ji].z; @@ -2168,7 +2162,7 @@ void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i, int morseflag) { AtomAIREBOT<flt_t> * x = ka->x; int jj; - int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int * neighs = ka->neigh_lmp.entries[i]; int jnum = ka->neigh_lmp.num_half[i]; for (jj = 0; jj < jnum; jj++) { int j = neighs[jj]; @@ -2332,7 +2326,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) { int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] : ka->neigh_rebo.num_half[i]; int * neighs = ka->rebuild_flag ? - &ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] : + ka->neigh_lmp.entries[i] : &ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum]; int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom]; int n = 0; @@ -4554,7 +4548,7 @@ static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) { fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]); fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]); - int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i]; + int * neighs = ka->neigh_lmp.entries[i]; int jnum = ka->neigh_lmp.num_half[i]; bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result); diff --git a/src/USER-INTEL/pair_airebo_intel.h b/src/USER-INTEL/pair_airebo_intel.h index 2d32925f68f6731acb516b22ea03d788fd0a1287..8a319536de119f6f02ac2a42b90f493e1c389f33 100644 --- a/src/USER-INTEL/pair_airebo_intel.h +++ b/src/USER-INTEL/pair_airebo_intel.h @@ -107,4 +107,10 @@ E: Cannot open AIREBO potential file %s The specified AIREBO potential file cannot be opened. Check that the path and name are correct. +E: Cannot yet use airebo/intel with hybrid. + +Pair style airebo/intel cannot currently be used as part of a hybrid +pair style (with the exception of hybrid/overlay). + + */ diff --git a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp index 39fa9014db45238da0f50c04eb171843610d2979..3954d559e1472b6fe808cb2fa28c683e28140a6f 100644 --- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp @@ -144,8 +144,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_coul = fc.special_coul; const flt_t * _noalias const special_lj = fc.special_lj; @@ -182,7 +181,6 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ @@ -204,8 +202,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -231,8 +231,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -361,13 +361,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -375,6 +372,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -439,19 +438,16 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < ncoultablebits; i++) ntable *= 2; fc.set_ntypes(tp1, ntable, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -483,12 +479,10 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc, C_FORCE_T * c_force = fc.c_force[0]; C_ENERGY_T * c_energy = fc.c_energy[0]; C_CUT_T * c_cut = fc.c_cut[0]; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ - in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_buck_coul_long_intel.cpp b/src/USER-INTEL/pair_buck_coul_long_intel.cpp index fe4d408a1333be088bb49dc200d0292ba7c1ef7d..059413c3d94c1e1d73ba7eee56417f9f07e3554e 100644 --- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp @@ -144,8 +144,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_coul = fc.special_coul; const flt_t * _noalias const special_lj = fc.special_lj; @@ -206,7 +205,6 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \ in(rho_inv:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ @@ -230,8 +228,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -265,8 +265,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; const flt_t * _noalias const rho_invi = rho_inv + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -437,13 +437,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; + oecoul *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -451,6 +448,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -523,19 +522,16 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < ncoultablebits; i++) ntable *= 2; fc.set_ntypes(tp1, ntable, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -591,15 +587,13 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * detable = fc.detable; flt_t * ctable = fc.ctable; flt_t * dctable = fc.dctable; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \ in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \ in(table: length(ntable) alloc_if(0) free_if(0)) \ - in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_buck_intel.cpp b/src/USER-INTEL/pair_buck_intel.cpp index 8ce3d121e0b6f0cfd18149bf111e27b403db4b51..0e0bd0f56fdf8977f70e38cf78ad8886de5da488 100644 --- a/src/USER-INTEL/pair_buck_intel.cpp +++ b/src/USER-INTEL/pair_buck_intel.cpp @@ -135,8 +135,7 @@ void PairBuckIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_lj = fc.special_lj; const C_FORCE_T * _noalias const c_force = fc.c_force[0]; @@ -167,7 +166,6 @@ void PairBuckIntel::eval(const int offload, const int vflag, in(special_lj:length(0) alloc_if(0) free_if(0)) \ in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(ilist:length(0) alloc_if(0) free_if(0)) \ @@ -188,8 +186,10 @@ void PairBuckIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -214,8 +214,8 @@ void PairBuckIntel::eval(const int offload, const int vflag, const int ptr_off = itype * ntypes; const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -324,13 +324,9 @@ void PairBuckIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -338,6 +334,8 @@ void PairBuckIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -396,19 +394,16 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc, int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -434,12 +429,10 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * special_lj = fc.special_lj; C_FORCE_T * c_force = fc.c_force[0]; C_ENERGY_T * c_energy = fc.c_energy[0]; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj: length(4) alloc_if(0) free_if(0)) \ - in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp index 7c33ed21d35cb130b4271e62cf9b98cfe77e2132..9baa64e8e05984afd58aa55115d6ca2c2b911dac 100644 --- a/src/USER-INTEL/pair_dpd_intel.cpp +++ b/src/USER-INTEL/pair_dpd_intel.cpp @@ -173,8 +173,7 @@ void PairDPDIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const FC_PACKED1_T * _noalias const param = fc.param[0]; const flt_t * _noalias const special_lj = fc.special_lj; int * _noalias const rngi_thread = fc.rngi; @@ -204,8 +203,10 @@ void PairDPDIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -233,10 +234,10 @@ void PairDPDIntel::eval(const int offload, const int vflag, flt_t icut, a0, gamma, sigma; if (ONETYPE) { - icut = param[3].icut; - a0 = param[3].a0; - gamma = param[3].gamma; - sigma = param[3].sigma; + icut = param[_onetype].icut; + a0 = param[_onetype].a0; + gamma = param[_onetype].gamma; + sigma = param[_onetype].sigma; } for (int ii = iifrom; ii < iito; ii += iip) { const int i = ilist[ii]; @@ -248,8 +249,8 @@ void PairDPDIntel::eval(const int offload, const int vflag, parami = param + ptr_off; } - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; @@ -379,13 +380,9 @@ void PairDPDIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -393,6 +390,8 @@ void PairDPDIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -503,32 +502,26 @@ void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { _onetype = 0; - if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; + int mytypes = 0; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { - cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - double icut = 1.0 / cut; - fc.param[i][j].icut = fc.param[j][i].icut = icut; - } else { - cut = init_one(i,j); - double icut = 1.0 / cut; - fc.param[i][j].icut = fc.param[j][i].icut = icut; + mytypes++; + _onetype = i * tp1 + j; } + double cut = init_one(i,j); + cutsq[i][j] = cutsq[j][i] = cut*cut; + double icut = 1.0 / cut; + fc.param[i][j].icut = fc.param[j][i].icut = icut; } } - + if (mytypes > 1 || atom->molecular) _onetype = 0; + for (int i = 0; i < 4; i++) { fc.special_lj[i] = force->special_lj[i]; fc.special_lj[0] = 1.0; diff --git a/src/USER-INTEL/pair_eam_intel.cpp b/src/USER-INTEL/pair_eam_intel.cpp index ce9ede69d680684a6af9a896187d24fb425ed52b..95d0272d33361609d0372138811b407faa59f26e 100644 --- a/src/USER-INTEL/pair_eam_intel.cpp +++ b/src/USER-INTEL/pair_eam_intel.cpp @@ -193,8 +193,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const FC_PACKED1_T * _noalias const rhor_spline_f = fc.rhor_spline_f; const FC_PACKED1_T * _noalias const rhor_spline_e = fc.rhor_spline_e; const FC_PACKED2_T * _noalias const z2r_spline_t = fc.z2r_spline_t; @@ -243,8 +242,10 @@ void PairEAMIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -293,8 +294,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, itype = x[i].w; rhor_ioff = istride * itype; } - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); const flt_t xtmp = x[i].x; @@ -450,8 +451,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (tid == 0) comm->forward_comm_pair(this); - if (NEWTON_PAIR) - memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); + if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T)); #if defined(_OPENMP) #pragma omp barrier @@ -469,8 +469,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, rhor_ioff = istride * itype; scale_fi = scale_f + itype*ntypes; } - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; @@ -597,11 +597,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; @@ -610,6 +606,8 @@ void PairEAMIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -682,19 +680,16 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,nr,nrho,memory,_cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -733,13 +728,13 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc, fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4]; fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5]; fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6]; - fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0]; - fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1]; - fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2]; - fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3]; - fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4]; - fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5]; - fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6]; + fc.z2r_spline_t[joff + k].a=z2r_spline[type2z2r[j][i]][k][0]; + fc.z2r_spline_t[joff + k].b=z2r_spline[type2z2r[j][i]][k][1]; + fc.z2r_spline_t[joff + k].c=z2r_spline[type2z2r[j][i]][k][2]; + fc.z2r_spline_t[joff + k].d=z2r_spline[type2z2r[j][i]][k][3]; + fc.z2r_spline_t[joff + k].e=z2r_spline[type2z2r[j][i]][k][4]; + fc.z2r_spline_t[joff + k].f=z2r_spline[type2z2r[j][i]][k][5]; + fc.z2r_spline_t[joff + k].g=z2r_spline[type2z2r[j][i]][k][6]; } } } @@ -754,7 +749,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, const int nr, const int nrho, Memory *memory, const int cop) { - if (ntypes != _ntypes || nr > _nr || nrho > _nrho) { + if (ntypes != _ntypes || nr + 1 > _nr || nrho + 1 > _nrho) { if (_ntypes > 0) { _memory->destroy(rhor_spline_f); _memory->destroy(rhor_spline_e); diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp index 30941a7594810f8bb7037ee162fc5a361a69c0fa..51524355d5cf8b56b48316a6aad86b109ccca2d9 100644 --- a/src/USER-INTEL/pair_gayberne_intel.cpp +++ b/src/USER-INTEL/pair_gayberne_intel.cpp @@ -228,8 +228,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_lj = fc.special_lj; const FC_PACKED1_T * _noalias const ijc = fc.ijc[0]; @@ -283,7 +282,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \ in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(quat:length(nall+1) alloc_if(0) free_if(0)) \ @@ -337,9 +335,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, #endif acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0.0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0; if (NEWTON_PAIR == 0) f_start[1].w = 0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -372,8 +372,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off; const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); const flt_t xtmp = x[i].x; @@ -833,13 +833,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, f_start[1].w = ierror; } // omp - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; ov0 *= (acc_t)-0.5; ov1 *= (acc_t)-0.5; ov2 *= (acc_t)-0.5; @@ -847,6 +843,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)-0.5; ov5 *= (acc_t)-0.5; } + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -914,19 +912,16 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc, if (mthreads < buffers->get_off_threads()) mthreads = buffers->get_off_threads(); fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -963,14 +958,12 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc, FC_PACKED1_T *oijc = fc.ijc[0]; FC_PACKED2_T *olj34 = fc.lj34[0]; FC_PACKED3_T *oic = fc.ic; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; if (oijc != NULL && oic != NULL) { #pragma offload_transfer target(mic:_cop) \ in(special_lj: length(4) alloc_if(0) free_if(0)) \ in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \ - in(oic: length(tp1) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq)) + in(oic: length(tp1) alloc_if(0) free_if(0)) } #endif } diff --git a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp index 8e6395c3885cecf42540f303ed278996b2656ed1..0b6ac3ffa56d45333631c5bd4e01c2e08d8fd7bf 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_charmm_intel.cpp @@ -138,8 +138,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_coul = fc.special_coul; const flt_t * _noalias const special_lj = fc.special_lj; @@ -186,7 +185,6 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ @@ -212,8 +210,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -248,8 +248,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, const flt_t * _noalias const cutsqi = cutsq + ptr_off; const LJ_T * _noalias const lji = lj + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -403,16 +403,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { oevdwl *= (acc_t)0.5; oecoul *= (acc_t)0.5; - } - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - if (NEWTON_PAIR == 0) { ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -420,6 +414,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -487,22 +483,19 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; if (cut_lj > cut_coul) error->all(FLERR, "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -540,12 +533,10 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * special_coul = fc.special_coul; flt_t * cutsq = fc.cutsq[0]; LJ_T * lj = fc.lj[0]; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ - in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp index a01a4688a582036b6a0498b460dfbcb3691f28ba..753a9afdd9e4ede4977f50021645abb5b58b6eb8 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp @@ -28,7 +28,7 @@ #include "suffix.h" using namespace LAMMPS_NS; -#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t +#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec2_t #define TABLE_T typename ForceConst<flt_t>::table_t /* ---------------------------------------------------------------------- */ @@ -142,8 +142,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_coul = fc.special_coul; const flt_t * _noalias const special_lj = fc.special_lj; @@ -207,7 +206,6 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ @@ -232,8 +230,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -268,8 +268,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, const flt_t * _noalias const cutsqi = cutsq + ptr_off; const LJ_T * _noalias const lji = lj + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -376,8 +376,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, if (rsq < cut_ljsq) { #endif flt_t r6inv = r2inv * r2inv * r2inv; - forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y); - if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w); + flt_t eps4 = lji[jtype].x; + flt_t sigp6 = lji[jtype].y; + flt_t lj4 = eps4 * sigp6; + flt_t lj3 = lj4 * sigp6; + flt_t lj2 = (flt_t)6.0 * lj4; + flt_t lj1 = (flt_t)12.0 * lj3; + forcelj = r6inv * (lj1 * r6inv - lj2); + if (EFLAG) evdwl = r6inv*(lj3 * r6inv - lj4); #ifdef INTEL_VMASK if (rsq > cut_lj_innersq) { @@ -397,8 +403,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, } #endif } else { - const flt_t philj = r6inv * (lji[jtype].z*r6inv - - lji[jtype].w); + const flt_t philj = r6inv * (lj3 * r6inv - lj4); #ifndef INTEL_VMASK if (rsq > cut_lj_innersq) #endif @@ -463,16 +468,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { oevdwl *= (acc_t)0.5; oecoul *= (acc_t)0.5; - } - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - if (NEWTON_PAIR == 0) { ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -480,6 +479,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -552,22 +553,19 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < ncoultablebits; i++) ntable *= 2; fc.set_ntypes(tp1, ntable, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; if (cut_lj > cut_coul) error->all(FLERR, "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic"); for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -591,10 +589,13 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 1; i < tp1; i++) { for (int j = 1; j < tp1; j++) { - fc.lj[i][j].x = lj1[i][j]; - fc.lj[i][j].y = lj2[i][j]; - fc.lj[i][j].z = lj3[i][j]; - fc.lj[i][j].w = lj4[i][j]; + if (i <= j) { + fc.lj[i][j].x = epsilon[i][j] * 4.0; + fc.lj[i][j].y = pow(sigma[i][j],6.0); + } else { + fc.lj[i][j].x = epsilon[j][i] * 4.0; + fc.lj[i][j].y = pow(sigma[j][i],6.0); + } fc.cutsq[i][j] = cutsq[i][j]; } } @@ -623,14 +624,12 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * detable = fc.detable; flt_t * ctable = fc.ctable; flt_t * dctable = fc.dctable; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \ in(table: length(ntable) alloc_if(0) free_if(0)) \ - in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) #endif } @@ -647,7 +646,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, flt_t * ospecial_lj = special_lj; flt_t * ospecial_coul = special_coul; flt_t * ocutsq = cutsq[0]; - typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0]; + typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0]; table_t * otable = table; flt_t * oetable = etable; flt_t * odetable = detable; @@ -687,7 +686,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes, flt_t * ospecial_lj = special_lj; flt_t * ospecial_coul = special_coul; flt_t * ocutsq = cutsq[0]; - typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0]; + typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0]; table_t * otable = table; flt_t * oetable = etable; flt_t * odetable = detable; diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h index 1b13d784971667c7f68b48afbe2e057013e0cb5e..006674507889077820cd843a20b1eedfc8da08e8 100644 --- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h +++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h @@ -69,7 +69,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong { flt_t cut_lj_innersq; table_t *table; flt_t *etable, *detable, *ctable, *dctable; - typename IntelBuffers<flt_t,flt_t>::vec4_t **lj; + typename IntelBuffers<flt_t,flt_t>::vec2_t **lj; ForceConst() : _ntypes(0), _ntable(0) {} ~ForceConst() { set_ntypes(0,0,NULL,_cop); } diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp index ab0b5b3d55e65d75e0fb2acd076182931990e151..35ed9061ce0be04d05f9c522e46fd9cdcd01b30e 100644 --- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp @@ -141,8 +141,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_coul = fc.special_coul; const flt_t * _noalias const special_lj = fc.special_lj; @@ -201,7 +200,6 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \ in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \ in(firstneigh:length(0) alloc_if(0) free_if(0)) \ - in(cnumneigh:length(0) alloc_if(0) free_if(0)) \ in(numneigh:length(0) alloc_if(0) free_if(0)) \ in(x:length(x_size) alloc_if(0) free_if(0)) \ in(q:length(q_size) alloc_if(0) free_if(0)) \ @@ -225,8 +223,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, f_stride, x, q); acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -260,8 +260,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off; const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off; - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp,fytmp,fztmp,fwtmp; @@ -433,16 +433,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { oevdwl *= (acc_t)0.5; oecoul *= (acc_t)0.5; - } - ev_global[0] = oevdwl; - ev_global[1] = oecoul; - } - if (vflag) { - if (NEWTON_PAIR == 0) { ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -450,6 +444,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = oecoul; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -522,19 +518,16 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, for (int i = 0; i < ncoultablebits; i++) ntable *= 2; fc.set_ntypes(tp1, ntable, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -587,14 +580,12 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc, flt_t * detable = fc.detable; flt_t * ctable = fc.ctable; flt_t * dctable = fc.dctable; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; #pragma offload_transfer target(mic:_cop) \ in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \ in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \ in(table: length(ntable) alloc_if(0) free_if(0)) \ - in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp index c973639709e079e2f731f8eb903282ff8bcb8275..94133a7f4771e349e61e365aca9f0fce1670e4b4 100644 --- a/src/USER-INTEL/pair_lj_cut_intel.cpp +++ b/src/USER-INTEL/pair_lj_cut_intel.cpp @@ -150,8 +150,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int ** _noalias const firstneigh = (const int **)list->firstneigh; const flt_t * _noalias const special_lj = fc.special_lj; const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0]; const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0]; @@ -180,8 +179,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (NEWTON_PAIR == 0 && inum != nlocal) + memset(f_start, 0, f_stride * sizeof(FORCE_T)); // loop over neighbors of my atoms #if defined(_OPENMP) @@ -201,12 +202,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag, flt_t cutsq, lj1, lj2, lj3, lj4, offset; if (ONETYPE) { - cutsq = ljc12o[3].cutsq; - lj1 = ljc12o[3].lj1; - lj2 = ljc12o[3].lj2; - lj3 = lj34[3].lj3; - lj4 = lj34[3].lj4; - offset = ljc12o[3].offset; + cutsq = ljc12o[_onetype].cutsq; + lj1 = ljc12o[_onetype].lj1; + lj2 = ljc12o[_onetype].lj2; + lj3 = lj34[_onetype].lj3; + lj4 = lj34[_onetype].lj4; + offset = ljc12o[_onetype].offset; } for (int ii = iifrom; ii < iito; ii += iip) { const int i = ilist[ii]; @@ -219,8 +220,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag, ljc12oi = ljc12o + ptr_off; lj34i = lj34 + ptr_off; } - const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - int jnum = numneigh[ii]; + const int * _noalias const jlist = firstneigh[i]; + int jnum = numneigh[i]; IP_PRE_neighbor_pad(jnum, offload); acc_t fxtmp, fytmp, fztmp, fwtmp; @@ -338,13 +339,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { - if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5; - ev_global[0] = oevdwl; - ev_global[1] = (acc_t)0.0; - } - if (vflag) { + if (EFLAG || vflag) { if (NEWTON_PAIR == 0) { + oevdwl *= (acc_t)0.5; ov0 *= (acc_t)0.5; ov1 *= (acc_t)0.5; ov2 *= (acc_t)0.5; @@ -352,6 +349,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag, ov4 *= (acc_t)0.5; ov5 *= (acc_t)0.5; } + ev_global[0] = oevdwl; + ev_global[1] = (acc_t)0.0; ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -414,25 +413,26 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t,acc_t> *buffers) { _onetype = 0; - if (atom->ntypes == 1 && !atom->molecular) _onetype = 1; int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,memory,_cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; + int mytypes = 0; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { + double cut; if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; + mytypes++; + _onetype = i * tp1 + j; + } else { + cut = 0.0; } + cutsq[i][j] = cutsq[j][i] = cut*cut; } } + if (mytypes > 1 || atom->molecular) _onetype = 0; for (int i = 0; i < 4; i++) { fc.special_lj[i] = force->special_lj[i]; diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp index aff8ba88a74971f7ee1e4e7f824334af0f056330..9e00516087cb3711f43d37927b6c98448bbc5b35 100644 --- a/src/USER-INTEL/pair_sw_intel.cpp +++ b/src/USER-INTEL/pair_sw_intel.cpp @@ -130,37 +130,37 @@ void PairSWIntel::compute(int eflag, int vflag, if (_onetype) { if (_spq) { if (eflag) { - eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (eflag) { - eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum); } } } else { if (_spq) { if (eflag) { - eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum); } } else { if (eflag) { - eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum); } else { - eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad); - eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad); + eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end); + eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum); } } } @@ -173,7 +173,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, const int astart, - const int aend, const int /*pad_width*/) + const int aend) { const int inum = aend - astart; if (inum == 0) return; @@ -185,10 +185,13 @@ void PairSWIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); const int * _noalias const ilist = list->ilist; - const int * _noalias const numneighhalf = buffers->get_atombin(); const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int * _noalias const firstneigh = list->firstneigh[ilist[0]]; + + int *nhalf, *cnum; + buffers->get_list_data3(list, nhalf, cnum); + const int * _noalias const numneighhalf = nhalf; + const int * _noalias const cnumneigh = cnum; const FC_PACKED0_T * _noalias const p2 = fc.p2[0]; const FC_PACKED1_T * _noalias const p2f = fc.p2f[0]; @@ -206,6 +209,8 @@ void PairSWIntel::eval(const int offload, const int vflag, const int ntypes = atom->ntypes + 1; const int eatom = this->eflag_atom; + const int onetype = _onetype; + const int onetype3 = _onetype3; // Determine how much data to transfer int x_size, q_size, f_stride, ev_size, separate_flag; @@ -235,8 +240,8 @@ void PairSWIntel::eval(const int offload, const int vflag, in(overflow:length(0) alloc_if(0) free_if(0)) \ in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \ - in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \ - in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \ + in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload,onetype3) \ + in(astart,nlocal,f_stride,minlocal,separate_flag,onetype) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ out(timer_compute:length(1) alloc_if(0) free_if(0)) \ @@ -251,8 +256,8 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) @@ -278,24 +283,24 @@ void PairSWIntel::eval(const int offload, const int vflag, flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6; flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { - cutsq = p2[3].cutsq; - cut = p2f[3].cut; - sigma = p2f[3].sigma; - c1 = p2f2[3].c1; - c2 = p2f2[3].c2; - c3 = p2f2[3].c3; - c4 = p2f2[3].c4; - sigma_gamma = p2[3].sigma_gamma; - costheta = p3[7].costheta; - lambda_epsilon = p3[7].lambda_epsilon; - lambda_epsilon2 = p3[7].lambda_epsilon2; + cutsq = p2[onetype].cutsq; + cut = p2f[onetype].cut; + sigma = p2f[onetype].sigma; + c1 = p2f2[onetype].c1; + c2 = p2f2[onetype].c2; + c3 = p2f2[onetype].c3; + c4 = p2f2[onetype].c4; + sigma_gamma = p2[onetype].sigma_gamma; + costheta = p3[onetype3].costheta; + lambda_epsilon = p3[onetype3].lambda_epsilon; + lambda_epsilon2 = p3[onetype3].lambda_epsilon2; if (SPQ == 0) { - powerp = p2f[3].powerp; - powerq = p2f[3].powerq; + powerp = p2f[onetype].powerp; + powerq = p2f[onetype].powerq; } if (EFLAG) { - c5 = p2e[3].c5; - c6 = p2e[3].c6; + c5 = p2e[onetype].c5; + c6 = p2e[onetype].c6; } } @@ -312,7 +317,7 @@ void PairSWIntel::eval(const int offload, const int vflag, } const int * _noalias const jlist = firstneigh + cnumneigh[ii]; - const int jnum = numneigh[ii]; + const int jnum = numneigh[i]; const int jnumhalf = numneighhalf[ii]; acc_t fxtmp, fytmp, fztmp, fwtmp; @@ -541,11 +546,9 @@ void PairSWIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { ev_global[0] = oevdwl; ev_global[1] = (acc_t)0.0; - } - if (vflag) { ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -585,7 +588,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t> void PairSWIntel::eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> *buffers, const ForceConst<flt_t> &fc, const int astart, - const int aend, const int pad_width) + const int aend) { typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t; typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t; @@ -601,10 +604,13 @@ void PairSWIntel::eval(const int offload, const int vflag, ATOM_T * _noalias const x = buffers->get_x(offload); const int * _noalias const ilist = list->ilist; - const int * _noalias const numneighhalf = buffers->get_atombin(); const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int * _noalias const firstneigh = list->firstneigh[ilist[0]]; + + int *nhalf, *cnum; + buffers->get_list_data3(list, nhalf, cnum); + const int * _noalias const numneighhalf = nhalf; + const int * _noalias const cnumneigh = cnum; const FC_PACKED0_T * _noalias const p2 = fc.p2[0]; const FC_PACKED1_T * _noalias const p2f = fc.p2f[0]; @@ -654,7 +660,7 @@ void PairSWIntel::eval(const int offload, const int vflag, in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \ in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \ in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \ - in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \ + in(astart,nlocal,f_stride,minlocal,separate_flag) \ in(ccache_stride3) \ out(f_start:length(f_stride) alloc_if(0) free_if(0)) \ out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \ @@ -670,8 +676,8 @@ void PairSWIntel::eval(const int offload, const int vflag, f_stride, x, 0); acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; #if defined(_OPENMP) #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) @@ -701,34 +707,38 @@ void PairSWIntel::eval(const int offload, const int vflag, SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6; SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2; if (ONETYPE) { - cutsq = SIMD_set(p2[3].cutsq); - cut = SIMD_set(p2f[3].cut); - sigma = SIMD_set(p2f[3].sigma); - c1 = SIMD_set(p2f2[3].c1); - c2 = SIMD_set(p2f2[3].c2); - c3 = SIMD_set(p2f2[3].c3); - c4 = SIMD_set(p2f2[3].c4); - sigma_gamma = SIMD_set(p2[3].sigma_gamma); - costheta = SIMD_set(p3[7].costheta); - lambda_epsilon = SIMD_set(p3[7].lambda_epsilon); - lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2); + cutsq = SIMD_set(p2[_onetype].cutsq); + cut = SIMD_set(p2f[_onetype].cut); + sigma = SIMD_set(p2f[_onetype].sigma); + c1 = SIMD_set(p2f2[_onetype].c1); + c2 = SIMD_set(p2f2[_onetype].c2); + c3 = SIMD_set(p2f2[_onetype].c3); + c4 = SIMD_set(p2f2[_onetype].c4); + sigma_gamma = SIMD_set(p2[_onetype].sigma_gamma); + costheta = SIMD_set(p3[_onetype3].costheta); + lambda_epsilon = SIMD_set(p3[_onetype3].lambda_epsilon); + lambda_epsilon2 = SIMD_set(p3[_onetype3].lambda_epsilon2); if (SPQ == 0) { - powerp = SIMD_set(p2f[3].powerp); - powerq = SIMD_set(p2f[3].powerq); + powerp = SIMD_set(p2f[_onetype].powerp); + powerq = SIMD_set(p2f[_onetype].powerq); } if (EFLAG) { - c5 = SIMD_set(p2e[3].c5); - c6 = SIMD_set(p2e[3].c6); + c5 = SIMD_set(p2e[_onetype].c5); + c6 = SIMD_set(p2e[_onetype].c6); } } - const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128, - 144,160,176,192,208,224,240); acc_t * const dforce = &(f[0].x); for (int i = iifrom; i < iito; i += iip) { SIMD_int ilistv = SIMD_load(ilist + i); SIMD_int goffset = ilistv * 16; - SIMD_mask imask = ilistv < iito; + SIMD_mask imask; + if (swidth == 16) + imask = 0xFFFF; + else + imask = 0xFF; + const int rem = iito - i; + if (rem < swidth) imask = imask >> (swidth - rem); SIMD_flt_t xtmp, ytmp, ztmp; SIMD_int itype, itype_offset; @@ -741,11 +751,12 @@ void PairSWIntel::eval(const int offload, const int vflag, #ifdef LMP_INTEL_3BODY_FAST const int* ng = firstneigh + cnumneigh[i] - swidth; + const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); #else SIMD_int ng = SIMD_load(cnumneigh + i); + const SIMD_int jnum = SIMD_gatherz(imask, numneigh, ilistv); ng = ng - 1; #endif - const SIMD_int jnum = SIMD_loadz(imask, numneigh + i); const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i); const int jnum_max = SIMD_max(jnum); @@ -1051,11 +1062,9 @@ void PairSWIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { ev_global[0] = oevdwl; ev_global[1] = (acc_t)0.0; - } - if (vflag) { ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -1162,26 +1171,22 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1,memory,_cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i,j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } _onetype = 0; - if (atom->ntypes == 1) _onetype = 1; - _spq = 1; + int mytypes = 0; for (int ii = 0; ii < tp1; ii++) { int i = map[ii]; for (int jj = 0; jj < tp1; jj++) { @@ -1231,6 +1236,9 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, fc.p3[ii][jj][kk].lambda_epsilon = 0; fc.p3[ii][jj][kk].lambda_epsilon2 = 0; } else { + mytypes++; + _onetype = ii * tp1 + jj; + _onetype3 = ii * tp1 * tp1 + jj * tp1 + kk; int ijkparam = elem2param[i][j][k]; fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta; fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon; @@ -1239,12 +1247,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, } } } - - _host_pad = 1; - _offload_pad = 1; - - if (INTEL_NBOR_PAD > 1) - _host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t); + if (mytypes > 1) _onetype = 0; #ifdef _LMP_INTEL_OFFLOAD if (_cop < 0) return; @@ -1253,16 +1256,11 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc, FC_PACKED1p2_T *op2f2 = fc.p2f2[0]; FC_PACKED2_T *op2e = fc.p2e[0]; FC_PACKED3_T *op3 = fc.p3[0][0]; - flt_t * ocutneighsq = cutneighsq[0]; int tp1sq = tp1 * tp1; int tp1cu = tp1sq * tp1; - if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && - op3 != NULL && ocutneighsq != NULL) { - #pragma offload_transfer target(mic:_cop) \ - in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ - in(op3: length(tp1cu) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq)) - } + #pragma offload_transfer target(mic:_cop) \ + in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \ + in(op3: length(tp1cu) alloc_if(0) free_if(0)) #endif } diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h index ffcf9a6fb6b69cbfe9747d4b19752a546006ed7d..36e39e3e853d39a9c6a6ce6e4602b34358283471 100644 --- a/src/USER-INTEL/pair_sw_intel.h +++ b/src/USER-INTEL/pair_sw_intel.h @@ -49,13 +49,13 @@ class PairSWIntel : public PairSW { template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t> void eval(const int offload, const int vflag, IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc, - const int astart, const int aend, const int pad_width); + const int astart, const int aend); template <class flt_t, class acc_t> void pack_force_const(ForceConst<flt_t> &fc, IntelBuffers<flt_t, acc_t> *buffers); - int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype; + int _ccache_stride, _spq, _onetype, _onetype3; #ifdef LMP_USE_AVXCD int _ccache_stride3; #endif @@ -75,7 +75,7 @@ class PairSWIntel : public PairSW { flt_t c1, c2, c3, c4; } fc_packed1p2; typedef struct { - flt_t c5, c6; + flt_t c5, c6, d1, d2; } fc_packed2; typedef struct { flt_t costheta, lambda_epsilon, lambda_epsilon2, pad; diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index 584b3717842cfcd3b2219fd7419040a497582bb0..5d4c5f2cb677f3a5ec5e2d5ce88e6f4e83e4daa7 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -220,7 +220,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> static void kernel_step( int eatom, int vflag, const int * _noalias const numneigh, - const int * _noalias const cnumneigh, + iarr vcnumneigh, const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, @@ -241,7 +241,8 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic> const c_inner_t * _noalias const c_inner, const c_outer_t * _noalias const c_outer, typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, - avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive + avec *vsevdwl, int compress_idx, int ii, int i, iarr js, + bvec vmask_repulsive ); }; @@ -274,9 +275,12 @@ void PairTersoffIntel::eval(const int offload, const int vflag, const int * _noalias const ilist = list->ilist; const int * _noalias const numneigh = list->numneigh; - const int * _noalias const cnumneigh = buffers->cnumneigh(list); - const int * _noalias const numneighhalf = buffers->get_atombin(); - const int * _noalias const firstneigh = buffers->firstneigh(list); + const int * _noalias const firstneigh = list->firstneigh[ilist[0]]; + + int *nhalf, *cnum; + buffers->get_list_data3(list, nhalf, cnum); + const int * _noalias const numneighhalf = nhalf; + const int * _noalias const cnumneigh = cnum; typedef typename ForceConst<flt_t>::c_inner_t c_inner_t; typedef typename ForceConst<flt_t>::c_outer_t c_outer_t; @@ -332,13 +336,13 @@ void PairTersoffIntel::eval(const int offload, const int vflag, IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall, f_stride, x, 0); - acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5; - if (EFLAG) oevdwl = oecoul = (acc_t)0; - if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; + acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5; + if (EFLAG || vflag) + oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0; // loop over neighbors of my atoms #if defined(_OPENMP) - #pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5) + #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5) #endif { int iifrom, iito, tid; @@ -379,11 +383,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag, IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag, ov0, ov1, ov2, ov3, ov4, ov5); - if (EFLAG) { + if (EFLAG || vflag) { ev_global[0] = oevdwl; ev_global[1] = 0.0; - } - if (vflag) { ev_global[2] = ov0; ev_global[3] = ov1; ev_global[4] = ov2; @@ -461,19 +463,16 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, int tp1 = atom->ntypes + 1; fc.set_ntypes(tp1, memory, _cop); - buffers->set_ntypes(tp1); - flt_t **cutneighsq = buffers->get_cutneighsq(); // Repeat cutsq calculation because done after call to init_style - double cut, cutneigh; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { - if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + double cut; + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) cut = init_one(i, j); - cutneigh = cut + neighbor->skin; - cutsq[i][j] = cutsq[j][i] = cut*cut; - cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh; - } + else + cut = 0.0; + cutsq[i][j] = cutsq[j][i] = cut*cut; } } @@ -547,7 +546,6 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0]; typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0]; typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0]; - flt_t * ocutneighsq = cutneighsq[0]; size_t VL = 512 / 8 / sizeof(flt_t); int ntypes = tp1; int ntypes_pad = ntypes + VL - ntypes % VL; @@ -557,8 +555,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc, #pragma offload_transfer target(mic:_cop) \ in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \ in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \ - in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \ - in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) + in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) #endif } @@ -647,7 +644,7 @@ template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i> template<int EFLAG> void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( int eatom, int vflag, - const int * _noalias const numneigh, const int * _noalias const cnumneigh, + const int * _noalias const numneigh, iarr vcnumneigh, const int * _noalias const firstneigh, int ntypes, typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x, const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner, @@ -681,6 +678,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( // TDO: We could extract this from the driver routine ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is)); ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js)); + ivec vcnumneigh_i = v::int_load_vl(vcnumneigh); bvec vmask = v::mask_enable_lower(compress_idx); fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero(); ivec vw_i = v_i0; @@ -692,7 +690,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step( fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij; fvec vrij = sqrt(vrijsq); ivec vis_orig = v::int_load_vl(is); - ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh); ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh); ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i)); @@ -930,6 +927,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f, avec *vsevdwl, int compress_idx, + int ii, int i, iarr js, bvec vmask_repulsive @@ -970,7 +968,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i( fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij; fvec vrij = sqrt(vrijsq); - int cnumneigh_i = cnumneigh[i]; + int cnumneigh_i = cnumneigh[ii]; int numneigh_i = numneigh[i]; ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j); ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j); @@ -1147,7 +1145,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( ) { int compress_idx = 0; int ii, jj; - iarr is, js; + iarr is, js, vc; avec vsevdwl = v::acc_zero(); ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4)); ivec vj, v_NEIGHMASK(NEIGHMASK); @@ -1166,7 +1164,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( flt_t y_i = x[i].y; flt_t z_i = x[i].z; int jlist_off_i = cnumneigh[ii]; - int jnum = numneigh[ii]; + int jnum = numneigh[i]; for (jj = 0; jj < jnum; jj++) { int j = firstneigh[jlist_off_i + jj] & NEIGHMASK; int w_j = x[j].w; @@ -1178,6 +1176,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( if (rsq < cutsq) { is[compress_idx] = ii; js[compress_idx] = j; + vc[compress_idx] = jlist_off_i; if (jj < numneighhalf[ii]) repulsive_flag[compress_idx] = 1; compress_idx += 1; @@ -1187,7 +1186,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); kernel_step<EFLAG>( eatom, vflag, - numneigh, cnumneigh, firstneigh, ntypes, + numneigh, vc, firstneigh, ntypes, x, c_inner, c_outer, f, &vsevdwl, compress_idx, is, js, vmask_repulsive @@ -1203,7 +1202,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( numneigh, cnumneigh, firstneigh, ntypes, x, c_inner, c_outer, f, &vsevdwl, compress_idx, - i, js, vmask_repulsive + ii, i, js, vmask_repulsive ); compress_idx = 0; v::int_clear_arr(repulsive_flag); @@ -1215,7 +1214,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel( vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0)); IntelKernelTersoff::kernel_step<EFLAG>( eatom, vflag, - numneigh, cnumneigh, firstneigh, ntypes, + numneigh, vc, firstneigh, ntypes, x, c_inner, c_outer, f, &vsevdwl, compress_idx, is, js, vmask_repulsive diff --git a/src/USER-INTEL/pppm_intel.cpp b/src/USER-INTEL/pppm_intel.cpp index e5540e637751bc758a347bcf00ee71aaf1c91e86..537a573f23598f30e7d570ce8542082d08dff4ac 100644 --- a/src/USER-INTEL/pppm_intel.cpp +++ b/src/USER-INTEL/pppm_intel.cpp @@ -581,6 +581,12 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers) else nthr = comm->nthreads; + if (fix->need_zero(0)) { + int zl = nlocal; + if (force->newton_pair) zl += atom->nghost; + memset(f, 0, zl * sizeof(FORCE_T)); + } + #if defined(_OPENMP) #pragma omp parallel default(none) \ shared(nlocal, nthr) if(!_use_lrt) @@ -726,6 +732,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers) FFT_SCALAR * _noalias const particle_eky = this->particle_eky; FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz; + if (fix->need_zero(0)) { + int zl = nlocal; + if (force->newton_pair) zl += atom->nghost; + memset(f, 0, zl * sizeof(FORCE_T)); + } + #if defined(_OPENMP) #pragma omp parallel default(none) \ shared(nlocal, nthr) if(!_use_lrt) diff --git a/src/memory.cpp b/src/memory.cpp index 7a23a230791c51f078b1195e7867dfb2d91c4739..971de3dce6cca6771d2cba400ef922eec2e6557d 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -79,7 +79,9 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name) #if defined(LMP_USE_TBB_ALLOCATOR) ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN); -#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) +#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \ + defined(__INTEL_COMPILER) + ptr = realloc(ptr, nbytes); uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN; if (offset) { diff --git a/src/pair.h b/src/pair.h index 844bc0cdc75c12983d98da16273b70c1257044a8..c1a9e6eef8df3c181a2284f0770ea602266a48cb 100644 --- a/src/pair.h +++ b/src/pair.h @@ -207,6 +207,10 @@ class Pair : protected Pointers { typedef union {int i; float f;} union_int_float_t; + // Accessor for the user-intel package to determine virial calc for hybrid + + inline int fdotr_is_set() const { return vflag_fdotr; } + protected: int vflag_fdotr; int maxeatom,maxvatom;