diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index b40bb7f94368b208061d59c2b10101ad2ae3af06..47a216ff6f1c6fa84eefec11a48ad79e3e697804 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -22,21 +22,21 @@ offset=tid & (t_per_atom-1); \ ii=fast_mul((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom)+tid/t_per_atom; -#define nbor_info(nbor_mem, packed_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - if (nbor_mem==packed_mem) { \ - nbor_begin+=nbor_stride+fast_mul(ii,t_per_atom-1); \ - stride=fast_mul(t_per_atom,nbor_stride); \ - nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & (t_per_atom-1)); \ +#define nbor_info(dev_nbor, dev_packed, nbor_pitch, t_per_atom, ii, offset, \ + i, numj, n_stride, nbor_end, nbor_begin) \ + i=dev_nbor[ii]; \ + nbor_begin=ii+nbor_pitch; \ + numj=dev_nbor[nbor_begin]; \ + if (dev_nbor==dev_packed) { \ + nbor_begin+=nbor_pitch+fast_mul(ii,t_per_atom-1); \ + n_stride=fast_mul(t_per_atom,nbor_pitch); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,n_stride)+(numj & (t_per_atom-1)); \ nbor_begin+=offset; \ } else { \ - nbor_begin+=nbor_stride; \ - nbor_begin=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_pitch; \ + nbor_begin=dev_nbor[nbor_begin]; \ nbor_end=nbor_begin+numj; \ - stride=t_per_atom; \ + n_stride=t_per_atom; \ nbor_begin+=offset; \ } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index f772e3629574942dfe5443e95213314801509349..aa77a48c662ac07c7b105f30e0097f15574c4eaa 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ using namespace LAMMPS_AL; extern Device<PRECISION,ACC_PRECISION> global_device; template <class numtyp, class acctyp> -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { device=&global_device; ans=new Answer<numtyp,acctyp>(); nbor=new Neighbor(); @@ -53,8 +53,8 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_two, const char *k_three_center, - const char *k_three_end) { + const char *two, const char *three_center, + const char *three_end, const char *short_nbor) { screen=_screen; int gpu_nbor=0; @@ -70,10 +70,10 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { + if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 nbor->packing(true); _nbor_data=&(nbor->dev_packed); - } else + } else // neigh yes or tpa == 1 _nbor_data=&(nbor->dev_nbor); if (_threads_per_atom*_threads_per_atom>device->warp_size()) return -10; @@ -97,7 +97,7 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); _block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,k_two,k_three_center,k_three_end); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -113,6 +113,11 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _max_an_bytes+=ans2->gpu_bytes(); #endif + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); + return 0; } @@ -136,6 +141,7 @@ void BaseThreeT::clear_atomic() { k_three_end.clear(); k_three_end_vatom.clear(); k_pair.clear(); + k_short_nbor.clear(); delete pair_program; _compiled=false; } @@ -143,6 +149,7 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); + dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -169,6 +176,8 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, if (!success) return NULL; + _nall = nall; + // originally the requirement that nall == nlist was enforced // to allow direct indexing neighbors of neighbors after re-arrangement // nbor->get_host3(nall,nlist,ilist,numj,firstneigh,block_size()); @@ -203,6 +212,8 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, return 0; atom->cast_copy_x(host_x,host_type); + _nall = nall; + int mn; nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); @@ -247,12 +258,22 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; + _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nlist; + int evatom=0; if (eatom || vatom) evatom=1; @@ -300,7 +321,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; @@ -313,6 +334,15 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); + // re-allocate dev_short_nbor if necessary + if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { + int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); + dev_short_nbor.resize((2+_max_nbors)*_nmax); + } + + // _ainum to be used in loop() for short neighbor list build + _ainum = nall; + int evatom=0; if (eatom || vatom) evatom=1; @@ -339,19 +369,20 @@ double BaseThreeT::host_memory_usage_atomic() const { template <class numtyp, class acctyp> void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *ktwo, const char *kthree_center, - const char *kthree_end) { + const char *two, const char *three_center, + const char *three_end, const char* short_nbor) { if (_compiled) return; - std::string vatom_name=std::string(kthree_end)+"_vatom"; + std::string vatom_name=std::string(three_end)+"_vatom"; pair_program=new UCL_Program(dev); pair_program->load_string(pair_str,device->compile_string().c_str()); - k_three_center.set_function(*pair_program,kthree_center); - k_three_end.set_function(*pair_program,kthree_end); + k_three_center.set_function(*pair_program,three_center); + k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); - k_pair.set_function(*pair_program,ktwo); + k_pair.set_function(*pair_program,two); + k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); #ifdef THREE_CONCURRENT diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 4f27ecdf92a5014d58986f0718fab9fa3a34df50..f5f36863c4bcc4f3867bf75fa66819c916c5c2fd 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -56,7 +56,8 @@ class BaseThree { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, - const char *k_three_center, const char *k_three_end); + const char *k_three_center, const char *k_three_end, + const char *k_short_nbor=NULL); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -73,18 +74,18 @@ class BaseThree { } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device - * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + /** \param inum number of particles whose nbors must be stored on device + * \param max_nbors maximum number of neighbors + * \param success set to false if insufficient memory * \note olist_size=total number of local particles **/ inline void resize_local(const int inum, const int max_nbors, bool &success) { nbor->resize(inum,max_nbors,success); } /// Check if there is enough storage for neighbors and realloc if not - /** \param nlocal number of particles whose nbors must be stored on device + /** \param inum number of particles whose nbors must be stored on device * \param host_inum number of particles whose nbors need to copied to host - * \param current maximum number of neighbors + * \param max_nbors current maximum number of neighbors * \note host_inum is 0 if the host is performing neighboring * \note nlocal+host_inum=total number local particles * \note olist_size=0 **/ @@ -143,14 +144,6 @@ class BaseThree { const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); - /// Pair loop with device neighboring - int * compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); - /// Pair loop with device neighboring int ** compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -193,6 +186,9 @@ class BaseThree { /// Neighbor data Neighbor *nbor; + UCL_D_Vec<int> dev_short_nbor; + UCL_Kernel k_short_nbor; + // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *pair_program; UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom; @@ -207,12 +203,13 @@ class BaseThree { int _block_pair, _block_size, _threads_per_atom, _end_command_queue; int _gpu_nbor; double _max_bytes, _max_an_bytes; + int _max_nbors, _ainum, _nall; double _gpu_overhead, _driver_overhead; UCL_D_Vec<int> *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, - const char *k_two, const char *k_three_center, - const char *k_three_end); + const char *two, const char *three_center, + const char *three_end, const char* short_nbor); virtual void loop(const bool _eflag, const bool _vflag, const int evatom) = 0; diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 3492d7030efaf6d3878ade630ca68cabe4b77cb7..24984e48785d665c63ad9cae54dc45812846a5f2 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -55,7 +55,7 @@ int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_ int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end"); + "k_sw_three_end","k_sw_short_nbor"); if (success!=0) return success; @@ -193,19 +193,30 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); - + this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -217,6 +228,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -231,7 +243,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -240,7 +252,7 @@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 46330c59e49faaa81168d555adb9b07d5ba32d7c..a5c9f49d08f9b7bf4e4383f3e132324c8e5c02aa 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -130,6 +130,63 @@ texture<int4> sw3_tex; #endif +__kernel void k_sw_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict sw3, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii<inum) { + int nbor, nbor_end; + int i, numj; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; + + int ncount = 0; + int m = nbor; + dev_short_nbor[m] = 0; + int nbor_short = nbor+n_stride; + + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=dev_packed[nbor]; + int nj = j; + j &= NEIGHMASK; + + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<sw3[ijparam].y) { // sw_cutsq = sw3[ijparam].y + dev_short_nbor[nbor_short] = nj; + nbor_short += n_stride; + ncount++; + } + } // for nbor + + // store the number of neighbors for each thread + dev_short_nbor[m] = ncount; + + } // if ii +} __kernel void k_sw(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict sw1, @@ -140,6 +197,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, @@ -158,8 +216,8 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor, nbor_end; - int i, numj; + int nbor, nbor_end, i, numj; + const int* nbor_mem = dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -167,9 +225,17 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor<nbor_end; nbor+=n_stride) { - int j=dev_packed[nbor]; + int j=nbor_mem[nbor]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -337,6 +403,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -361,7 +428,7 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -371,9 +438,18 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -395,14 +471,23 @@ __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; sw_cut_ij=sw3_ijparam.x; - int nbor_k=nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) - nbor_k+=n_stride; + int nbor_k,k_end; + if (dev_packed==dev_nbor) { + nbor_k=nborj_start-offset_j+offset_k; + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } else { + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; + k_end = nbor_end; + } - for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + for ( ; nbor_k<k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; + if (dev_packed==dev_nbor && k <= j) continue; + numtyp4 kx; fetch4(kx,k,pos_tex); int ktype=kx.w; ktype=map[ktype]; @@ -460,6 +545,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -484,7 +570,7 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -494,8 +580,16 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -534,8 +628,15 @@ __kernel void k_sw_three_end(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -598,6 +699,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -622,7 +724,7 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -632,8 +734,16 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -672,8 +782,15 @@ __kernel void k_sw_three_end_vatom(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index 6b0b563d9f3e8e40dae250ddc5f88f05457e3dee..a63d286d9c16a9120ee9e300f16d9fda4e5ecdfc 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -55,7 +55,8 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", - "k_tersoff_three_center", "k_tersoff_three_end"); + "k_tersoff_three_center", "k_tersoff_three_end", + "k_tersoff_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i<nparams; i++) + double cutsqmax = 0.0; + for (int i=0; i<nparams; i++) { cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast<numtyp>(cutsqmax); + UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -220,191 +226,57 @@ double TersoffT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom // --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. +// Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> -void TersoffT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } +void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { + // Compute the block size and grid size to keep all cores busy + int BX=this->block_pair(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; - int _eflag; - if (eflag) - _eflag=1; + if (_vflag) + vflag=1; else - _eflag=0; + vflag=0; - int ainum=nlist; + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -int ** TersoffT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } + (BX/this->_threads_per_atom))); - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); this->k_zeta.set_size(GX,BX); this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b7d48d9e344220910f9456eb0e7a0ed008b6ce93..cdeb5679d8421544ba07b2a880525d062196c231 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -106,7 +106,7 @@ texture<int4> ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture<int4> ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture<int4> ts5_tex; #endif +__kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii<inum) { + int nbor, nbor_end; + int i, numj; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; + + int ncount = 0; + int m = nbor; + dev_short_nbor[m] = 0; + int nbor_short = nbor+n_stride; + + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=dev_packed[nbor]; + int nj = j; + j &= NEIGHMASK; + + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cutsq[ijparam]) { + dev_short_nbor[nbor_short] = nj; + nbor_short += n_stride; + ncount++; + } + } // for nbor + + // store the number of neighbors for each thread + dev_short_nbor[m] = ncount; + + } // if ii +} + // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries // while the block size should never be less than 32. // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block @@ -184,6 +243,7 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; @@ -211,22 +271,29 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor_j, nbor_end; - int i, numj; - + int nbor_j, nbor_end, i, numj; + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -241,14 +308,20 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, delr1.z = jx.z-ix.z; numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z; - if (rsq1 > cutsq[ijparam]) continue; +// if (rsq1 > cutsq[ijparam]) continue; // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -284,10 +357,12 @@ __kernel void k_tersoff_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -330,6 +405,7 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -356,8 +432,8 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor, nbor_end; - int i, numj; + int nbor, nbor_end, i, numj; + const int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -365,9 +441,17 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor<nbor_end; nbor+=n_stride) { - int j=dev_packed[nbor]; + int j=nbor_mem[nbor]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -382,32 +466,31 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq<cutsq[ijparam]) { - numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - - repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, - rsq, eflag, feng); - - numtyp force = feng[0]; - f.x+=delx*force; - f.y+=dely*force; - f.z+=delz*force; - - if (eflag>0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + // rsq<cutsq[ijparam] + numtyp feng[2]; + numtyp ijparam_lam1 = ts1[ijparam].x; + numtyp4 ts2_ijparam = ts2[ijparam]; + numtyp ijparam_biga = ts2_ijparam.x; + numtyp ijparam_bigr = ts2_ijparam.z; + numtyp ijparam_bigd = ts2_ijparam.w; + + repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, + rsq, eflag, feng); + + numtyp force = feng[0]; + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -428,6 +511,7 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -461,20 +545,28 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -489,7 +581,6 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -497,9 +588,11 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -520,9 +613,15 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int nbor_k = nborj_start-offset_j+offset_k; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k<k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (j == k) continue; @@ -598,6 +697,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -632,7 +732,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -643,9 +743,18 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -660,8 +769,6 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -683,13 +790,20 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -711,9 +825,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -736,7 +852,7 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -777,9 +893,11 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -824,6 +942,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -858,7 +977,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -869,9 +988,18 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -886,8 +1014,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -909,13 +1035,20 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -937,9 +1070,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -962,7 +1097,7 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -1010,9 +1145,11 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; @@ -1040,7 +1177,6 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[3] += TWOTHIRD*(delr2[0]*fj[1] + mdelr1[0]*fk[1]); virial[4] += TWOTHIRD*(delr2[0]*fj[2] + mdelr1[0]*fk[2]); virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); - } } // for nbor diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index c72ebd72866d0069065a61f1fabfb749b328c042..fd01af031a7b0865be81709b3d09ce4e3666bb61 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -47,21 +47,6 @@ class Tersoff : public BaseThree<numtyp, acctyp> { const double* h, const double* gamma, const double* beta, const double* powern, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class Tersoff : public BaseThree<numtyp, acctyp> { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 553dad3583e345ca317d35a4bd0343be1aa50617..c37c07f1a1234d1945cfefdac45d42f9be446458 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -55,7 +55,8 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end"); + "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor"); if (success!=0) return success; @@ -157,11 +158,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i<nparams; i++) + double cutsqmax = 0.0; + for (int i=0; i<nparams; i++) { cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast<numtyp>(cutsqmax); + UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -220,191 +226,57 @@ double TersoffMT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom // --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. +// Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> -void TersoffMT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } +void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { + // Compute the block size and grid size to keep all cores busy + int BX=this->block_pair(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; - int _eflag; - if (eflag) - _eflag=1; + if (_vflag) + vflag=1; else - _eflag=0; + vflag=0; - int ainum=nlist; + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -int ** TersoffMT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } + (BX/this->_threads_per_atom))); - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); this->k_zeta.set_size(GX,BX); this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -412,6 +284,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -423,6 +296,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -437,7 +311,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -446,7 +320,7 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 3a81b369415174bd5399bf879d6872710c19f426..576359b5146246d74078b0deff98a700bbdcb533 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -106,7 +106,7 @@ texture<int4> ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -155,7 +155,7 @@ texture<int4> ts5_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -164,6 +164,65 @@ texture<int4> ts5_tex; #endif +__kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii<inum) { + int nbor, nbor_end; + int i, numj; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; + + int ncount = 0; + int m = nbor; + dev_short_nbor[m] = 0; + int nbor_short = nbor+n_stride; + + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=dev_packed[nbor]; + int nj = j; + j &= NEIGHMASK; + + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cutsq[ijparam]) { + dev_short_nbor[nbor_short] = nj; + nbor_short += n_stride; + ncount++; + } + } // for nbor + + // store the number of neighbors for each thread + dev_short_nbor[m] = ncount; + + } // if ii +} + // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries // while the block size should never be less than 32. // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block @@ -184,6 +243,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; @@ -211,22 +271,29 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor_j, nbor_end; - int i, numj; - + int nbor_j, nbor_end, i, numj; + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -241,14 +308,18 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, delr1.z = jx.z-ix.z; numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z; - if (rsq1 > cutsq[ijparam]) continue; - // compute zeta_ij - z = (numtyp)0; + z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -287,10 +358,12 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -331,6 +404,7 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -357,8 +431,8 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor, nbor_end; - int i, numj; + int nbor, nbor_end, i, numj; + const int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -366,9 +440,17 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor<nbor_end; nbor+=n_stride) { - int j=dev_packed[nbor]; + int j=nbor_mem[nbor]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -383,32 +465,31 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq<cutsq[ijparam]) { - numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - - repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, - rsq, eflag, feng); - - numtyp force = feng[0]; - f.x+=delx*force; - f.y+=dely*force; - f.z+=delz*force; - - if (eflag>0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + // rsq<cutsq[ijparam] + numtyp feng[2]; + numtyp ijparam_lam1 = ts1[ijparam].x; + numtyp4 ts2_ijparam = ts2[ijparam]; + numtyp ijparam_biga = ts2_ijparam.x; + numtyp ijparam_bigr = ts2_ijparam.z; + numtyp ijparam_bigd = ts2_ijparam.w; + + repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, + rsq, eflag, feng); + + numtyp force = feng[0]; + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -430,6 +511,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -465,20 +547,28 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -493,7 +583,6 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -501,9 +590,11 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -524,9 +615,15 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int nbor_k = nborj_start-offset_j+offset_k; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k<k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (j == k) continue; @@ -606,6 +703,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -642,7 +740,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -653,9 +751,18 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -670,8 +777,6 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -693,13 +798,20 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -721,9 +833,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -746,7 +860,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -790,9 +904,11 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -841,6 +957,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -877,7 +994,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -888,9 +1005,18 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -905,8 +1031,6 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -928,13 +1052,20 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -956,9 +1087,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -981,7 +1114,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -1032,9 +1165,11 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h index 9a05c6600936d40e051b135b38b654937fded536..ab1560d9518958e4f5d7c7d9e2f7363f8c4c1636 100644 --- a/lib/gpu/lal_tersoff_mod.h +++ b/lib/gpu/lal_tersoff_mod.h @@ -47,21 +47,6 @@ class TersoffMod : public BaseThree<numtyp, acctyp> { const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -104,8 +89,7 @@ class TersoffMod : public BaseThree<numtyp, acctyp> { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - - int _max_nbors; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 9cce8a802df1cd7ec39b680930b9d024d45f4e2b..341f663030f6330ec0ffbfd97526a24e5e59e87a 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -62,7 +62,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end"); + "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor"); if (success!=0) return success; @@ -177,11 +178,16 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device), UCL_WRITE_ONLY); - for (int i=0; i<nparams; i++) + double cutsqmax = 0.0; + for (int i=0; i<nparams; i++) { cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]); + if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; + } cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); ucl_copy(cutsq,cutsq_view,false); + _cutshortsq = static_cast<numtyp>(cutsqmax); + UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -245,191 +251,57 @@ double TersoffZT::host_memory_usage() const { #define KTHREADS this->_threads_per_atom #define JTHREADS this->_threads_per_atom // --------------------------------------------------------------------------- -// Copy nbor list from host if necessary and then calculate forces, virials,.. +// Calculate energies, forces, and torques // --------------------------------------------------------------------------- template <class numtyp, class acctyp> -void TersoffZT::compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { - this->acc_timers(); - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return; - } - - int ago=this->hd_balancer.ago_first(f_ago); - int inum=this->hd_balancer.balance(ago,inum_full,cpu_time); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - if (ago==0) { - this->reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); - if (!success) - return; - _max_nbors = this->nbor->max_nbor_loop(nlist,numj,ilist); - } - - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - - // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { - int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); - } +void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { + // Compute the block size and grid size to keep all cores busy + int BX=this->block_pair(); + int eflag, vflag; + if (_eflag) + eflag=1; + else + eflag=0; - int _eflag; - if (eflag) - _eflag=1; + if (_vflag) + vflag=1; else - _eflag=0; + vflag=0; - int ainum=nlist; + // build the short neighbor list + int ainum=this->_ainum; int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); -} - -// --------------------------------------------------------------------------- -// Reneighbor on GPU if necessary and then compute forces, virials, energies -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -int ** TersoffZT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { - this->acc_timers(); - - if (inum_full==0) { - host_start=0; - // Make sure textures are correct if realloc by a different hybrid style - this->resize_atom(0,nall,success); - this->zero_timers(); - return NULL; - } + (BX/this->_threads_per_atom))); - this->hd_balancer.balance(cpu_time); - int inum=this->hd_balancer.get_gpu_count(ago,inum_full); - this->ans->inum(inum); - #ifdef THREE_CONCURRENT - this->ans2->inum(inum); - #endif - host_start=inum; - - // Build neighbor list on GPU if necessary - if (ago==0) { - _max_nbors = this->build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - sublo, subhi, tag, nspecial, special, success); - if (!success) - return NULL; - this->hd_balancer.start_timer(); - } else { - this->atom->cast_x_data(host_x,host_type); - this->hd_balancer.start_timer(); - this->atom->add_x_data(host_x,host_type); - } - *ilist=this->nbor->host_ilist.begin(); - *jnum=this->nbor->host_acc.begin(); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); // re-allocate zetaij if necessary - if (nall*_max_nbors > _zetaij.cols()) { + int nall = this->_nall; + if (nall*this->_max_nbors > _zetaij.cols()) { int _nmax=static_cast<int>(static_cast<double>(nall)*1.10); - _zetaij.resize(_max_nbors*_nmax); + _zetaij.resize(this->_max_nbors*_nmax); } - int _eflag; - if (eflag) - _eflag=1; - else - _eflag=0; - - int ainum=nall; - int nbor_pitch=this->nbor->nbor_pitch(); - int BX=this->block_pair(); - int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); this->k_zeta.set_size(GX,BX); this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_eflag, &ainum, &nbor_pitch, &this->_threads_per_atom); - - int evatom=0; - if (eatom || vatom) - evatom=1; - #ifdef THREE_CONCURRENT - this->ucl_device->sync(); - #endif - loop(eflag,vflag,evatom); - this->ans->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans); - #ifdef THREE_CONCURRENT - this->ans2->copy_answers(eflag,vflag,eatom,vatom); - this->device->add_ans_object(this->ans2); - #endif - this->hd_balancer.stop_timer(); - - return this->nbor->host_jlist.begin()-host_start; -} + &this->dev_short_nbor, + &_eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); -// --------------------------------------------------------------------------- -// Calculate energies, forces, and torques -// --------------------------------------------------------------------------- -template <class numtyp, class acctyp> -void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ (BX/this->_threads_per_atom))); this->time_pair.start(); @@ -438,6 +310,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, &map, &elem2param, &_nelements, &_nparams, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -449,6 +322,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); @@ -463,7 +337,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); @@ -472,7 +346,7 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, &map, &elem2param, &_nelements, &_nparams, &_zetaij, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index 9509b9802c70cdea3ee6ef2202bb59fe00694371..e8bb017f5901a9184b80403d8814aab50794500c 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -109,7 +109,7 @@ texture<int4> ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ @@ -158,7 +158,7 @@ texture<int4> ts6_tex; ans[ii]=old; \ } -#define store_zeta(z, tid, t_per_atom, offset) \ +#define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ z += shfl_xor(z, s, t_per_atom); \ @@ -167,6 +167,65 @@ texture<int4> ts6_tex; #endif +__kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp *restrict cutsq, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii<inum) { + int nbor, nbor_end; + int i, numj; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; + + int ncount = 0; + int m = nbor; + dev_short_nbor[m] = 0; + int nbor_short = nbor+n_stride; + + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=dev_packed[nbor]; + int nj = j; + j &= NEIGHMASK; + + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<cutsq[ijparam]) { + dev_short_nbor[nbor_short] = nj; + nbor_short += n_stride; + ncount++; + } + } // for nbor + + // store the number of neighbors for each thread + dev_short_nbor[m] = ncount; + + } // if ii +} + // Tersoff is currently used for 3 elements at most: 3*3*3 = 27 entries // while the block size should never be less than 32. // SHARED_SIZE = 32 for now to reduce the pressure on the shared memory per block @@ -188,6 +247,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __global acctyp4 * zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, const int eflag, const int inum, const int nbor_pitch, const int t_per_atom) { __local int tpa_sq,n_stride; @@ -217,22 +277,29 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor_j, nbor_end; - int i, numj; - + int nbor_j, nbor_end, i, numj; + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -247,14 +314,18 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, delr1.z = jx.z-ix.z; numtyp rsq1 = delr1.x*delr1.x+delr1.y*delr1.y+delr1.z*delr1.z; - if (rsq1 > cutsq[ijparam]) continue; - // compute zeta_ij z = (acctyp)0; int nbor_k = nborj_start-offset_j+offset_k; - for ( ; nbor_k < nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k < k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == j) continue; @@ -290,10 +361,12 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); - store_zeta(z, tid, t_per_atom, offset_k); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); + acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); numtyp ijparam_lam2 = ts1_ijparam.y; @@ -342,6 +415,7 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const int nelements, const int nparams, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -370,8 +444,8 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor, nbor_end; - int i, numj; + int nbor, nbor_end, i, numj; + const int* nbor_mem=dev_packed; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -379,9 +453,17 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor]; + nbor += n_stride; + nbor_end = nbor+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor<nbor_end; nbor+=n_stride) { - int j=dev_packed[nbor]; + int j=nbor_mem[nbor]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -396,38 +478,37 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, numtyp delz = ix.z-jx.z; numtyp rsq = delx*delx+dely*dely+delz*delz; - if (rsq<cutsq[ijparam]) { - numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - numtyp4 ts6_ijparam = ts6[ijparam]; - numtyp ijparam_Z_i = ts6_ijparam.x; - numtyp ijparam_Z_j = ts6_ijparam.y; - numtyp ijparam_ZBLcut = ts6_ijparam.z; - numtyp ijparam_ZBLexpscale = ts6_ijparam.w; - - repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, - ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale, - global_e, global_a_0, global_epsilon_0, rsq, eflag, feng); - - numtyp force = feng[0]; - f.x+=delx*force; - f.y+=dely*force; - f.z+=delz*force; - - if (eflag>0) - energy+=feng[1]; - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + // rsq<cutsq[ijparam] + numtyp feng[2]; + numtyp ijparam_lam1 = ts1[ijparam].x; + numtyp4 ts2_ijparam = ts2[ijparam]; + numtyp ijparam_biga = ts2_ijparam.x; + numtyp ijparam_bigr = ts2_ijparam.z; + numtyp ijparam_bigd = ts2_ijparam.w; + numtyp4 ts6_ijparam = ts6[ijparam]; + numtyp ijparam_Z_i = ts6_ijparam.x; + numtyp ijparam_Z_j = ts6_ijparam.y; + numtyp ijparam_ZBLcut = ts6_ijparam.z; + numtyp ijparam_ZBLexpscale = ts6_ijparam.w; + + repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, + ijparam_Z_i, ijparam_Z_j, ijparam_ZBLcut, ijparam_ZBLexpscale, + global_e, global_a_0, global_epsilon_0, rsq, eflag, feng); + + numtyp force = feng[0]; + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (eflag>0) + energy+=feng[1]; + if (vflag>0) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor @@ -448,6 +529,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -481,20 +563,28 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); int offset_k=tid & (t_per_atom-1); - int nborj_start = nbor_j; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -509,7 +599,6 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); @@ -517,9 +606,11 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, //int jj = (nbor_j-offset_j-2*nbor_pitch) / n_stride; //int idx = jj*n_stride + i*t_per_atom + offset_j; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - i, nbor_j, offset_j, idx); + //idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor + int idx = nbor_j; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// i, nbor_j, offset_j, idx); acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; @@ -540,9 +631,15 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, virial[5] += delr1[1]*delr1[2]*mforce; } - int nbor_k=nborj_start-offset_j+offset_k; - for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int nbor_k = nborj_start-offset_j+offset_k; + int k_end = nbor_end; + if (dev_packed==dev_nbor) { + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } + + for ( ; nbor_k<k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (j == k) continue; @@ -618,6 +715,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -652,7 +750,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem=dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -663,9 +761,18 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -680,8 +787,6 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -703,13 +808,20 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji: find i in the j's neighbor list int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -731,9 +843,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -756,7 +870,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -797,9 +911,11 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -844,6 +960,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -878,7 +995,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -889,9 +1006,18 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, itype=map[itype]; numtyp tpainv = ucl_recip((numtyp)t_per_atom); + + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -906,8 +1032,6 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, delr1[2] = jx.z-ix.z; numtyp rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2]; - if (rsq1 > cutsq[ijparam]) continue; - numtyp mdelr1[3]; mdelr1[0] = -delr1[0]; mdelr1[1] = -delr1[1]; @@ -929,13 +1053,20 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, k_end=nbor_k+numk; nbor_k+=offset_k; } + + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } int nbork_start = nbor_k; // look up for zeta_ji int m = tid / t_per_atom; int ijnum = -1; for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) { ijnum = nbor_k; @@ -957,9 +1088,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, //int iix = (ijnum - offset_kf - 2*nbor_pitch) / n_stride; //int idx = iix*n_stride + j*t_per_atom + offset_kf; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, ijnum, offset_kf, idx); + //idx to zetaij is shifted by n_stride relative to ijnum in dev_short_nbor + int idx = ijnum; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, ijnum, offset_kf, idx); acctyp4 zeta_ji = zetaij[idx]; // fetch(zeta_ji,idx,zeta_tex); numtyp force = zeta_ji.x*tpainv; numtyp prefactor_ji = zeta_ji.y; @@ -982,7 +1115,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -1030,9 +1163,11 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, //int kk = (nbor_k - offset_k - 2*nbor_pitch) / n_stride; //int idx = kk*n_stride + j*t_per_atom + offset_k; - int idx; - zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, - j, nbor_k, offset_k, idx); + //idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor + int idx = nbor_k; + if (dev_packed==dev_nbor) idx -= n_stride; +// zeta_idx(dev_nbor,dev_packed, nbor_pitch, n_stride, t_per_atom, +// j, nbor_k, offset_k, idx); acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); numtyp prefactor_jk = zeta_jk.y; diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h index cc0b848845938f7dfca6f1b353d4f8604f5a4d58..0e6cac958722a51b2c0f672891bb5364cd464686 100644 --- a/lib/gpu/lal_tersoff_zbl.h +++ b/lib/gpu/lal_tersoff_zbl.h @@ -49,21 +49,6 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> { const double* ZBLcut, const double* ZBLexpscale, const double global_e, const double global_a_0, const double global_epsilon_0, const double* cutsq); - /// Pair loop with host neighboring - void compute(const int f_ago, const int inum_full, const int nall, - const int nlist, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success); - - /// Pair loop with device neighboring - int ** compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **numj, const double cpu_time, bool &success); - /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ void clear(); @@ -109,8 +94,8 @@ class TersoffZBL : public BaseThree<numtyp, acctyp> { UCL_Kernel k_zeta; UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; - int _max_nbors; numtyp _global_e,_global_a_0,_global_epsilon_0; + numtyp _cutshortsq; private: bool _allocated; diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 96537e65d30bb3eecc7fea2390012f92fd9e44cf..d03ac992bd468ea0677a6b537468016f56b7b3e0 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ -59,7 +59,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,vashishta,"k_vashishta","k_vashishta_three_center", - "k_vashishta_three_end"); + "k_vashishta_three_end","k_vashishta_short_nbor"); if (success!=0) return success; @@ -128,15 +128,18 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i param4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); + double r0sqmax = 0; for (int i=0; i<nparams; i++) { - double r0sq = r0[i]*r0[i]-1e-4; // TODO: should we have the 1e-4? - + double r0sq = r0[i]*r0[i]; // TODO: should we have the 1e-4? + if (r0sqmax < r0sq) r0sqmax = r0sq; dview[i].x=static_cast<numtyp>(r0sq); dview[i].y=static_cast<numtyp>(gamma[i]); dview[i].z=static_cast<numtyp>(cutsq[i]); dview[i].w=static_cast<numtyp>(r0[i]); } + _cutshortsq = static_cast<numtyp>(r0sqmax); + ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -223,15 +226,28 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { else vflag=0; - int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + // build the short neighbor list + int ainum=this->_ainum; + int nbor_pitch=this->nbor->nbor_pitch(); + int GX=static_cast<int>(ceil(static_cast<double>(ainum)/ (BX/this->_threads_per_atom))); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, + &elem2param, &_nelements, &_nparams, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &ainum, + &nbor_pitch, &this->_threads_per_atom); + // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - int ainum=this->ans->inum(); - int nbor_pitch=this->nbor->nbor_pitch(); + ainum=this->ans->inum(); + nbor_pitch=this->nbor->nbor_pitch(); + GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/ + (BX/this->_threads_per_atom))); this->time_pair.start(); + // note that k_pair does not run with the short neighbor list this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, @@ -248,6 +264,7 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->dev_short_nbor, &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer<numtyp,acctyp> *end_ans; @@ -257,21 +274,19 @@ void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { end_ans=this->ans; #endif if (evatom!=0) { - this->k_three_end_vatom.set_size(GX,BX); this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, &map, &elem2param, &_nelements, &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_acc, + &this->nbor->dev_acc, &this->dev_short_nbor, &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); } diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index caa3c036134dbae3fd96429dff6eeaff1d27081c..fa7f413aa532bf5e05aa28f4750e7be59e223063 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -136,6 +136,64 @@ texture<int4> param5_tex; #endif +__kernel void k_vashishta_short_nbor(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict param4, + const __global int *restrict map, + const __global int *restrict elem2param, + const int nelements, const int nparams, + const __global int * dev_nbor, + const __global int * dev_packed, + __global int * dev_short_nbor, + const int inum, const int nbor_pitch, + const int t_per_atom) { + __local int n_stride; + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + if (ii<inum) { + int nbor, nbor_end; + int i, numj; + nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, + n_stride,nbor_end,nbor); + + numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; + int itype=ix.w; + itype=map[itype]; + + int ncount = 0; + int m = nbor; + dev_short_nbor[m] = 0; + int nbor_short = nbor+n_stride; + + for ( ; nbor<nbor_end; nbor+=n_stride) { + + int j=dev_packed[nbor]; + int nj = j; + j &= NEIGHMASK; + + numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; + int jtype=jx.w; + jtype=map[jtype]; + int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; + + // Compute r12 + numtyp delx = ix.x-jx.x; + numtyp dely = ix.y-jx.y; + numtyp delz = ix.z-jx.z; + numtyp rsq = delx*delx+dely*dely+delz*delz; + + if (rsq<param4[ijparam].x) { //param4[ijparam].x = r0sq; //param4[ijparam].z=cutsq + dev_short_nbor[nbor_short] = nj; + nbor_short += n_stride; + ncount++; + } + } // for nbor + + // store the number of neighbors for each thread + dev_short_nbor[m] = ncount; + + } // if ii +} __kernel void k_vashishta(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict param1, @@ -166,8 +224,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, __syncthreads(); if (ii<inum) { - int nbor, nbor_end; - int i, numj; + int nbor, nbor_end, i, numj; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj, n_stride,nbor_end,nbor); @@ -211,7 +268,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp param3_dvrc=param3_ijparam.z; numtyp param3_c0 =param3_ijparam.w; - numtyp r=sqrt(rsq); + numtyp r=ucl_sqrt(rsq); numtyp rinvsq=1.0/rsq; numtyp r4inv = rinvsq*rinvsq; numtyp r6inv = rinvsq*r4inv; @@ -219,8 +276,8 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp reta = pow(r,-param1_eta); numtyp lam1r = r*param1_lam1inv; numtyp lam4r = r*param1_lam4inv; - numtyp vc2 = param1_zizj * exp(-lam1r)/r; - numtyp vc3 = param2_mbigd * r4inv*exp(-lam4r); + numtyp vc2 = param1_zizj * ucl_exp(-lam1r)/r; + numtyp vc3 = param2_mbigd * r4inv*ucl_exp(-lam4r); numtyp force = (param2_dvrc*r - (4.0*vc3 + lam4r*vc3+param2_big6w*r6inv @@ -230,6 +287,7 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; + if (eflag>0) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); @@ -255,31 +313,31 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rinvsq2 = ucl_recip(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp gsrainvsq2 = gsrainv2*rainv2/r2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -311,28 +369,28 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, numtyp r1 = ucl_sqrt(rsq1); \ numtyp rinvsq1 = ucl_recip(rsq1); \ numtyp rainv1 = ucl_recip(r1 - param_r0_ij); \ - numtyp gsrainv1 = param_gamma_ij * rainv1; \ + numtyp gsrainv1 = param_gamma_ij * rainv1; \ numtyp gsrainvsq1 = gsrainv1*rainv1/r1; \ numtyp expgsrainv1 = ucl_exp(gsrainv1); \ \ numtyp r2 = ucl_sqrt(rsq2); \ numtyp rainv2 = ucl_recip(r2 - param_r0_ik); \ - numtyp gsrainv2 = param_gamma_ik * rainv2; \ + numtyp gsrainv2 = param_gamma_ik * rainv2; \ numtyp expgsrainv2 = ucl_exp(gsrainv2); \ \ numtyp rinv12 = ucl_recip(r1*r2); \ numtyp cs = (delr1x*delr2x + delr1y*delr2y + delr1z*delr2z) * rinv12; \ - numtyp delcs = cs - param_costheta_ijk; \ + numtyp delcs = cs - param_costheta_ijk; \ numtyp delcssq = delcs*delcs; \ - numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ + numtyp pcsinv = param_bigc_ijk*delcssq+1.0; \ numtyp pcsinvsq = pcsinv*pcsinv; \ numtyp pcs = delcssq/pcsinv; \ \ numtyp facexp = expgsrainv1*expgsrainv2; \ \ - numtyp facrad = param_bigb_ijk * facexp*pcs; \ + numtyp facrad = param_bigb_ijk * facexp*pcs; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ + numtyp facang = param_big2b_ijk * facexp*delcs/pcsinvsq; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -353,6 +411,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, const int nelements, const __global int * dev_nbor, const __global int * dev_packed, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -377,7 +436,7 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -387,9 +446,18 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + int nborj_start = nbor_j; + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -406,18 +474,27 @@ __kernel void k_vashishta_three_center(const __global numtyp4 *restrict x_, numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex); param_r0sq_ij=param4_ijparam.x; - if (rsq1 > param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; - int nbor_k=nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) - nbor_k+=n_stride; + int nbor_k,k_end; + if (dev_packed==dev_nbor) { + nbor_k=nborj_start-offset_j+offset_k; + int numk = dev_short_nbor[nbor_k-n_stride]; + k_end = nbor_k+fast_mul(numk,n_stride); + } else { + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; + k_end = nbor_end; + } - for ( ; nbor_k<nbor_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + for ( ; nbor_k<k_end; nbor_k+=n_stride) { + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; + if (dev_packed==dev_nbor && k <= j) continue; + numtyp4 kx; fetch4(kx,k,pos_tex); int ktype=kx.w; ktype=map[ktype]; @@ -478,6 +555,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -502,7 +580,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -512,8 +590,16 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -529,7 +615,7 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex); param_r0sq_ij = param4_ijparam.x; - if (rsq1 > param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij = param4_ijparam.w; @@ -551,8 +637,15 @@ __kernel void k_vashishta_three_end(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; @@ -617,6 +710,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, const __global int * dev_nbor, const __global int * dev_packed, const __global int * dev_acc, + const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, @@ -641,7 +735,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, if (ii<inum) { int i, numj, nbor_j, nbor_end, k_end; - + const int* nbor_mem = dev_packed; int offset_j=offset/t_per_atom; nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset_j,i,numj, n_stride,nbor_end,nbor_j); @@ -651,8 +745,16 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, int itype=ix.w; itype=map[itype]; + // recalculate numj and nbor_end for use of the short nbor list + if (dev_packed==dev_nbor) { + numj = dev_short_nbor[nbor_j]; + nbor_j += n_stride; + nbor_end = nbor_j+fast_mul(numj,n_stride); + nbor_mem = dev_short_nbor; + } + for ( ; nbor_j<nbor_end; nbor_j+=n_stride) { - int j=dev_packed[nbor_j]; + int j=nbor_mem[nbor_j]; j &= NEIGHMASK; numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j]; @@ -668,7 +770,7 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, int ijparam=elem2param[itype*nelements*nelements+jtype*nelements+jtype]; numtyp4 param4_ijparam; fetch4(param4_ijparam,ijparam,param4_tex); param_r0sq_ij=param4_ijparam.x; - if (rsq1 > param_r0sq_ij) continue; + if (rsq1 > param_r0sq_ij) continue; // still keep this for neigh no and tpa > 1 param_gamma_ij=param4_ijparam.y; param_r0_ij=param4_ijparam.w; @@ -690,8 +792,15 @@ __kernel void k_vashishta_three_end_vatom(const __global numtyp4 *restrict x_, nbor_k+=offset_k; } + // recalculate numk and k_end for the use of short neighbor list + if (dev_packed==dev_nbor) { + numk = dev_short_nbor[nbor_k]; + nbor_k += n_stride; + k_end = nbor_k+fast_mul(numk,n_stride); + } + for ( ; nbor_k<k_end; nbor_k+=n_stride) { - int k=dev_packed[nbor_k]; + int k=nbor_mem[nbor_k]; k &= NEIGHMASK; if (k == i) continue; diff --git a/lib/gpu/lal_vashishta.h b/lib/gpu/lal_vashishta.h index 6eea8362cc2b2cd299d459a6364b9b471c83b9b2..c5c96a5b805e7e4536e0fbb2951f36e8818461e9 100644 --- a/lib/gpu/lal_vashishta.h +++ b/lib/gpu/lal_vashishta.h @@ -82,6 +82,7 @@ class Vashishta : public BaseThree<numtyp, acctyp> { UCL_D_Vec<int> elem2param; UCL_D_Vec<int> map; int _nparams,_nelements; + numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 22ec8dde3bf08b4850e53dcd5607de01595a8329..87db73bd12d64bd792d684d3903c9b713f642b48 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -71,6 +71,22 @@ static const char cite_gpu_package[] = " year = 2013,\n" " volume = 184,\n" " pages = {2785--2793}\n" + "}\n\n" + "@Article{Trung15,\n" + " author = {T. D. Nguyen, S. J. Plimpton},\n" + " title = {Accelerating dissipative particle dynamics simulations for soft matter systems},\n" + " journal = {Comput.~Mater.~Sci.},\n" + " year = 2015,\n" + " volume = 100,\n" + " pages = {173--180}\n" + "}\n\n" + "@Article{Trung17,\n" + " author = {T. D. Nguyen},\n" + " title = {GPU-accelerated Tersoff potentials for massively parallel Molecular Dynamics simulations},\n" + " journal = {Comp.~Phys.~Comm.},\n" + " year = 2017,\n" + " volume = 212,\n" + " pages = {113--122}\n" "}\n\n"; /* ---------------------------------------------------------------------- */