From 70a7b3761475f12833b10f08bc1c55f5b4ed662b Mon Sep 17 00:00:00 2001 From: Trung Nguyen <ndactrung@gmail.com> Date: Tue, 19 Jun 2018 15:50:02 -0500 Subject: [PATCH] Updated pair coul/long/cs and born/coul/long/cs; updated gpu neighbor builds to support core-shell styles where r2 can be tiny. --- lib/gpu/lal_born_coul_long_cs.cu | 83 ++++++++++++++++---------------- lib/gpu/lal_coul_long_cs.cu | 56 ++++++++++++--------- lib/gpu/lal_neighbor_gpu.cu | 36 +++++++------- 3 files changed, 92 insertions(+), 83 deletions(-) diff --git a/lib/gpu/lal_born_coul_long_cs.cu b/lib/gpu/lal_born_coul_long_cs.cu index d18f573c11..c5f98567d9 100644 --- a/lib/gpu/lal_born_coul_long_cs.cu +++ b/lib/gpu/lal_born_coul_long_cs.cu @@ -9,7 +9,7 @@ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) // __________________________________________________________________________ // -// begin : +// begin : June 2018 // email : trung.nguyen@northwestern.edu // ***************************************************************************/ @@ -29,15 +29,19 @@ texture<int2> q_tex; #define q_tex q_ #endif -#define B0 (numtyp)-0.1335096380159268 -#define B1 (numtyp)-2.57839507e-1 -#define B2 (numtyp)-1.37203639e-1 -#define B3 (numtyp)-8.88822059e-3 -#define B4 (numtyp)-5.80844129e-3 -#define B5 (numtyp)1.14652755e-1 -#define EPSILON (numtyp)1.0e-20 -#define EPS_EWALD (numtyp)1.0e-6 -#define EPS_EWALD_SQR (numtyp)1.0e-12 +// Note: EWALD_P is different from that in lal_preprocessor.h +// acctyp is needed for these parameters +#define CS_EWALD_P (acctyp)9.95473818e-1 +#define B0 (acctyp)-0.1335096380159268 +#define B1 (acctyp)-2.57839507e-1 +#define B2 (acctyp)-1.37203639e-1 +#define B3 (acctyp)-8.88822059e-3 +#define B4 (acctyp)-5.80844129e-3 +#define B5 (acctyp)1.14652755e-1 + +#define EPSILON (acctyp)(1.0e-20) +#define EPS_EWALD (acctyp)(1.0e-6) +#define EPS_EWALD_SQR (acctyp)(1.0e-12) __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict coeff1, @@ -104,40 +108,38 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, numtyp rsq = delx*delx+dely*dely+delz*delz; int mtype=itype*lj_types+jtype; - if (rsq<cutsq_sigma[mtype].x) { // cutsq - numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc; - numtyp rexp = (numtyp)0.0; + if (rsq<cutsq_sigma[mtype].x) { // cutsq + numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp; - if (rsq < cut_coulsq) { - rsq += EPSILON; + rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond; + numtyp r2inv = ucl_recip(rsq); - numtyp r2inv = ucl_recip(rsq); - numtyp r = ucl_rsqrt(r2inv); + if (rsq < cut_coulsq) { + numtyp r = ucl_sqrt(rsq); fetch(prefactor,j,q_tex); prefactor *= qqrd2e * qtmp; if (factor_coul<(numtyp)1.0) { numtyp grij = g_ewald * (r+EPS_EWALD); numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= (r+EPS_EWALD); forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul)); + // Additionally r2inv needs to be accordingly modified since the later + // scaling of the overall force shall be consistent r2inv = ucl_recip(rsq + EPS_EWALD_SQR); - forcecoul *= r2inv; } else { numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= r; forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2); - forcecoul *= r2inv; } } else forcecoul = (numtyp)0.0; - numtyp r2inv = ucl_recip(rsq); if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); @@ -146,7 +148,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; - force = forceborn*r2inv + forcecoul; + force = (forcecoul + forceborn) * r2inv; f.x+=delx*force; f.y+=dely*force; @@ -154,9 +156,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, if (eflag>0) { if (rsq < cut_coulsq) { - e_coul += prefactor*_erfc; - if (factor_coul<(numtyp)1.0) - e_coul -= ((numtyp)1.0-factor_coul)*prefactor; + numtyp e = prefactor*_erfc; + if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; } if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv @@ -248,39 +250,38 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq<cutsq_sigma[mtype].x) { // cutsq - numtyp forcecoul, forceborn, force, r6inv, prefactor, _erfc; - numtyp rexp = (numtyp)0.0; + numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp; - if (rsq < cut_coulsq) { - rsq += EPSILON; + rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond; + numtyp r2inv = ucl_recip(rsq); - numtyp r2inv = ucl_recip(rsq); - numtyp r = ucl_rsqrt(r2inv); + if (rsq < cut_coulsq) { + numtyp r = ucl_sqrt(rsq); fetch(prefactor,j,q_tex); prefactor *= qqrd2e * qtmp; if (factor_coul<(numtyp)1.0) { numtyp grij = g_ewald * (r+EPS_EWALD); numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= (r+EPS_EWALD); forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul)); + // Additionally r2inv needs to be accordingly modified since the later + // scaling of the overall force shall be consistent r2inv = ucl_recip(rsq + EPS_EWALD_SQR); - forcecoul *= r2inv; } else { + numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= r; forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2); - forcecoul *= r2inv; } } else forcecoul = (numtyp)0.0; - numtyp r2inv = ucl_recip(rsq); if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq numtyp r = ucl_sqrt(rsq); rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x); @@ -289,7 +290,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, + coeff1[mtype].w*r2inv*r6inv)*factor_lj; } else forceborn = (numtyp)0.0; - force = forceborn*r2inv + forcecoul; + force = (forcecoul + forceborn) * r2inv; f.x+=delx*force; f.y+=dely*force; @@ -297,9 +298,9 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, if (eflag>0) { if (rsq < cut_coulsq) { - e_coul += prefactor*_erfc; - if (factor_coul<(numtyp)1.0) - e_coul -= ((numtyp)1.0-factor_coul)*prefactor; + numtyp e = prefactor*_erfc; + if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; } if (rsq < cutsq_sigma[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu index f6d3ef8d44..1ff9445f4c 100644 --- a/lib/gpu/lal_coul_long_cs.cu +++ b/lib/gpu/lal_coul_long_cs.cu @@ -29,6 +29,20 @@ texture<int2> q_tex; #define q_tex q_ #endif +// Note: EWALD_P is different from that in lal_preprocessor.h +// acctyp is needed for these parameters +#define CS_EWALD_P (acctyp)9.95473818e-1 +#define B0 (acctyp)-0.1335096380159268 +#define B1 (acctyp)-2.57839507e-1 +#define B2 (acctyp)-1.37203639e-1 +#define B3 (acctyp)-8.88822059e-3 +#define B4 (acctyp)-5.80844129e-3 +#define B5 (acctyp)1.14652755e-1 + +#define EPSILON (acctyp)(1.0e-20) +#define EPS_EWALD (acctyp)(1.0e-6) +#define EPS_EWALD_SQR (acctyp)(1.0e-12) + #if (ARCH < 300) #define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ @@ -123,16 +137,6 @@ texture<int2> q_tex; #endif -#define B0 (numtyp)-0.1335096380159268 -#define B1 (numtyp)-2.57839507e-1 -#define B2 (numtyp)-1.37203639e-1 -#define B3 (numtyp)-8.88822059e-3 -#define B4 (numtyp)-5.80844129e-3 -#define B5 (numtyp)1.14652755e-1 -#define EPSILON (numtyp)1.0e-20 -#define EPS_EWALD (numtyp)1.0e-6 -#define EPS_EWALD_SQR (numtyp)1.0e-12 - __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -191,7 +195,7 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, int mtype=itype*lj_types+jtype; if (rsq < cut_coulsq) { - rsq += EPSILON; + rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond; numtyp force,prefactor,_erfc; numtyp r2inv = ucl_recip(rsq); @@ -201,17 +205,19 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, if (factor_coul<(numtyp)1.0) { numtyp grij = g_ewald * (r+EPS_EWALD); numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= (r+EPS_EWALD); force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul)); + // Additionally r2inv needs to be accordingly modified since the later + // scaling of the overall force shall be consistent r2inv = ucl_recip(rsq + EPS_EWALD_SQR); force *= r2inv; } else { numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= r; @@ -224,9 +230,9 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*_erfc; - if (factor_coul<(numtyp)1.0) - e_coul -= ((numtyp)1.0-factor_coul)*prefactor; + numtyp e = prefactor*_erfc; + if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; } if (vflag>0) { virial[0] += delx*delx*force; @@ -304,7 +310,7 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, numtyp rsq = delx*delx+dely*dely+delz*delz; if (rsq < cut_coulsq) { - rsq += EPSILON; + rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond; numtyp force,prefactor,_erfc; numtyp r2inv = ucl_recip(rsq); @@ -314,32 +320,34 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, if (factor_coul<(numtyp)1.0) { numtyp grij = g_ewald * (r+EPS_EWALD); numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= (r+EPS_EWALD); force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul)); + // Additionally r2inv needs to be accordingly modified since the later + // scaling of the overall force shall be consistent r2inv = ucl_recip(rsq + EPS_EWALD_SQR); - force *= r2inv; } else { numtyp grij = g_ewald * r; numtyp expm2 = ucl_exp(-grij*grij); - numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij); + numtyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij); numtyp u = (numtyp)1.0 - t; _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2; prefactor /= r; force = prefactor * (_erfc + EWALD_F*grij*expm2); - force *= r2inv; } + force *= r2inv; + f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; if (eflag>0) { - e_coul += prefactor*_erfc; - if (factor_coul<(numtyp)1.0) - e_coul -= ((numtyp)1.0-factor_coul)*prefactor; + numtyp e = prefactor*_erfc; + if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; + e_coul += e; } if (vflag>0) { virial[0] += delx*delx*force; diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu index b0b3cfb90c..83692a24e4 100644 --- a/lib/gpu/lal_neighbor_gpu.cu +++ b/lib/gpu/lal_neighbor_gpu.cu @@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out, const __global tagint *restrict in, int columns_in, int rows_in) { - __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; + __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; - unsigned ti=THREAD_ID_X; - unsigned tj=THREAD_ID_Y; - unsigned bi=BLOCK_ID_X; - unsigned bj=BLOCK_ID_Y; + unsigned ti=THREAD_ID_X; + unsigned tj=THREAD_ID_Y; + unsigned bi=BLOCK_ID_X; + unsigned bj=BLOCK_ID_Y; - unsigned i=bi*BLOCK_CELL_2D+ti; - unsigned j=bj*BLOCK_CELL_2D+tj; - if ((i<columns_in) && (j<rows_in)) - block[tj][ti]=in[j*columns_in+i]; + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; + if ((i<columns_in) && (j<rows_in)) + block[tj][ti]=in[j*columns_in+i]; - __syncthreads(); + __syncthreads(); - i=bj*BLOCK_CELL_2D+ti; - j=bi*BLOCK_CELL_2D+tj; - if ((i<rows_in) && (j<columns_in)) - out[j*rows_in+i] = block[ti][tj]; + i=bj*BLOCK_CELL_2D+ti; + j=bi*BLOCK_CELL_2D+tj; + if ((i<rows_in) && (j<columns_in)) + out[j*rows_in+i] = block[ti][tj]; } __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, @@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, nbor_list[pid_i]=pid_i; } else { stride=0; - neigh_counts=host_numj+pid_i-inum; + neigh_counts=host_numj+pid_i-inum; neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size; } @@ -232,7 +232,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, diff.z = atom_i.z - pos_sh[j].z; r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; - if (r2 < cell_size*cell_size && r2 > 1e-5) { + if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5 cnt++; if (cnt <= neigh_bin_size) { *neigh_list = pid_j; @@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_, } } } - __syncthreads(); - } // for (k) + __syncthreads(); + } // for (k) } } } -- GitLab