From 83f139642eb638e73dd74a89b58cbcbc38d72ca5 Mon Sep 17 00:00:00 2001 From: stamoor <stamoor@f3b2605a-c512-4ea7-a41b-209d697bcdaa> Date: Tue, 6 Sep 2016 22:09:41 +0000 Subject: [PATCH] Reverting optimizations that hurt performance on some compilers git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15551 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/KOKKOS/pair_eam_kokkos.cpp | 137 ++++++++++++++------------------- src/KOKKOS/pair_eam_kokkos.h | 16 ++-- 2 files changed, 65 insertions(+), 88 deletions(-) diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp index 37a26145e0..d91da280ac 100755 --- a/src/KOKKOS/pair_eam_kokkos.cpp +++ b/src/KOKKOS/pair_eam_kokkos.cpp @@ -340,81 +340,64 @@ void PairEAMKokkos<DeviceType>::array2spline() rdr = 1.0/dr; rdrho = 1.0/drho; - tdual_ffloat4 k_frho_spline_a = tdual_ffloat4("pair:frho_a",nfrho,nrho+1); - tdual_ffloat4 k_rhor_spline_a = tdual_ffloat4("pair:rhor_a",nrhor,nr+1); - tdual_ffloat4 k_z2r_spline_a = tdual_ffloat4("pair:z2r_a",nz2r,nr+1); - tdual_ffloat4 k_frho_spline_b = tdual_ffloat4("pair:frho_b",nfrho,nrho+1); - tdual_ffloat4 k_rhor_spline_b = tdual_ffloat4("pair:rhor_b",nrhor,nr+1); - tdual_ffloat4 k_z2r_spline_b = tdual_ffloat4("pair:z2r_b",nz2r,nr+1); - - t_host_ffloat4 h_frho_spline_a = k_frho_spline_a.h_view; - t_host_ffloat4 h_rhor_spline_a = k_rhor_spline_a.h_view; - t_host_ffloat4 h_z2r_spline_a = k_z2r_spline_a.h_view; - t_host_ffloat4 h_frho_spline_b = k_frho_spline_b.h_view; - t_host_ffloat4 h_rhor_spline_b = k_rhor_spline_b.h_view; - t_host_ffloat4 h_z2r_spline_b = k_z2r_spline_b.h_view; + tdual_ffloat_2d_n7 k_frho_spline = tdual_ffloat_2d_n7("pair:frho",nfrho,nrho+1); + tdual_ffloat_2d_n7 k_rhor_spline = tdual_ffloat_2d_n7("pair:rhor",nrhor,nr+1); + tdual_ffloat_2d_n7 k_z2r_spline = tdual_ffloat_2d_n7("pair:z2r",nz2r,nr+1); + + t_host_ffloat_2d_n7 h_frho_spline = k_frho_spline.h_view; + t_host_ffloat_2d_n7 h_rhor_spline = k_rhor_spline.h_view; + t_host_ffloat_2d_n7 h_z2r_spline = k_z2r_spline.h_view; for (int i = 0; i < nfrho; i++) - interpolate(nrho,drho,frho[i],h_frho_spline_a,h_frho_spline_b,i); - k_frho_spline_a.template modify<LMPHostType>(); - k_frho_spline_a.template sync<DeviceType>(); - k_frho_spline_b.template modify<LMPHostType>(); - k_frho_spline_b.template sync<DeviceType>(); + interpolate(nrho,drho,frho[i],h_frho_spline,i); + k_frho_spline.template modify<LMPHostType>(); + k_frho_spline.template sync<DeviceType>(); for (int i = 0; i < nrhor; i++) - interpolate(nr,dr,rhor[i],h_rhor_spline_a,h_rhor_spline_b,i); - k_rhor_spline_a.template modify<LMPHostType>(); - k_rhor_spline_a.template sync<DeviceType>(); - k_rhor_spline_b.template modify<LMPHostType>(); - k_rhor_spline_b.template sync<DeviceType>(); + interpolate(nr,dr,rhor[i],h_rhor_spline,i); + k_rhor_spline.template modify<LMPHostType>(); + k_rhor_spline.template sync<DeviceType>(); for (int i = 0; i < nz2r; i++) - interpolate(nr,dr,z2r[i],h_z2r_spline_a,h_z2r_spline_b,i); - k_z2r_spline_a.template modify<LMPHostType>(); - k_z2r_spline_a.template sync<DeviceType>(); - k_z2r_spline_b.template modify<LMPHostType>(); - k_z2r_spline_b.template sync<DeviceType>(); - - d_frho_spline_a = k_frho_spline_a.d_view; - d_rhor_spline_a = k_rhor_spline_a.d_view; - d_z2r_spline_a = k_z2r_spline_a.d_view; - d_frho_spline_b = k_frho_spline_b.d_view; - d_rhor_spline_b = k_rhor_spline_b.d_view; - d_z2r_spline_b = k_z2r_spline_b.d_view; - + interpolate(nr,dr,z2r[i],h_z2r_spline,i); + k_z2r_spline.template modify<LMPHostType>(); + k_z2r_spline.template sync<DeviceType>(); + d_frho_spline = k_frho_spline.d_view; + d_rhor_spline = k_rhor_spline.d_view; + d_z2r_spline = k_z2r_spline.d_view; } /* ---------------------------------------------------------------------- */ template<class DeviceType> -void PairEAMKokkos<DeviceType>::interpolate(int n, double delta, double *f, t_host_ffloat4 h_spline_a, t_host_ffloat4 h_spline_b, int i) +void PairEAMKokkos<DeviceType>::interpolate(int n, double delta, double *f, t_host_ffloat_2d_n7 h_spline, int i) { - for (int m = 1; m <= n; m++) h_spline_b(i,m).w = f[m]; + for (int m = 1; m <= n; m++) h_spline(i,m,6) = f[m]; - h_spline_b(i,1).z = h_spline_b(i,2).w - h_spline_b(i,1).w; - h_spline_b(i,2).z = 0.5 * (h_spline_b(i,3).w-h_spline_b(i,1).w); - h_spline_b(i,n-1).z = 0.5 * (h_spline_b(i,n).w-h_spline_b(i,n-2).w); - h_spline_b(i,n).z = h_spline_b(i,n).w - h_spline_b(i,n-1).w; + h_spline(i,1,5) = h_spline(i,2,6) - h_spline(i,1,6); + h_spline(i,2,5) = 0.5 * (h_spline(i,3,6)-h_spline(i,1,6)); + h_spline(i,n-1,5) = 0.5 * (h_spline(i,n,6)-h_spline(i,n-2,6)); + h_spline(i,n,5) = h_spline(i,n,6) - h_spline(i,n-1,6); for (int m = 3; m <= n-2; m++) - h_spline_b(i,m).z = ((h_spline_b(i,m-2).w-h_spline_b(i,m+2).w) + - 8.0*(h_spline_b(i,m+1).w-h_spline_b(i,m-1).w)) / 12.0; + h_spline(i,m,5) = ((h_spline(i,m-2,6)-h_spline(i,m+2,6)) + + 8.0*(h_spline(i,m+1,6)-h_spline(i,m-1,6))) / 12.0; for (int m = 1; m <= n-1; m++) { - h_spline_b(i,m).y = 3.0*(h_spline_b(i,m+1).w-h_spline_b(i,m).w) - - 2.0*h_spline_b(i,m).z - h_spline_b(i,m+1).z; - h_spline_b(i,m).x = h_spline_b(i,m).z + h_spline_b(i,m+1).z - - 2.0*(h_spline_b(i,m+1).w-h_spline_b(i,m).w); + h_spline(i,m,4) = 3.0*(h_spline(i,m+1,6)-h_spline(i,m,6)) - + 2.0*h_spline(i,m,5) - h_spline(i,m+1,5); + h_spline(i,m,3) = h_spline(i,m,5) + h_spline(i,m+1,5) - + 2.0*(h_spline(i,m+1,6)-h_spline(i,m,6)); } - h_spline_b(i,n).y = 0.0; - h_spline_b(i,n).x = 0.0; + h_spline(i,n,4) = 0.0; + h_spline(i,n,3) = 0.0; for (int m = 1; m <= n; m++) { - h_spline_a(i,m).z = h_spline_b(i,m).z/delta; - h_spline_a(i,m).y = 2.0*h_spline_b(i,m).y/delta; - h_spline_a(i,m).x = 3.0*h_spline_b(i,m).x/delta; + h_spline(i,m,2) = h_spline(i,m,5)/delta; + h_spline(i,m,1) = 2.0*h_spline(i,m,4)/delta; + h_spline(i,m,0) = 3.0*h_spline(i,m,3)/delta; } } @@ -558,13 +541,12 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelA<NEIGHFLAG,NEWTON_PA p -= m; p = MIN(p,1.0); const int d_type2rhor_ji = d_type2rhor(jtype,itype); - const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ji,m); - rhotmp += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w; - + rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p + + d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6); if (NEWTON_PAIR || j < nlocal) { const int d_type2rhor_ij = d_type2rhor(itype,jtype); - const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ij,m); - rho[j] += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w; + rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p + + d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6); } } @@ -593,11 +575,10 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelB<EFLAG>, const int & p -= m; p = MIN(p,1.0); const int d_type2frho_i = d_type2frho[itype]; - const F_FLOAT4 frho = d_frho_spline_a(d_type2frho_i,m); - d_fp[i] = (frho.x*p + frho.y)*p + frho.z; + d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2); if (EFLAG) { - const F_FLOAT4 frho_b = d_frho_spline_b(d_type2frho_i,m); - F_FLOAT phi = ((frho_b.x*p + frho_b.y)*p + frho_b.z)*p + frho_b.w; + F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p + + d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6); if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax); if (eflag_global) ev.evdwl += phi; if (eflag_atom) d_eatom[i] += phi; @@ -651,8 +632,8 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int p -= m; p = MIN(p,1.0); const int d_type2rhor_ji = d_type2rhor(jtype,itype); - const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ji,m); - rhotmp += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w; + rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p + + d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6); } } @@ -669,11 +650,10 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int p -= m; p = MIN(p,1.0); const int d_type2frho_i = d_type2frho[itype]; - const F_FLOAT4 frho = d_frho_spline_a(d_type2frho_i,m); - d_fp[i] = (frho.x*p + frho.y)*p + frho.z; + d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2); if (EFLAG) { - const F_FLOAT4 frho_b = d_frho_spline_b(d_type2frho_i,m); - F_FLOAT phi = ((frho_b.x*p + frho_b.y)*p + frho_b.z)*p + frho_b.w; + F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p + + d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6); if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax); if (eflag_global) ev.evdwl += phi; if (eflag_atom) d_eatom[i] += phi; @@ -742,18 +722,16 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PA // hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip const int d_type2rhor_ij = d_type2rhor(itype,jtype); - const F_FLOAT4 rhor_ij = d_rhor_spline_a(d_type2rhor_ij,m); - const F_FLOAT rhoip = (rhor_ij.x*p + rhor_ij.y)*p + rhor_ij.z; - + const F_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p + + d_rhor_spline(d_type2rhor_ij,m,2); const int d_type2rhor_ji = d_type2rhor(jtype,itype); - const F_FLOAT4 rhor_ji = d_rhor_spline_a(d_type2rhor_ji,m); - const F_FLOAT rhojp = (rhor_ji.x*p + rhor_ji.y)*p + rhor_ji.z; - + const F_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p + + d_rhor_spline(d_type2rhor_ji,m,2); const int d_type2z2r_ij = d_type2z2r(itype,jtype); - const F_FLOAT4 z2r_a = d_z2r_spline_a(d_type2z2r_ij,m); - const F_FLOAT z2p = (z2r_a.x*p + z2r_a.y)*p + z2r_a.z; - const F_FLOAT4 z2r_b = d_z2r_spline_b(d_type2z2r_ij,m); - const F_FLOAT z2 = ((z2r_b.x*p + z2r_b.y)*p + z2r_b.z)*p + z2r_b.w; + const F_FLOAT z2p = (d_z2r_spline(d_type2z2r_ij,m,0)*p + d_z2r_spline(d_type2z2r_ij,m,1))*p + + d_z2r_spline(d_type2z2r_ij,m,2); + const F_FLOAT z2 = ((d_z2r_spline(d_type2z2r_ij,m,3)*p + d_z2r_spline(d_type2z2r_ij,m,4))*p + + d_z2r_spline(d_type2z2r_ij,m,5))*p + d_z2r_spline(d_type2z2r_ij,m,6); const F_FLOAT recip = 1.0/r; const F_FLOAT phi = z2*recip; @@ -894,5 +872,4 @@ template class PairEAMKokkos<LMPDeviceType>; #ifdef KOKKOS_HAVE_CUDA template class PairEAMKokkos<LMPHostType>; #endif -} - +} \ No newline at end of file diff --git a/src/KOKKOS/pair_eam_kokkos.h b/src/KOKKOS/pair_eam_kokkos.h index b3d33bb37f..89d1d61b6b 100755 --- a/src/KOKKOS/pair_eam_kokkos.h +++ b/src/KOKKOS/pair_eam_kokkos.h @@ -136,14 +136,14 @@ class PairEAMKokkos : public PairEAM { DAT::t_int_2d_randomread d_type2rhor; DAT::t_int_2d_randomread d_type2z2r; - typedef Kokkos::DualView<F_FLOAT4**,Kokkos::LayoutLeft,DeviceType> tdual_ffloat4; - typedef typename tdual_ffloat4::t_dev_const_randomread t_ffloat4_randomread; - typedef typename tdual_ffloat4::t_host t_host_ffloat4; - - t_ffloat4_randomread d_frho_spline_a, d_frho_spline_b; - t_ffloat4_randomread d_rhor_spline_a, d_rhor_spline_b; - t_ffloat4_randomread d_z2r_spline_a, d_z2r_spline_b; - void interpolate(int, double, double *, t_host_ffloat4, t_host_ffloat4, int); + typedef Kokkos::DualView<F_FLOAT**[7],Kokkos::LayoutRight,DeviceType> tdual_ffloat_2d_n7; + typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread; + typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7; + + t_ffloat_2d_n7_randomread d_frho_spline; + t_ffloat_2d_n7_randomread d_rhor_spline; + t_ffloat_2d_n7_randomread d_z2r_spline; + void interpolate(int, double, double *, t_host_ffloat_2d_n7, int); virtual void file2array(); void array2spline(); -- GitLab