From e4e52616dd89c94972ef8359c5be0e8551a22a5e Mon Sep 17 00:00:00 2001
From: stamoor <stamoor@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Wed, 19 Aug 2015 22:50:56 +0000
Subject: [PATCH] Adding optimizations to Kokkos EAM pair style

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@13923 f3b2605a-c512-4ea7-a41b-209d697bcdaa
---
 src/KOKKOS/pair_eam_kokkos.cpp | 187 +++++++++++++++++++++++----------
 src/KOKKOS/pair_eam_kokkos.h   |  14 +--
 2 files changed, 138 insertions(+), 63 deletions(-)

diff --git a/src/KOKKOS/pair_eam_kokkos.cpp b/src/KOKKOS/pair_eam_kokkos.cpp
index bcea86b1cf..14cd2ab8c2 100755
--- a/src/KOKKOS/pair_eam_kokkos.cpp
+++ b/src/KOKKOS/pair_eam_kokkos.cpp
@@ -342,67 +342,138 @@ void PairEAMKokkos<DeviceType>::file2array()
 template<class DeviceType>
 void PairEAMKokkos<DeviceType>::array2spline()
 {
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
   rdr = 1.0/dr;
   rdrho = 1.0/drho;
 
-  tdual_ffloat_2d_n7 k_frho_spline = tdual_ffloat_2d_n7("pair:frho",nfrho,nrho+1);
-  tdual_ffloat_2d_n7 k_rhor_spline = tdual_ffloat_2d_n7("pair:rhor",nrhor,nr+1);
-  tdual_ffloat_2d_n7 k_z2r_spline = tdual_ffloat_2d_n7("pair:z2r",nz2r,nr+1);
+  tdual_ffloat4 k_frho_spline_a = tdual_ffloat4("pair:frho_a",nfrho,nrho+1);
+  tdual_ffloat4 k_rhor_spline_a = tdual_ffloat4("pair:rhor_a",nrhor,nr+1);
+  tdual_ffloat4 k_z2r_spline_a = tdual_ffloat4("pair:z2r_a",nz2r,nr+1);
+  tdual_ffloat4 k_frho_spline_b = tdual_ffloat4("pair:frho_b",nfrho,nrho+1);
+  tdual_ffloat4 k_rhor_spline_b = tdual_ffloat4("pair:rhor_b",nrhor,nr+1);
+  tdual_ffloat4 k_z2r_spline_b = tdual_ffloat4("pair:z2r_b",nz2r,nr+1);
 
-  t_host_ffloat_2d_n7 h_frho_spline = k_frho_spline.h_view;
-  t_host_ffloat_2d_n7 h_rhor_spline = k_rhor_spline.h_view;
-  t_host_ffloat_2d_n7 h_z2r_spline = k_z2r_spline.h_view;
+  t_host_ffloat4 h_frho_spline_a = k_frho_spline_a.h_view;
+  t_host_ffloat4 h_rhor_spline_a = k_rhor_spline_a.h_view;
+  t_host_ffloat4 h_z2r_spline_a =  k_z2r_spline_a.h_view;
+  t_host_ffloat4 h_frho_spline_b = k_frho_spline_b.h_view;
+  t_host_ffloat4 h_rhor_spline_b = k_rhor_spline_b.h_view;
+  t_host_ffloat4 h_z2r_spline_b =  k_z2r_spline_b.h_view;
 
   for (int i = 0; i < nfrho; i++)
-    interpolate(nrho,drho,frho[i],h_frho_spline,i);
-  k_frho_spline.template modify<LMPHostType>();
-  k_frho_spline.template sync<DeviceType>();
+    interpolate(nrho,drho,frho[i],h_frho_spline_a,h_frho_spline_b,i);
+  k_frho_spline_a.template modify<LMPHostType>();
+  k_frho_spline_a.template sync<DeviceType>();
+  k_frho_spline_b.template modify<LMPHostType>();
+  k_frho_spline_b.template sync<DeviceType>();
 
   for (int i = 0; i < nrhor; i++)
-    interpolate(nr,dr,rhor[i],h_rhor_spline,i);
-  k_rhor_spline.template modify<LMPHostType>();
-  k_rhor_spline.template sync<DeviceType>();
+    interpolate(nr,dr,rhor[i],h_rhor_spline_a,h_rhor_spline_b,i);
+  k_rhor_spline_a.template modify<LMPHostType>();
+  k_rhor_spline_a.template sync<DeviceType>();
+  k_rhor_spline_b.template modify<LMPHostType>();
+  k_rhor_spline_b.template sync<DeviceType>();
 
   for (int i = 0; i < nz2r; i++)
-    interpolate(nr,dr,z2r[i],h_z2r_spline,i);
-  k_z2r_spline.template modify<LMPHostType>();
-  k_z2r_spline.template sync<DeviceType>();
+    interpolate(nr,dr,z2r[i],h_z2r_spline_a,h_z2r_spline_b,i);
+  k_z2r_spline_a.template modify<LMPHostType>();
+  k_z2r_spline_a.template sync<DeviceType>();
+  k_z2r_spline_b.template modify<LMPHostType>();
+  k_z2r_spline_b.template sync<DeviceType>();
+
+  d_frho_spline_a = k_frho_spline_a.d_view;
+  d_rhor_spline_a = k_rhor_spline_a.d_view;
+  d_z2r_spline_a = k_z2r_spline_a.d_view;
+  d_frho_spline_b = k_frho_spline_b.d_view;
+  d_rhor_spline_b = k_rhor_spline_b.d_view;
+  d_z2r_spline_b = k_z2r_spline_b.d_view;
+
 
-  d_frho_spline = k_frho_spline.d_view;
-  d_rhor_spline = k_rhor_spline.d_view;
-  d_z2r_spline = k_z2r_spline.d_view;
 }
 
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-void PairEAMKokkos<DeviceType>::interpolate(int n, double delta, double *f, t_host_ffloat_2d_n7 h_spline, int i)
+void PairEAMKokkos<DeviceType>::interpolate(int n, double delta, double *f, t_host_ffloat4 h_spline_a, t_host_ffloat4 h_spline_b, int i)
 {
-  for (int m = 1; m <= n; m++) h_spline(i,m,6) = f[m];
+  for (int m = 1; m <= n; m++) h_spline_b(i,m).w = f[m];
 
-  h_spline(i,1,5) = h_spline(i,2,6) - h_spline(i,1,6);
-  h_spline(i,2,5) = 0.5 * (h_spline(i,3,6)-h_spline(i,1,6));
-  h_spline(i,n-1,5) = 0.5 * (h_spline(i,n,6)-h_spline(i,n-2,6));
-  h_spline(i,n,5) = h_spline(i,n,6) - h_spline(i,n-1,6);
+  h_spline_b(i,1).z = h_spline_b(i,2).w - h_spline_b(i,1).w;
+  h_spline_b(i,2).z = 0.5 * (h_spline_b(i,3).w-h_spline_b(i,1).w);
+  h_spline_b(i,n-1).z = 0.5 * (h_spline_b(i,n).w-h_spline_b(i,n-2).w);
+  h_spline_b(i,n).z = h_spline_b(i,n).w - h_spline_b(i,n-1).w;
 
   for (int m = 3; m <= n-2; m++)
-    h_spline(i,m,5) = ((h_spline(i,m-2,6)-h_spline(i,m+2,6)) +
-                    8.0*(h_spline(i,m+1,6)-h_spline(i,m-1,6))) / 12.0;
+    h_spline_b(i,m).z = ((h_spline_b(i,m-2).w-h_spline_b(i,m+2).w) +
+                    8.0*(h_spline_b(i,m+1).w-h_spline_b(i,m-1).w)) / 12.0;
 
   for (int m = 1; m <= n-1; m++) {
-    h_spline(i,m,4) = 3.0*(h_spline(i,m+1,6)-h_spline(i,m,6)) -
-      2.0*h_spline(i,m,5) - h_spline(i,m+1,5);
-    h_spline(i,m,3) = h_spline(i,m,5) + h_spline(i,m+1,5) -
-      2.0*(h_spline(i,m+1,6)-h_spline(i,m,6));
+    h_spline_b(i,m).y = 3.0*(h_spline_b(i,m+1).w-h_spline_b(i,m).w) -
+      2.0*h_spline_b(i,m).z - h_spline_b(i,m+1).z;
+    h_spline_b(i,m).x = h_spline_b(i,m).z + h_spline_b(i,m+1).z -
+      2.0*(h_spline_b(i,m+1).w-h_spline_b(i,m).w);
   }
 
-  h_spline(i,n,4) = 0.0;
-  h_spline(i,n,3) = 0.0;
+  h_spline_b(i,n).y = 0.0;
+  h_spline_b(i,n).x = 0.0;
 
   for (int m = 1; m <= n; m++) {
-    h_spline(i,m,2) = h_spline(i,m,5)/delta;
-    h_spline(i,m,1) = 2.0*h_spline(i,m,4)/delta;
-    h_spline(i,m,0) = 3.0*h_spline(i,m,3)/delta;
+    h_spline_a(i,m).z = h_spline_b(i,m).z/delta;
+    h_spline_a(i,m).y = 2.0*h_spline_b(i,m).y/delta;
+    h_spline_a(i,m).x = 3.0*h_spline_b(i,m).x/delta;
   }
 }
 
@@ -546,12 +617,13 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelA<NEIGHFLAG,NEWTON_PA
       p -= m;
       p = MIN(p,1.0);
       const int d_type2rhor_ji = d_type2rhor(jtype,itype);
-      rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p + 
-                  d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
+      const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ji,m);
+      rhotmp += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w;
+
       if (NEWTON_PAIR || j < nlocal) {
         const int d_type2rhor_ij = d_type2rhor(itype,jtype);
-        rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p + 
-                    d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
+        const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ij,m);
+        rho[j] += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w;
       }
     }
   
@@ -581,15 +653,15 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelB<EFLAG>, const int &
   p -= m;
   p = MIN(p,1.0);
   const int d_type2frho_i = d_type2frho[itype];
-  d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
+  const F_FLOAT4 frho = d_frho_spline_a(d_type2frho_i,m);
+  d_fp[i] = (frho.x*p + frho.y)*p + frho.z;
   if (EFLAG) {
-    F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p + 
-                    d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
+    const F_FLOAT4 frho_b = d_frho_spline_b(d_type2frho_i,m);
+    F_FLOAT phi = ((frho_b.x*p + frho_b.y)*p + frho_b.z)*p + frho_b.w;
     if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax);
     if (eflag_global) ev.evdwl += phi;
     if (eflag_atom) d_eatom[i] += phi;
   }
-
 }
 
 template<class DeviceType>
@@ -639,8 +711,8 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int
       p -= m;
       p = MIN(p,1.0);
       const int d_type2rhor_ji = d_type2rhor(jtype,itype);
-      rhotmp += ((d_rhor_spline(d_type2rhor_ji,m,3)*p + d_rhor_spline(d_type2rhor_ji,m,4))*p + 
-                  d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
+      const F_FLOAT4 rhor = d_rhor_spline_b(d_type2rhor_ji,m);
+      rhotmp += ((rhor.x*p + rhor.y)*p + rhor.z)*p + rhor.w;
     }
   
   }
@@ -657,10 +729,11 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelAB<EFLAG>, const int
   p -= m;
   p = MIN(p,1.0);
   const int d_type2frho_i = d_type2frho[itype];
-  d_fp[i] = (d_frho_spline(d_type2frho_i,m,0)*p + d_frho_spline(d_type2frho_i,m,1))*p + d_frho_spline(d_type2frho_i,m,2);
+  const F_FLOAT4 frho = d_frho_spline_a(d_type2frho_i,m);
+  d_fp[i] = (frho.x*p + frho.y)*p + frho.z;
   if (EFLAG) {
-    F_FLOAT phi = ((d_frho_spline(d_type2frho_i,m,3)*p + d_frho_spline(d_type2frho_i,m,4))*p + 
-                    d_frho_spline(d_type2frho_i,m,5))*p + d_frho_spline(d_type2frho_i,m,6);
+    const F_FLOAT4 frho_b = d_frho_spline_b(d_type2frho_i,m);
+    F_FLOAT phi = ((frho_b.x*p + frho_b.y)*p + frho_b.z)*p + frho_b.w;
     if (d_rho[i] > rhomax) phi += d_fp[i] * (d_rho[i]-rhomax);
     if (eflag_global) ev.evdwl += phi;
     if (eflag_atom) d_eatom[i] += phi;
@@ -729,16 +802,18 @@ void PairEAMKokkos<DeviceType>::operator()(TagPairEAMKernelC<NEIGHFLAG,NEWTON_PA
       //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
 
       const int d_type2rhor_ij = d_type2rhor(itype,jtype);
-      const F_FLOAT rhoip = (d_rhor_spline(d_type2rhor_ij,m,0)*p + d_rhor_spline(d_type2rhor_ij,m,1))*p +
-                             d_rhor_spline(d_type2rhor_ij,m,2);
+      const F_FLOAT4 rhor_ij = d_rhor_spline_a(d_type2rhor_ij,m);
+      const F_FLOAT rhoip = (rhor_ij.x*p + rhor_ij.y)*p + rhor_ij.z;
+
       const int d_type2rhor_ji = d_type2rhor(jtype,itype);
-      const F_FLOAT rhojp = (d_rhor_spline(d_type2rhor_ji,m,0)*p + d_rhor_spline(d_type2rhor_ji,m,1))*p + 
-                             d_rhor_spline(d_type2rhor_ji,m,2);
+      const F_FLOAT4 rhor_ji = d_rhor_spline_a(d_type2rhor_ji,m);
+      const F_FLOAT rhojp = (rhor_ji.x*p + rhor_ji.y)*p + rhor_ji.z;
+
       const int d_type2z2r_ij = d_type2z2r(itype,jtype);
-      const F_FLOAT z2p = (d_z2r_spline(d_type2z2r_ij,m,0)*p + d_z2r_spline(d_type2z2r_ij,m,1))*p + 
-                           d_z2r_spline(d_type2z2r_ij,m,2);
-      const F_FLOAT z2 = ((d_z2r_spline(d_type2z2r_ij,m,3)*p + d_z2r_spline(d_type2z2r_ij,m,4))*p + 
-                           d_z2r_spline(d_type2z2r_ij,m,5))*p + d_z2r_spline(d_type2z2r_ij,m,6);
+      const F_FLOAT4 z2r_a = d_z2r_spline_a(d_type2z2r_ij,m);
+      const F_FLOAT z2p = (z2r_a.x*p + z2r_a.y)*p + z2r_a.z;
+      const F_FLOAT4 z2r_b = d_z2r_spline_b(d_type2z2r_ij,m);
+      const F_FLOAT z2 = ((z2r_b.x*p + z2r_b.y)*p + z2r_b.z)*p + z2r_b.w;
 
       const F_FLOAT recip = 1.0/r;
       const F_FLOAT phi = z2*recip;
diff --git a/src/KOKKOS/pair_eam_kokkos.h b/src/KOKKOS/pair_eam_kokkos.h
index 98d0fc35c7..4bd43cd38a 100755
--- a/src/KOKKOS/pair_eam_kokkos.h
+++ b/src/KOKKOS/pair_eam_kokkos.h
@@ -136,17 +136,17 @@ class PairEAMKokkos : public PairEAM {
   DAT::t_int_2d_randomread d_type2rhor;
   DAT::t_int_2d_randomread d_type2z2r;
 
-  typedef Kokkos::DualView<F_FLOAT**[7],Kokkos::LayoutRight,DeviceType> tdual_ffloat_2d_n7;
-  typedef typename tdual_ffloat_2d_n7::t_dev_const_randomread t_ffloat_2d_n7_randomread;
-  typedef typename tdual_ffloat_2d_n7::t_host t_host_ffloat_2d_n7;
+  typedef Kokkos::DualView<F_FLOAT4**,Kokkos::LayoutLeft,DeviceType> tdual_ffloat4;
+  typedef typename tdual_ffloat4::t_dev_const_randomread t_ffloat4_randomread;
+  typedef typename tdual_ffloat4::t_host t_host_ffloat4;
 
-  t_ffloat_2d_n7_randomread d_frho_spline;
-  t_ffloat_2d_n7_randomread d_rhor_spline;
-  t_ffloat_2d_n7_randomread d_z2r_spline;
+  t_ffloat4_randomread d_frho_spline_a, d_frho_spline_b;
+  t_ffloat4_randomread d_rhor_spline_a, d_rhor_spline_b;
+  t_ffloat4_randomread d_z2r_spline_a, d_z2r_spline_b;
+  void interpolate(int, double, double *, t_host_ffloat4, t_host_ffloat4, int);
 
   virtual void file2array();
   void array2spline();
-  void interpolate(int, double, double *, t_host_ffloat_2d_n7, int);
 
   typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread d_ilist;
-- 
GitLab