Optimize data access in pair style agni and use an approximate exponential in the USER-OMP version

d0124eac · Axel Kohlmeyer · 5685131f · d0124eac · d0124eac
Commit d0124eac authored 8 years ago by Axel Kohlmeyer
--- a/src/USER-MISC/pair_agni.cpp
+++ b/src/USER-MISC/pair_agni.cpp
@@ -66,11 +66,6 @@ static const char cite_pair_agni[] =
 #define MAXLINE 10240
 #define MAXWORD 40

-struct _3vec {
-  double x,y,z;
-};
-typedef struct _3vec _3vec_t;
-
 /* ---------------------------------------------------------------------- */

 PairAGNI::PairAGNI(LAMMPS *lmp) : Pair(lmp)
@@ -145,7 +140,7 @@ void PairAGNI::compute(int eflag, int vflag)
  firstneigh = list->firstneigh;

  double fxtmp,fytmp,fztmp;
-  _3vec_t *V;
+  double *Vx, *Vy, *Vz;

  // loop over full neighbor list of my atoms

@@ -158,8 +153,12 @@ void PairAGNI::compute(int eflag, int vflag)
    fxtmp = fytmp = fztmp = 0.0;

    const Param &iparam = params[elem2param[itype]];
-    V = new _3vec_t[iparam.numeta];
-    memset(V,0,iparam.numeta *sizeof(_3vec_t));
+    Vx = new double[iparam.numeta];
+    Vy = new double[iparam.numeta];
+    Vz = new double[iparam.numeta];
+    memset(Vx,0,iparam.numeta*sizeof(double));
+    memset(Vy,0,iparam.numeta*sizeof(double));
+    memset(Vz,0,iparam.numeta*sizeof(double));

    jlist = firstneigh[i];
    jnum = numneigh[i];
@@ -179,12 +178,12 @@ void PairAGNI::compute(int eflag, int vflag)
        const double wX = cF*delx/r;
        const double wY = cF*dely/r;
        const double wZ = cF*delz/r;
-        
+
        for (k = 0; k < iparam.numeta; ++k) {
          const double e = exp(-(iparam.eta[k]*rsq));
-          V[k].x += wX*e;
-          V[k].y += wY*e;
-          V[k].z += wZ*e;
+          Vx[k] += wX*e;
+          Vy[k] += wY*e;
+          Vz[k] += wZ*e;
        }
      }
    }
@@ -192,13 +191,13 @@ void PairAGNI::compute(int eflag, int vflag)
    for (j = 0; j < iparam.numtrain; ++j) {
      double kx = 0.0;
      double ky = 0.0;
-      double kz = 0.0;    
+      double kz = 0.0;

      for(int k = 0; k < iparam.numeta; ++k) {
        const double xu = iparam.xU[k][j];
-        kx += square(V[k].x - xu);  
-        ky += square(V[k].y - xu);
-        kz += square(V[k].z - xu);        
+        kx += square(Vx[k] - xu);
+        ky += square(Vy[k] - xu);
+        kz += square(Vz[k] - xu);
      }
      const double e = -0.5/(square(iparam.sigma));
      fxtmp += iparam.alpha[j]*exp(kx*e);
@@ -214,7 +213,9 @@ void PairAGNI::compute(int eflag, int vflag)

    if (evflag) ev_tally_xyz_full(i,0.0,0.0,fxtmp,fytmp,fztmp,delx,dely,delz);

-    delete [] V;
+    delete [] Vx;
+    delete [] Vy;
+    delete [] Vz;
  }

  if (vflag_fdotr) virial_fdotr_compute();
@@ -428,7 +429,7 @@ void PairAGNI::read_file(char *file)
      params[curparam].xU = new double*[numeta];
      for (i = 0; i < numeta; ++i)
        params[curparam].xU[i] = new double[numtrain];
-      
+
      wantdata = curparam;
      curparam = -1;
    } else if ((curparam >=0) && (nwords == 2) && (strcmp(words[0],"Rc") == 0)) {
@@ -457,7 +458,7 @@ void PairAGNI::read_file(char *file)
      }
      params[wantdata].yU[n] = atof(words[params[wantdata].numeta+1]);
      params[wantdata].alpha[n] = atof(words[params[wantdata].numeta+2]);
-      
+
    } else {
      if (comm->me == 0)
        error->warning(FLERR,"Ignoring unknown content in AGNI potential file.");

--- a/src/USER-OMP/pair_agni_omp.cpp
+++ b/src/USER-OMP/pair_agni_omp.cpp
@@ -14,6 +14,7 @@

 #include <math.h>
 #include <string.h>
+#include <stdint.h>
 #include "pair_agni_omp.h"
 #include "atom.h"
 #include "comm.h"
@@ -28,6 +29,127 @@
 using namespace LAMMPS_NS;
 using namespace MathSpecial;

+/*
+   Copyright (c) 2012,2013   Axel Kohlmeyer <akohlmey@gmail.com>
+   All rights reserved.
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   * Redistributions of source code must retain the above copyright
+     notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+   * Neither the name of the <organization> nor the
+     names of its contributors may be used to endorse or promote products
+     derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/* faster versions of 2**x and e**x in double precision; only this subset
+ * of the original 2**x, e**x, and 10**x single/double routines is used here.
+ * Based on the Cephes math library 2.8
+ */
+
+/* internal definitions for the fastermath library */
+
+/* IEEE 754 double precision floating point data manipulation */
+typedef union                    /* overlay giving bit-level access to one double */
+{
+    double   f;                  /* the value read as a floating point number */
+    uint64_t u;                  /* the same 64 bits as a single unsigned integer */
+    struct {int32_t  i0,i1;};    /* the same bits as two 32-bit words; FM_DOUBLE_INIT_EXP stores the exponent in i1 */
+}  udi_t;
+#define FM_DOUBLE_BIAS 1023            /* added to the exponent before it is shifted into i1 */
+#define FM_DOUBLE_EMASK 2146435072     /* 0x7ff00000 */
+#define FM_DOUBLE_MBITS 20             /* same value as the shift count hard-coded in FM_DOUBLE_INIT_EXP */
+#define FM_DOUBLE_MMASK 1048575        /* 0x000fffff */
+#define FM_DOUBLE_EZERO 1072693248     /* 0x3ff00000 == FM_DOUBLE_BIAS << FM_DOUBLE_MBITS */
+
+/* generate 2**num by bitshifting; expands to ONE statement; needs -1022 <= num <= 1023 */
+#define FM_DOUBLE_INIT_EXP(var,num)                             \
+    do { (var).i0 = 0;                                          \
+         (var).i1 = (((int) (num)) + FM_DOUBLE_BIAS) << 20; } while (0)
+
+/* double precision constants */
+#define FM_DOUBLE_LOG2OFE  1.4426950408889634074
+#define FM_DOUBLE_LOGEOF2  6.9314718055994530942e-1
+#define FM_DOUBLE_LOG2OF10 3.32192809488736234789
+#define FM_DOUBLE_LOG10OF2 3.0102999566398119521e-1
+#define FM_DOUBLE_LOG10OFE 4.3429448190325182765e-1
+#define FM_DOUBLE_SQRT2    1.41421356237309504880
+#define FM_DOUBLE_SQRTH    0.70710678118654752440
+
+/* optimizer friendly implementation of exp2(x).
+ *
+ * strategy:
+ *
+ * split argument into an integer part and a fraction:
+ * ipart = floor(x+0.5);
+ * fpart = x - ipart;
+ *
+ * compute exp2(ipart) from setting the ieee754 exponent
+ * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
+ *
+ * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
+ */
+
+static const double fm_exp2_q[] = {    /* coefficients of qx in fm_exp2(), highest power first */
+/*  1.00000000000000000000e0, */       /* leading coefficient is 1, so it is not stored */
+    2.33184211722314911771e2,
+    4.36821166879210612817e3
+};
+static const double fm_exp2_p[] = {    /* coefficients of px in fm_exp2(), highest power first */
+    2.30933477057345225087e-2,
+    2.02020656693165307700e1,
+    1.51390680115615096133e3
+};
+
+/* 2**x = 2**round(x) * 2**frac: exponent bits times a rational approximation */
+static double fm_exp2(double x)
+{
+    const double whole = floor(x+0.5);   /* nearest integer part */
+    const double frac  = x - whole;      /* remainder, in [-0.5;0.5[ */
+    const double fsq   = frac*frac;
+
+    udi_t scale;
+    FM_DOUBLE_INIT_EXP(scale,whole);     /* scale.f = 2**whole */
+
+    /* numerator polynomial in frac**2, multiplied by frac afterwards */
+    double numer = fm_exp2_p[0];
+    numer = numer*fsq + fm_exp2_p[1];
+    numer = numer*fsq + fm_exp2_p[2];
+    numer = numer * frac;
+
+    /* denominator polynomial in frac**2 with implicit leading coefficient 1 */
+    double denom = fsq + fm_exp2_q[0];
+    denom = denom*fsq + fm_exp2_q[1];
+
+    return scale.f*(1.0 + 2.0*(numer/(denom-numer)));
+}
+
+/* approximate e**x; returns exp(x) itself whenever the fast path is not safe */
+static double fm_exp(double x)
+{
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+    /* fm_exp2() writes exponent bits directly: the result must be a normal double */
+    if (x > -708.0 && x < 709.0) return fm_exp2(FM_DOUBLE_LOG2OFE * (x));
+#endif
+    return exp(x);   /* out-of-range or NaN argument, or other byte order */
+}
+
 /* ---------------------------------------------------------------------- */

 PairAGNIOMP::PairAGNIOMP(LAMMPS *lmp) :
@@ -85,7 +207,7 @@ void PairAGNIOMP::eval(int iifrom, int iito, ThrData * const thr)
  firstneigh = list->firstneigh;

  double fxtmp,fytmp,fztmp;
-  dbl3_t *V;
+  double *Vx, *Vy, *Vz;

  // loop over full neighbor list of my atoms

@@ -99,8 +221,12 @@ void PairAGNIOMP::eval(int iifrom, int iito, ThrData * const thr)
    fxtmp = fytmp = fztmp = 0.0;

    const Param &iparam = params[elem2param[itype]];
-    V = new dbl3_t[iparam.numeta];
-    memset(V,0,iparam.numeta *sizeof(dbl3_t));
+    Vx = new double[iparam.numeta];
+    Vy = new double[iparam.numeta];
+    Vz = new double[iparam.numeta];
+    memset(Vx,0,iparam.numeta*sizeof(double));
+    memset(Vy,0,iparam.numeta*sizeof(double));
+    memset(Vz,0,iparam.numeta*sizeof(double));

    jlist = firstneigh[i];
    jnum = numneigh[i];
@@ -120,12 +246,12 @@ void PairAGNIOMP::eval(int iifrom, int iito, ThrData * const thr)
        const double wX = cF*delx/r;
        const double wY = cF*dely/r;
        const double wZ = cF*delz/r;
-        
+
        for (k = 0; k < iparam.numeta; ++k) {
-          const double e = exp(-(iparam.eta[k]*rsq));
-          V[k].x += wX*e;
-          V[k].y += wY*e;
-          V[k].z += wZ*e;
+          const double e = fm_exp(-(iparam.eta[k]*rsq));
+          Vx[k] += wX*e;
+          Vy[k] += wY*e;
+          Vz[k] += wZ*e;
        }
      }
    }
@@ -133,18 +259,18 @@ void PairAGNIOMP::eval(int iifrom, int iito, ThrData * const thr)
    for (j = 0; j < iparam.numtrain; ++j) {
      double kx = 0.0;
      double ky = 0.0;
-      double kz = 0.0;    
+      double kz = 0.0;

      for(int k = 0; k < iparam.numeta; ++k) {
        const double xu = iparam.xU[k][j];
-        kx += square(V[k].x - xu);  
-        ky += square(V[k].y - xu);
-        kz += square(V[k].z - xu);        
+        kx += square(Vx[k] - xu);
+        ky += square(Vy[k] - xu);
+        kz += square(Vz[k] - xu);
      }
      const double e = -0.5/(square(iparam.sigma));
-      fxtmp += iparam.alpha[j]*exp(kx*e);
-      fytmp += iparam.alpha[j]*exp(ky*e);
-      fztmp += iparam.alpha[j]*exp(kz*e);
+      fxtmp += iparam.alpha[j]*fm_exp(kx*e);
+      fytmp += iparam.alpha[j]*fm_exp(ky*e);
+      fztmp += iparam.alpha[j]*fm_exp(kz*e);
    }
    fxtmp += iparam.b;
    fytmp += iparam.b;
@@ -156,7 +282,9 @@ void PairAGNIOMP::eval(int iifrom, int iito, ThrData * const thr)
    if (EVFLAG) ev_tally_xyz_full_thr(this,i,0.0,0.0,
                                      fxtmp,fytmp,fztmp,
                                      delx,dely,delz,thr);
-    delete [] V;
+    delete [] Vx;
+    delete [] Vy;
+    delete [] Vz;
  }
 }