From 30431d4edb77a52f7c2c46d8dfde95888242c0f8 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 8 Aug 2017 16:57:27 -0400
Subject: [PATCH] rework Install.py for gpu library. make it consistent with
 other lib folders and support python3

---
 lib/gpu/.gitignore                 | 10 +++---
 lib/gpu/Install.py                 | 58 ++++++++++++++++--------------
 lib/gpu/Makefile.linux             |  2 +-
 lib/gpu/Makefile.mingw32-cross     | 17 ---------
 lib/gpu/Makefile.mingw32-cross-mpi | 19 ----------
 lib/gpu/Makefile.mingw64-cross     | 18 ----------
 lib/gpu/Makefile.mingw64-cross-mpi | 20 -----------
 lib/gpu/Makefile.mpi               |  1 +
 lib/gpu/Makefile.serial            | 31 ++++++++++++----
 9 files changed, 63 insertions(+), 113 deletions(-)
 delete mode 100644 lib/gpu/Makefile.mingw32-cross
 delete mode 100644 lib/gpu/Makefile.mingw32-cross-mpi
 delete mode 100644 lib/gpu/Makefile.mingw64-cross
 delete mode 100644 lib/gpu/Makefile.mingw64-cross-mpi
 create mode 120000 lib/gpu/Makefile.mpi

diff --git a/lib/gpu/.gitignore b/lib/gpu/.gitignore
index 228a9f7731..9ad6046a09 100644
--- a/lib/gpu/.gitignore
+++ b/lib/gpu/.gitignore
@@ -1,4 +1,6 @@
-obj
-obj_ocl
-ocl_get_devices
-nvc_get_devices
+/obj
+/obj_ocl
+/ocl_get_devices
+/nvc_get_devices
+/*.cubin
+/*_cubin.h
diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py
index c6cd1f3021..657f1c8fcc 100644
--- a/lib/gpu/Install.py
+++ b/lib/gpu/Install.py
@@ -3,53 +3,57 @@
 # Install.py tool to build the GPU library
 # used to automate the steps described in the README file in this dir
 
-import sys,os,re,commands
+from __future__ import print_function
+import sys,os,subprocess
 
 # help message
 
 help = """
-Syntax from src dir: make lib-gpu args="-i isuffix -h hdir -a arch -p precision -e esuffix -m -o osuffix"
-Syntax from lib dir: python Install.py -i isuffix -h hdir -a arch -p precision -e esuffix -m -o osuffix
+Syntax from src dir: make lib-gpu args="-m machine -h hdir -a arch -p precision -e esuffix -b -o osuffix"
+Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -e esuffix -b -o osuffix
 
 specify one or more options, order does not matter
 
-copies an existing Makefile.isuffix in lib/gpu to Makefile.auto 
+copies an existing Makefile.machine in lib/gpu to Makefile.auto 
 optionally edits these variables in Makefile.auto:
   CUDA_HOME, CUDA_ARCH, CUDA_PRECISION, EXTRAMAKE
 optionally uses Makefile.auto to build the GPU library -> libgpu.a
   and to copy a Makefile.lammps.esuffix -> Makefile.lammps
 optionally copies Makefile.auto to a new Makefile.osuffix
 
-  -i = use Makefile.isuffix as starting point, copy to Makefile.auto
-       default isuffix = linux
+  -m = use Makefile.machine as starting point, copy to Makefile.auto
+       default machine = linux
   -h = set CUDA_HOME variable in Makefile.auto to hdir
        hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
   -a = set CUDA_ARCH variable in Makefile.auto to arch
-       use arch = ?? for K40 (Tesla)
-       use arch = 37 for dual K80 (Tesla)
-       use arch = 60 for P100 (Pascal)
+       use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0) 
+                     or GeForce GTX 580 or similar
+       use arch = 30 for Tesla K10 (Kepler)
+       use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
+       use arch = 37 for Tesla dual K80 (Kepler)
+       use arch = 60 for Tesla P100 (Pascal)
   -p = set CUDA_PRECISION variable in Makefile.auto to precision
        use precision = double or mixed or single
   -e = set EXTRAMAKE variable in Makefile.auto to Makefile.lammps.esuffix
-  -m = make the GPU library using Makefile.auto
+  -b = make the GPU library using Makefile.auto
        first performs a "make clean"
-       produces libgpu.a if successful
+       then produces libgpu.a if successful
        also copies EXTRAMAKE file -> Makefile.lammps
          -e can set which Makefile.lammps.esuffix file is copied
   -o = copy final Makefile.auto to Makefile.osuffix
 
 Examples:
 
-make lib-gpu args="-m"      # build GPU lib with default Makefile.linux
-make lib-gpu args="-i xk7 -p single -o xk7.single"      # create new Makefile.xk7.single, altered for single-precision
-make lib-gpu args="-i xk7 -p single -o xk7.single -m"   # ditto, also build GPU lib
+make lib-gpu args="-b"      # build GPU lib with default Makefile.linux
+make lib-gpu args="-m xk7 -p single -o xk7.single"      # create new Makefile.xk7.single, altered for single-precision
+make lib-gpu args="-m mpi -a 35 -p mixed -o mpi.mixed -b" # create new Makefile.mpi.mixed, also build GPU lib with these settings
 """
 
 # print error message or help
 
 def error(str=None):
-  if not str: print help
-  else: print "ERROR",str
+  if not str: print(help)
+  else: print("ERROR",str)
   sys.exit()
 
 # parse args
@@ -65,7 +69,7 @@ outflag = 0
 
 iarg = 0
 while iarg < nargs:
-  if args[iarg] == "-i":
+  if args[iarg] == "-m":
     if iarg+2 > nargs: error()
     isuffix = args[iarg+1]
     iarg += 2
@@ -89,7 +93,7 @@ while iarg < nargs:
     eflag = 1
     lmpsuffix = args[iarg+1]
     iarg += 2
-  elif args[iarg] == "-m":
+  elif args[iarg] == "-b":
     makeflag = 1
     iarg += 1
   elif args[iarg] == "-o":
@@ -117,9 +121,9 @@ fp = open("Makefile.auto",'w')
 for line in lines:
   words = line.split()
   if len(words) != 3:
-    print >>fp,line,
+    fp.write(line)
     continue
-  
+
   if hflag and words[0] == "CUDA_HOME" and words[1] == '=':
     line = line.replace(words[2],hdir)
   if aflag and words[0] == "CUDA_ARCH" and words[1] == '=':
@@ -128,20 +132,20 @@ for line in lines:
     line = line.replace(words[2],precstr)
   if eflag and words[0] == "EXTRAMAKE" and words[1] == '=':
     line = line.replace(words[2],"Makefile.lammps.%s" % lmpsuffix)
-    
-  print >>fp,line,
 
+  fp.write(line)
 fp.close()
 
 # perform make
 # make operations copies EXTRAMAKE file to Makefile.lammps
 
 if makeflag:
-  print "Building libgpu.a ..."
+  print("Building libgpu.a ...")
   cmd = "rm -f libgpu.a"
-  commands.getoutput(cmd)
+  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
   cmd = "make -f Makefile.auto clean; make -f Makefile.auto"
-  commands.getoutput(cmd)
+  txt = subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
+  print(txt.decode('UTF-8'))
   if not os.path.exists("libgpu.a"):
     error("Build of lib/gpu/libgpu.a was NOT successful")
   if not os.path.exists("Makefile.lammps"):
@@ -150,6 +154,6 @@ if makeflag:
 # copy new Makefile.auto to Makefile.osuffix
 
 if outflag:
-  print "Creating new Makefile.%s" % osuffix
+  print("Creating new Makefile.%s" % osuffix)
   cmd = "cp Makefile.auto Makefile.%s" % osuffix
-  commands.getoutput(cmd)
+  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux
index d72c0ba437..dfcc5bf7d3 100644
--- a/lib/gpu/Makefile.linux
+++ b/lib/gpu/Makefile.linux
@@ -37,7 +37,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
 CUDA_LIB = -L$(CUDA_HOME)/lib64
 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math $(LMP_INC)
 
-CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
 CUDR_OPTS = -O2 $(LMP_INC) # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
diff --git a/lib/gpu/Makefile.mingw32-cross b/lib/gpu/Makefile.mingw32-cross
deleted file mode 100644
index 6f77634755..0000000000
--- a/lib/gpu/Makefile.mingw32-cross
+++ /dev/null
@@ -1,17 +0,0 @@
-CUDA_HOME = ../../tools/mingw-cross/OpenCL
-
-OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-        -I$(CUDA_HOME)/include
-OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic -L../../src/STUBS -lmpi_mingw32
-OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DFERMI_OCL
-EXTRAMAKE = Makefile.lammps.mingw-cross
-
-BIN_DIR = Obj_mingw32
-OBJ_DIR = Obj_mingw32
-LIB_DIR = Obj_mingw32
-AR = i686-w64-mingw32-ar
-BSH = /bin/sh
-
-include Opencl.makefile
diff --git a/lib/gpu/Makefile.mingw32-cross-mpi b/lib/gpu/Makefile.mingw32-cross-mpi
deleted file mode 100644
index 94099cd90b..0000000000
--- a/lib/gpu/Makefile.mingw32-cross-mpi
+++ /dev/null
@@ -1,19 +0,0 @@
-CUDA_HOME = ../../tools/mingw-cross/OpenCL
-
-OCL_CPP = i686-w64-mingw32-g++ -O2 -march=i686 -mtune=generic -mfpmath=387 \
-        -mpc64 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-	-I../../tools/mingw-cross/mpich2-win32/include/ \
-        -DMPICH_IGNORE_CXX_SEEK
-OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw32 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-	-L../../tools/mingw-cross/mpich2-win32/lib -lmpi
-OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DFERMI_OCL
-EXTRAMAKE = Makefile.lammps.mingw-cross
-
-BIN_DIR = Obj_mingw32-mpi
-OBJ_DIR = Obj_mingw32-mpi
-LIB_DIR = Obj_mingw32-mpi
-AR = i686-w64-mingw32-ar
-BSH = /bin/sh
-
-include Opencl.makefile
diff --git a/lib/gpu/Makefile.mingw64-cross b/lib/gpu/Makefile.mingw64-cross
deleted file mode 100644
index 54f6af8c65..0000000000
--- a/lib/gpu/Makefile.mingw64-cross
+++ /dev/null
@@ -1,18 +0,0 @@
-CUDA_HOME = ../../tools/mingw-cross/OpenCL
-
-OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-	-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS \
-        -I$(CUDA_HOME)/include
-OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-	-L../../src/STUBS -lmpi_mingw64
-OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DFERMI_OCL
-EXTRAMAKE = Makefile.lammps.mingw-cross
-
-BIN_DIR = Obj_mingw64
-OBJ_DIR = Obj_mingw64
-LIB_DIR = Obj_mingw64
-AR = x86_64-w64-mingw32-ar
-BSH = /bin/sh
-
-include Opencl.makefile
diff --git a/lib/gpu/Makefile.mingw64-cross-mpi b/lib/gpu/Makefile.mingw64-cross-mpi
deleted file mode 100644
index 2ff72d98b1..0000000000
--- a/lib/gpu/Makefile.mingw64-cross-mpi
+++ /dev/null
@@ -1,20 +0,0 @@
-CUDA_HOME = ../../tools/mingw-cross/OpenCL
-
-OCL_CPP = x86_64-w64-mingw32-g++ -O3 -march=core2 -mtune=core2 -mpc64 \
-	-msse2 -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include \
-	-I../../tools/mingw-cross/mpich2-win64/include/ \
-        -DMPICH_IGNORE_CXX_SEEK
- 
-OCL_LINK = -static -Wl,--enable-stdcall-fixup -L$(CUDA_HOME)/../Obj_mingw64 -Wl,-Bdynamic,-lOpenCL,-Bstatic \
-	-L../../tools/mingw-cross/mpich2-win64/lib -lmpi
-OCL_PREC = -D_SINGLE_DOUBLE
-OCL_TUNE = -DFERMI_OCL
-EXTRAMAKE = Makefile.lammps.mingw-cross
-
-BIN_DIR = Obj_mingw64-mpi
-OBJ_DIR = Obj_mingw64-mpi
-LIB_DIR = Obj_mingw64-mpi
-AR = x86_64-w64-mingw32-ar
-BSH = /bin/sh
-
-include Opencl.makefile
diff --git a/lib/gpu/Makefile.mpi b/lib/gpu/Makefile.mpi
new file mode 120000
index 0000000000..8bad27d081
--- /dev/null
+++ b/lib/gpu/Makefile.mpi
@@ -0,0 +1 @@
+Makefile.linux
\ No newline at end of file
diff --git a/lib/gpu/Makefile.serial b/lib/gpu/Makefile.serial
index 809e99cc94..9348dc565a 100644
--- a/lib/gpu/Makefile.serial
+++ b/lib/gpu/Makefile.serial
@@ -1,5 +1,5 @@
 # /* ----------------------------------------------------------------------   
-#  Generic Makefile for CUDA using MPI STUBS library
+#  Generic Linux Makefile for CUDA 
 #     - Change CUDA_ARCH for your GPU
 # ------------------------------------------------------------------------- */
 
@@ -7,23 +7,38 @@
 
 EXTRAMAKE = Makefile.lammps.standard
 
-CUDA_HOME = $(HOME)/cuda
+ifeq ($(CUDA_HOME),)
+CUDA_HOME = /usr/local/cuda
+endif
+
 NVCC = nvcc
 
 # Tesla CUDA
-CUDA_ARCH = -arch=sm_20
+CUDA_ARCH = -arch=sm_21
 # newer CUDA
 #CUDA_ARCH = -arch=sm_13
 # older CUDA
 #CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
+CUDA_ARCH = -arch=sm_35
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+# precision for GPU calculations
+# -D_SINGLE_SINGLE  # Single precision for all calculations
+# -D_DOUBLE_DOUBLE  # Double precision for all calculations
+# -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
 
 CUDA_PRECISION = -D_SINGLE_DOUBLE
+
 CUDA_INCLUDE = -I$(CUDA_HOME)/include
-CUDA_LIB = -L$(CUDA_HOME)/lib64 -L../../src/STUBS -lmpi
-CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
+CUDA_LIB = -L$(CUDA_HOME)/lib64 -L../../src/STUBS -lmpi_stubs
+CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math $(LMP_INC)
 
-CUDR_CPP = g++ -DMPI_GERYON -DUCL_NO_EXIT -I../../src/STUBS
-CUDR_OPTS = -O2 
+CUDR_CPP = g++ -DMPI_GERYON -DUCL_NO_EXIT -fPIC -I../../src/STUBS
+CUDR_OPTS = -O2 $(LMP_INC) # -xHost -no-prec-div -ansi-alias
 
 BIN_DIR = ./
 OBJ_DIR = ./
@@ -31,5 +46,7 @@ LIB_DIR = ./
 AR = ar
 BSH = /bin/sh
 
+CUDPP_OPT = -DUSE_CUDPP -Icudpp_mini
+
 include Nvidia.makefile
 
-- 
GitLab