From de8176b4fc0556d758d7c49bea43f92c5006f234 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 20 Jul 2018 14:41:54 -0400
Subject: [PATCH] various minor OpenCL-related fixes and improvements to the
 GPU package

- document previously undocumented OpenCL tune settings
- implement OpenCL platform selection by prefixing the device type with the platform ID, separated by a colon
- allow passing custom tune parameters by suffixing the device type with the 13 tunable parameters, separated by commas
- remove an extra clear() that would delete device properties structs and cause LAMMPS to output garbage strings
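
Example usage of the new syntax (mirroring the examples added to
doc/src/package.txt below):

  package gpu 1 device 2:generic
  package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128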
---
 doc/src/package.txt         | 49 +++++++++++++++++++++++++++++++------
 lib/gpu/geryon/ocl_device.h | 16 ++++++------
 lib/gpu/lal_device.cpp      | 26 ++++++++++++++++----
 lib/gpu/lal_device.h        |  2 +-
 src/GPU/gpu_extra.h         |  3 +++
 5 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/doc/src/package.txt b/doc/src/package.txt
index 5c698934e8..5fd42f67d3 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -33,8 +33,10 @@ args = arguments specific to the style :l
         last = ID of last GPU to be used on each node
       {tpa} value = Nthreads
         Nthreads = # of GPU threads used per atom
-      {device} value = device_type
-        device_type = {kepler} or {fermi} or {cypress} or {generic}
+      {device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,...,val13
+        platform_id = numerical OpenCL platform id (default: -1)
+        device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom}
+        val1,val2,... = custom OpenCL tune parameters (see below for details)
       {blocksize} value = size
         size = thread block size for pair force computation
   {intel} args = NPhi keyword value ...
@@ -96,6 +98,9 @@ args = arguments specific to the style :l
 package gpu 1
 package gpu 1 split 0.75
 package gpu 2 split -1.0
+package gpu 1 device kepler
+package gpu 1 device 2:generic
+package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
 package kokkos neigh half comm device
 package omp 0 neigh no
 package omp 4
@@ -244,12 +249,40 @@ the value can improve performance. The number of threads per atom must
 be a power of 2 and currently cannot be greater than 32.
 
 The {device} keyword can be used to tune parameters optimized for a
-specific accelerator, when using OpenCL.  For CUDA, the {device}
-keyword is ignored.  Currently, the device type is limited to NVIDIA
-Kepler, NVIDIA Fermi, AMD Cypress, or a generic device.  More devices
-may be added later.  The default device type can be specified when
-building LAMMPS with the GPU library, via settings in the
-lib/gpu/Makefile that is used.
+specific accelerator and platform when using OpenCL.  OpenCL supports
+the concept of a [platform], which represents one or more devices that
+share the same driver (e.g. there would be a different platform for
+GPUs from different vendors, or for CPU-based accelerator support).
+In LAMMPS, only one platform can be active at a time, and by default
+the first platform with an accelerator is selected.  This is equivalent
+to using a platform ID of -1.  The platform ID is a number corresponding
+to the output of the ocl_get_devices tool.  The platform ID is passed
+to the GPU library by prefixing the {device} argument with that number,
+separated by a colon.  For CUDA, the {device} keyword is ignored.
+Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
+Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
+More devices may be added later.  The default device type can be
+specified when building LAMMPS with the GPU library, by setting a
+variable in the lib/gpu/Makefile being used.
+
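+For example, "package gpu 1 device 2:generic" selects OpenCL platform 2
+and applies the generic device tuning.
+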
+In addition, a device type {custom} is available, which is followed by
+13 comma-separated numbers, and which allows setting these tunable
+parameters from the package command.  It can be combined with the
+(colon-separated) platform ID.  The individual settings are, in order
+(see the example below the list):
+
+MEM_THREADS
+THREADS_PER_ATOM
+THREADS_PER_CHARGE
+BLOCK_PAIR
+MAX_SHARED_TYPES
+BLOCK_NBOR_BUILD
+BLOCK_BIO_PAIR
+BLOCK_ELLIPSE
+WARP_SIZE
+PPPM_BLOCK_1D
+BLOCK_CELL_2D
+BLOCK_CELL_ID
+MAX_BIO_SHARED_TYPES :ul
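+
+For example, "package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128"
+sets MEM_THREADS to 32, THREADS_PER_ATOM to 4, and so on, in the order
+given above.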
 
 The {blocksize} keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h
index 2b2367545e..14455e38a5 100644
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@@ -165,8 +165,8 @@ class UCL_Device {
   /// Get the current OpenCL device name
   inline std::string name() { return name(_device); }
   /// Get the OpenCL device name
-  inline std::string name(const int i)
-    { return std::string(_properties[i].name); }
+  inline std::string name(const int i) {
+    return std::string(_properties[i].name); }
 
   /// Get a string telling the type of the current device
   inline std::string device_type_name() { return device_type_name(_device); }
@@ -281,7 +281,7 @@ class UCL_Device {
   inline cl_device_id & cl_device() { return _cl_device; }
 
   /// Select the platform that has accelerators
-  inline void set_platform_accelerator(int pid=-1);
+  inline int set_platform_accelerator(int pid=-1);
 
  private:
   int _num_platforms;          // Number of platforms
@@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() {
 
 void UCL_Device::clear() {
   _properties.clear();
+  _cl_devices.clear();
   if (_device>-1) {
     for (size_t i=0; i<_cq.size(); i++) {
       CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) {
 
 // Set the CUDA device to the specified device number
 int UCL_Device::set(int num) {
-  clear();
-
   cl_device_id *device_list = new cl_device_id[_num_devices];
   cl_uint n;
   CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
@@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) {
 
 // Select the platform that is associated with accelerators
 // if pid < 0, select the first platform
-void UCL_Device::set_platform_accelerator(int pid) {
+int UCL_Device::set_platform_accelerator(int pid) {
   if (pid < 0) {
     int found = 0;
     for (int n=0; n<_num_platforms; n++) {
@@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) {
           break;
         }
       }
-      if (found) break;
+      if (found) return UCL_SUCCESS;
     }
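+    // no platform with a suitable accelerator was found; report an error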
+    return UCL_ERROR;
   } else {
-    set_platform(pid);
+    return set_platform(pid);
   }
 }
 
diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp
index 0ea128a5b3..7f54432a74 100644
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@@ -34,8 +34,8 @@ using namespace LAMMPS_AL;
 
 template <class numtyp, class acctyp>
 DeviceT::Device() : _init_count(0), _device_init(false),
-                                  _gpu_mode(GPU_FORCE), _first_device(0),
-                                  _last_device(0), _compiled(false) {
+                    _gpu_mode(GPU_FORCE), _first_device(0),
+                    _last_device(0), _platform_id(-1), _compiled(false) {
 }
 
 template <class numtyp, class acctyp>
@@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
   _particle_split=p_split;
   _cell_size=cell_size;
   _block_pair=block_pair;
+  // support selecting the platform through the "package device" keyword:
+  // "0:generic" will select platform 0 and tune for a generic device
+  // "1:fermi" will select platform 1 and tune for an NVIDIA Fermi GPU
+  if (ocl_vendor) {
+    char *sep = NULL;
+    if ((sep = strstr(ocl_vendor,":"))) {
+      *sep = '\0';
+      _platform_id = atoi(ocl_vendor);
+      ocl_vendor = sep+1;
+    }
+  }
 
   // Get the rank/size within the world
   MPI_Comm_rank(_comm_world,&_world_me);
@@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
     return -7;
   #endif
 
+  if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS)
+    return -12;
+
   if (gpu->set(my_gpu)!=UCL_SUCCESS)
     return -6;
 
@@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
     _ocl_vendor_string="-DUSE_OPENCL";
     int token_count=0;
     std::string params[13];
-    char *pch = strtok(ocl_vendor,"\" ");
+    char *pch = strtok(ocl_vendor,",");  // the first token is the "custom" keyword
+    pch = strtok(NULL,",");              // advance to the first tune parameter
+    if (pch == NULL) return -11;
     while (pch != NULL) {
       if (token_count==13)
         return -11;
       params[token_count]=pch;
       token_count++;
-      pch = strtok(NULL,"\" ");
+      pch = strtok(NULL,",");
     }
     _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+
                         " -DTHREADS_PER_ATOM="+params[1]+
@@ -656,7 +672,7 @@ int DeviceT::compile_kernels() {
   dev_program=new UCL_Program(*gpu);
   int success=dev_program->load_string(device,compile_string().c_str());
   if (success!=UCL_SUCCESS)
-    return -4;
+    return -6;
   k_zero.set_function(*dev_program,"kernel_zero");
   k_info.set_function(*dev_program,"kernel_info");
   _compiled=true;
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 95e9f2a430..695b0a62f9 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -292,7 +292,7 @@ class Device {
   MPI_Comm _comm_world, _comm_replica, _comm_gpu;
   int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
       _replica_size;
-  int _gpu_mode, _first_device, _last_device, _nthreads;
+  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
   double _particle_split;
   double _cpu_full;
   double _ptx_arch;
diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h
index 56a4f15f1b..111d13c563 100644
--- a/src/GPU/gpu_extra.h
+++ b/src/GPU/gpu_extra.h
@@ -58,6 +58,9 @@ namespace GPU_EXTRA {
       else if (all_success == -11)
         error->all(FLERR,
                    "Invalid custom OpenCL parameter string.");
+      else if (all_success == -12)
+        error->all(FLERR,
+                   "Invalid OpenCL platform ID.");
       else
         error->all(FLERR,"Unknown error in GPU library");
     }
-- 
GitLab