From de8176b4fc0556d758d7c49bea43f92c5006f234 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer <akohlmey@gmail.com> Date: Fri, 20 Jul 2018 14:41:54 -0400 Subject: [PATCH] various minor OpenCL related fixes and improvements to the GPU package - document previously undocumented OpenCL tune settings - implement OpenCL platform selection through prefixing the device type with the platform id separated by a colon - allow passing custom tune parameters through postfixing the device type with the 13 tuneable parameters separated by commas - remove an extra clear() that would delete device properties structs and cause LAMMPS to output garbage strings --- doc/src/package.txt | 49 +++++++++++++++++++++++++++++++------ lib/gpu/geryon/ocl_device.h | 16 ++++++------ lib/gpu/lal_device.cpp | 26 ++++++++++++++++---- lib/gpu/lal_device.h | 2 +- src/GPU/gpu_extra.h | 3 +++ 5 files changed, 74 insertions(+), 22 deletions(-) diff --git a/doc/src/package.txt b/doc/src/package.txt index 5c698934e8..5fd42f67d3 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -33,8 +33,10 @@ args = arguments specific to the style :l last = ID of last GPU to be used on each node {tpa} value = Nthreads Nthreads = # of GPU threads used per atom - {device} value = device_type - device_type = {kepler} or {fermi} or {cypress} or {generic} + {device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13 + platform_id = numerical OpenCL platform id (default: -1) + device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom} + val1,val2,... = custom OpenCL tune parameters (see below for details) {blocksize} value = size size = thread block size for pair force computation {intel} args = NPhi keyword value ... 
@@ -96,6 +98,9 @@ args = arguments specific to the style :l package gpu 1 package gpu 1 split 0.75 package gpu 2 split -1.0 +package gpu 1 device kepler +package gpu 1 device 2:generic +package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128 package kokkos neigh half comm device package omp 0 neigh no package omp 4 @@ -244,12 +249,40 @@ the value can improve performance. The number of threads per atom must be a power of 2 and currently cannot be greater than 32. The {device} keyword can be used to tune parameters optimized for a -specific accelerator, when using OpenCL. For CUDA, the {device} -keyword is ignored. Currently, the device type is limited to NVIDIA -Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices -may be added later. The default device type can be specified when -building LAMMPS with the GPU library, via settings in the -lib/gpu/Makefile that is used. +specific accelerator and platform when using OpenCL. OpenCL supports +the concept of a [platform], which represents one or more devices that +share the same driver (e.g. there would be a different platform for +GPUs from different vendors or for CPU based accelerator support). +In LAMMPS only one platform can be active at a time and by default +the first platform with an accelerator is selected. This is equivalent +to using a platform ID of -1. The platform ID is a number corresponding +to the output of the ocl_get_devices tool. The platform ID is passed +to the GPU library, by prefixing the {device} keyword with that number +separated by a colon. For CUDA, the {device} keyword is ignored. +Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA +Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device. +More devices may be added later. The default device type can be +specified when building LAMMPS with the GPU library, via setting a +variable in the lib/gpu/Makefile that is used. 
+ +In addition, a device type {custom} is available, which is followed by +13 comma-separated numbers that set the tunable parameters +from the package command. It can be combined with the (colon separated) +platform id. The individual settings are: + +MEM_THREADS +THREADS_PER_ATOM +THREADS_PER_CHARGE +BLOCK_PAIR +MAX_SHARED_TYPES +BLOCK_NBOR_BUILD +BLOCK_BIO_PAIR +BLOCK_ELLIPSE +WARP_SIZE +PPPM_BLOCK_1D +BLOCK_CELL_2D +BLOCK_CELL_ID +MAX_BIO_SHARED_TYPES :ul The {blocksize} keyword allows you to tweak the number of threads used per thread block. This number should be a multiple of 32 (for GPUs) diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index 2b2367545e..14455e38a5 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -165,8 +165,8 @@ class UCL_Device { /// Get the current OpenCL device name inline std::string name() { return name(_device); } /// Get the OpenCL device name - inline std::string name(const int i) - { return std::string(_properties[i].name); } + inline std::string name(const int i) { + return std::string(_properties[i].name); } /// Get a string telling the type of the current device inline std::string device_type_name() { return device_type_name(_device); } @@ -281,7 +281,7 @@ class UCL_Device { inline cl_device_id & cl_device() { return _cl_device; } /// Select the platform that has accelerators - inline void set_platform_accelerator(int pid=-1); + inline int set_platform_accelerator(int pid=-1); private: int _num_platforms; // Number of platforms @@ -324,6 +324,7 @@ UCL_Device::~UCL_Device() { void UCL_Device::clear() { _properties.clear(); + _cl_devices.clear(); if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back())); @@ -520,8 +521,6 @@ int UCL_Device::device_type(const int i) { // Set the CUDA device to the specified device number int UCL_Device::set(int num) { - clear(); - cl_device_id *device_list = new 
cl_device_id[_num_devices]; cl_uint n; CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, @@ -612,7 +611,7 @@ void UCL_Device::print_all(std::ostream &out) { // Select the platform that is associated with accelerators // if pid < 0, select the first platform -void UCL_Device::set_platform_accelerator(int pid) { +int UCL_Device::set_platform_accelerator(int pid) { if (pid < 0) { int found = 0; for (int n=0; n<_num_platforms; n++) { @@ -625,10 +624,11 @@ void UCL_Device::set_platform_accelerator(int pid) { break; } } - if (found) break; + if (found) return UCL_SUCCESS; } + return UCL_ERROR; } else { - set_platform(pid); + return set_platform(pid); } } diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 0ea128a5b3..7f54432a74 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -34,8 +34,8 @@ using namespace LAMMPS_AL; template <class numtyp, class acctyp> DeviceT::Device() : _init_count(0), _device_init(false), - _gpu_mode(GPU_FORCE), _first_device(0), - _last_device(0), _compiled(false) { + _gpu_mode(GPU_FORCE), _first_device(0), + _last_device(0), _platform_id(-1), _compiled(false) { } template <class numtyp, class acctyp> @@ -67,6 +67,17 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, _particle_split=p_split; _cell_size=cell_size; _block_pair=block_pair; + // support selecting platform through "package device" keyword. 
+ // "0:generic" will select platform 0 and tune for generic device + // "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu + if (ocl_vendor) { + char *sep = NULL; + if ((sep = strstr(ocl_vendor,":"))) { + *sep = '\0'; + _platform_id = atoi(ocl_vendor); + ocl_vendor = sep+1; + } + } // Get the rank/size within the world MPI_Comm_rank(_comm_world,&_world_me); @@ -135,6 +146,9 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, return -7; #endif + if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS) + return -12; + if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; @@ -191,13 +205,15 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { _ocl_vendor_string="-DUSE_OPENCL"; int token_count=0; std::string params[13]; - char *pch = strtok(ocl_vendor,"\" "); + char *pch = strtok(ocl_vendor,","); + pch = strtok(NULL,","); + if (pch == NULL) return -11; while (pch != NULL) { if (token_count==13) return -11; params[token_count]=pch; token_count++; - pch = strtok(NULL,"\" "); + pch = strtok(NULL,","); } _ocl_vendor_string+=" -DMEM_THREADS="+params[0]+ " -DTHREADS_PER_ATOM="+params[1]+ @@ -656,7 +672,7 @@ int DeviceT::compile_kernels() { dev_program=new UCL_Program(*gpu); int success=dev_program->load_string(device,compile_string().c_str()); if (success!=UCL_SUCCESS) - return -4; + return -6; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 95e9f2a430..695b0a62f9 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -292,7 +292,7 @@ class Device { MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; - int _gpu_mode, _first_device, _last_device, _nthreads; + int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads; double _particle_split; double _cpu_full; double _ptx_arch; diff --git a/src/GPU/gpu_extra.h 
b/src/GPU/gpu_extra.h index 56a4f15f1b..111d13c563 100644 --- a/src/GPU/gpu_extra.h +++ b/src/GPU/gpu_extra.h @@ -58,6 +58,9 @@ namespace GPU_EXTRA { else if (all_success == -11) error->all(FLERR, "Invalid custom OpenCL parameter string."); + else if (all_success == -12) + error->all(FLERR, + "Invalid OpenCL platform ID."); else error->all(FLERR,"Unknown error in GPU library"); } -- GitLab