diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt index 14f4103aed8d6df6e782eba0e63c196c2daf380d..597ed73e5316e6eb57d53d227798eeab2cb363ca 100644 --- a/doc/src/Speed_kokkos.txt +++ b/doc/src/Speed_kokkos.txt @@ -96,6 +96,16 @@ software version 7.5 or later must be installed on your system. See the discussion for the "GPU package"_Speed_gpu.html for details of how to check and do this. +NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI +library is CUDA-aware and has support for GPU-direct. This is not always +the case, especially when using pre-compiled MPI libraries provided by +a Linux distribution. This is not a problem when using only a single +GPU and a single MPI rank on a desktop. When running with multiple +MPI ranks, you may see segmentation faults without GPU-direct support. +These can be avoided by adding the flags '-pk kokkos gpu/direct off' +to the LAMMPS command line or by using the command +"package kokkos gpu/direct off"_package.html in the input file. + Use a C++11 compatible compiler and set KOKKOS_ARCH variable in /src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as described above. Then do the following: @@ -262,9 +272,11 @@ the # of physical GPUs on the node. You can assign multiple MPI tasks to the same GPU with the KOKKOS package, but this is usually only faster if significant portions of the input script have not been ported to use Kokkos. Using CUDA MPS is recommended in this -scenario. As above for multi-core CPUs (and no GPU), if N is the -number of physical cores/node, then the number of MPI tasks/node -should not exceed N. +scenario. Using a CUDA-aware MPI library with support for GPU-direct +is highly recommended. GPU-direct use can be avoided by using "-pk kokkos gpu/direct no". +As above for multi-core CPUs (and no GPU), if N is the number of +physical cores/node, then the number of MPI tasks/node should not +exceed N. -k on g Ng :pre diff --git a/doc/src/package.txt b/doc/src/package.txt index c2e7345bb74c4e25a5dc4861194c704d3017232d..3d25a64d3174aacc9218ed4d26e96cae5355ddef 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -84,6 +84,9 @@ args = arguments specific to the style :l no = perform communication pack/unpack in non-KOKKOS mode host = perform pack/unpack on host (e.g. with OpenMP threading) device = perform pack/unpack on device (e.g. on GPU) + {gpu/direct} = {off} or {on} + off = do not use GPU-direct + on = use GPU-direct (default) {omp} args = Nthreads keyword value ... Nthread = # of OpenMP threads to associate with each MPI process zero or more keyword/value pairs may be appended @@ -480,15 +483,15 @@ The value options for all 3 keywords are {no} or {host} or {device}. A value of {no} means to use the standard non-KOKKOS method of packing/unpacking data for the communication. A value of {host} means to use the host, typically a multi-core CPU, and perform the -packing/unpacking in parallel with threads. A value of {device} means -to use the device, typically a GPU, to perform the packing/unpacking -operation. +packing/unpacking in parallel with threads. A value of {device} +means to use the device, typically a GPU, to perform the +packing/unpacking operation. The optimal choice for these keywords depends on the input script and the hardware used. The {no} value is useful for verifying that the -Kokkos-based {host} and {device} values are working correctly. It may -also be the fastest choice when using Kokkos styles in MPI-only mode -(i.e. with a thread count of 1). +Kokkos-based {host} and {device} values are working correctly. +It may also be the fastest choice when using Kokkos styles in +MPI-only mode (i.e. with a thread count of 1). When running on CPUs or Xeon Phi, the {host} and {device} values work identically. When using GPUs, the {device} value will typically be @@ -504,6 +507,18 @@ typically faster to let the host handle communication, by using the {host} value. Using {host} instead of {no} will enable use of multiple threads to pack/unpack communicated data. +The {gpu/direct} keyword chooses whether GPU-direct will be used. When +this keyword is set to {on}, buffers in GPU memory are passed directly +through MPI send/receive calls. This reduces overhead of first copying +the data to the host CPU. However GPU-direct is not supported on all +systems, which can lead to segmentation faults and would require +using a value of {off}. If LAMMPS can safely detect that GPU-direct is +not available (currently only possible with OpenMPI v2.0.0 or later), +then the {gpu/direct} keyword is automatically set to {off} by default. +When the {gpu/direct} keyword is set to {off} while any of the {comm} +keywords are set to {device}, the value for these {comm} keywords will +be automatically changed to {host}. + :line The {omp} style invokes settings associated with the use of the @@ -610,12 +625,14 @@ is used. If it is not used, you must invoke the package intel command in your input script or or via the "-pk intel" "command-line switch"_Section_start.html#start_6. -For the KOKKOS package, the option defaults neigh = full, -neigh/qeq = full, newton = off, binsize = 0.0, and comm = device. -These settings are made automatically by the required "-k on" "command-line -switch"_Section_start.html#start_6. You can change them bu using the -package kokkos command in your input script or via the "-pk kokkos" -"command-line switch"_Section_start.html#start_6. +For the KOKKOS package, the option defaults neigh = full, neigh/qeq = +full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on. +When LAMMPS can safely detect, that GPU-direct is not available, the +default value of gpu/direct becomes "off". +These settings are made automatically by the required "-k on" +"command-line switch"_Section_start.html#start_6. You can change them by +using the package kokkos command in your input script or via the "-pk +kokkos" "command-line switch"_Section_start.html#start_6. For the OMP package, the default is Nthreads = 0 and the option defaults are neigh = yes. These settings are made automatically if diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 96d1c64d0d1aa389caca8c7ef8a635dd406fc2e9..21840b7c3ee8fd396b90e9983cd57c2d49ef64aa 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair) // if self, set recv buffer to send buffer if (sendproc[iswap] != me) { - if (recvnum[iswap]) - MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE, + double* buf_send_pair; + double* buf_recv_pair; + if (lmp->kokkos->gpu_direct_flag) { + buf_send_pair = k_buf_send_pair.view<DeviceType>().data(); + buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data(); + } else { + k_buf_send_pair.modify<DeviceType>(); + k_buf_send_pair.sync<LMPHostType>(); + buf_send_pair = k_buf_send_pair.h_view.data(); + buf_recv_pair = k_buf_recv_pair.h_view.data(); + } + + if (recvnum[iswap]) { + MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE, recvproc[iswap],0,world,&request); + } if (sendnum[iswap]) - MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world); + MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world); if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->gpu_direct_flag) { + k_buf_recv_pair.modify<LMPHostType>(); + k_buf_recv_pair.sync<DeviceType>(); + } } else k_buf_recv_pair = k_buf_send_pair; // unpack buffer diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index 847fa5907a9f05cae3f8e4777097bc92029ebd09..64a9d6992f0f56a64e5cbbb3d6fb674929f5ca41 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -18,6 +18,7 @@ #include "memory_kokkos.h" #include "error.h" #include "kokkos_base.h" +#include "kokkos.h" using namespace LAMMPS_NS; @@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which) DeviceType::fence(); if (swap[m].sendproc != me) { - MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR, + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + if (lmp->kokkos->gpu_direct_flag) { + buf1 = k_buf1.view<DeviceType>().data(); + buf2 = k_buf2.view<DeviceType>().data(); + } else { + k_buf1.modify<DeviceType>(); + k_buf1.sync<LMPHostType>(); + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } + + MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR, swap[m].recvproc,0,gridcomm,&request); - MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR, + MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR, swap[m].sendproc,0,gridcomm); MPI_Wait(&request,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->gpu_direct_flag) { + k_buf2.modify<LMPHostType>(); + k_buf2.sync<DeviceType>(); + } } kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); @@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which) DeviceType::fence(); if (swap[m].recvproc != me) { - MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR, + FFT_SCALAR* buf1; + FFT_SCALAR* buf2; + if (lmp->kokkos->gpu_direct_flag) { + buf1 = k_buf1.view<DeviceType>().data(); + buf2 = k_buf2.view<DeviceType>().data(); + } else { + k_buf1.modify<DeviceType>(); + k_buf1.sync<LMPHostType>(); + buf1 = k_buf1.h_view.data(); + buf2 = k_buf2.h_view.data(); + } + + MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR, swap[m].sendproc,0,gridcomm,&request); - MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR, + MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR, swap[m].recvproc,0,gridcomm); MPI_Wait(&request,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->gpu_direct_flag) { + k_buf2.modify<LMPHostType>(); + k_buf2.sync<DeviceType>(); + } } kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 3bbff6be7e77c93930b23b1bf937c28c1a2afd04..fb6b8d8d45aa5d4e1e582c9b745ec53906aa999b 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -11,6 +11,7 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ +#include <mpi.h> #include <cstdio> #include <cstring> #include <cstdlib> @@ -25,6 +26,37 @@ #include "error.h" #include "memory_kokkos.h" +#ifdef KOKKOS_HAVE_CUDA + +// for detecting GPU-direct support: +// the function int have_gpu_direct() +// - returns -1 if GPU-direct support is unknown +// - returns 0 if no GPU-direct support available +// - returns 1 if GPU-direct support is available + +#define GPU_DIRECT_UNKNOWN static int have_gpu_direct() {return -1;} + +// OpenMPI supports detecting GPU-direct as of version 2.0.0 +#if OPEN_MPI + +#if (OMPI_MAJOR_VERSION >= 2) +#include <mpi-ext.h> +#if defined(MPIX_CUDA_AWARE_SUPPORT) +static int have_gpu_direct() { return MPIX_Query_cuda_support(); } +#else +GPU_DIRECT_UNKNOWN +#endif + +#else // old OpenMPI +GPU_DIRECT_UNKNOWN +#endif + +#else // unknown MPI library +GPU_DIRECT_UNKNOWN +#endif + +#endif // KOKKOS_HAVE_CUDA + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ @@ -106,13 +138,32 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) // initialize Kokkos if (me == 0) { - if (screen) fprintf(screen," using %d GPU(s)\n",ngpu); - if (logfile) fprintf(logfile," using %d GPU(s)\n",ngpu); + if (screen) fprintf(screen," will use up to %d GPU(s) per node\n",ngpu); + if (logfile) fprintf(logfile," will use up to %d GPU(s) per node\n",ngpu); } #ifdef KOKKOS_HAVE_CUDA if (ngpu <= 0) error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested"); + + // check and warn about GPU-direct availability when using multiple MPI tasks + + int nmpi = 0; + MPI_Comm_size(world,&nmpi); + if ((nmpi > 1) && (me == 0)) { + if ( 1 == have_gpu_direct() ) { + ; // all good, nothing to warn about + } else if (-1 == have_gpu_direct() ) { + error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available," + " but cannot determine if this is the case\n try" + " '-pk kokkos gpu/direct off' when getting segmentation faults"); + } else if ( 0 == have_gpu_direct() ) { + error->warning(FLERR,"GPU-direct is NOT available, " + "using '-pk kokkos gpu/direct off' by default"); + } else { + ; // should never get here + } + } #endif Kokkos::InitArguments args; @@ -133,6 +184,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) exchange_comm_on_host = 0; forward_comm_on_host = 0; reverse_comm_on_host = 0; + gpu_direct_flag = 1; + +#if KOKKOS_USE_CUDA + // only if we can safely detect, that GPU-direct is not available, change default + if (0 == have_gpu_direct()) gpu_direct_flag = 0; +#endif #ifdef KILL_KOKKOS_ON_SIGSEGV signal(SIGSEGV, my_signal_handler); @@ -163,6 +220,7 @@ void KokkosLMP::accelerator(int narg, char **arg) double binsize = 0.0; exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; + gpu_direct_flag = 1; int iarg = 0; while (iarg < narg) { @@ -204,6 +262,7 @@ void KokkosLMP::accelerator(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; + exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else if (strcmp(arg[iarg+1],"host") == 0) { exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; @@ -245,9 +304,26 @@ void KokkosLMP::accelerator(int narg, char **arg) reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); iarg += 2; + } else if (strcmp(arg[iarg],"gpu/direct") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"off") == 0) gpu_direct_flag = 0; + else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1; + else error->all(FLERR,"Illegal package kokkos command"); + iarg += 2; } else error->all(FLERR,"Illegal package kokkos command"); } + // if "gpu/direct off" and "comm device", change to "comm host" + + if (!gpu_direct_flag) { + if (exchange_comm_classic == 0 && exchange_comm_on_host == 0) + exchange_comm_on_host = 1; + if (forward_comm_classic == 0 && forward_comm_on_host == 0) + forward_comm_on_host = 1; + if (reverse_comm_classic == 0 && reverse_comm_on_host == 0) + reverse_comm_on_host = 1; + } + // set newton flags // set neighbor binsize, same as neigh_modify command diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 846b7254afe5de89f3739c8ef92ba41b83c078a6..cf209c0adb97e10a7bf5b45daa4502b3ad15d46a 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -34,6 +34,7 @@ class KokkosLMP : protected Pointers { int num_threads,ngpu; int numa; int auto_sync; + int gpu_direct_flag; KokkosLMP(class LAMMPS *, int, char **); ~KokkosLMP();