diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt
index f26b2df7e1032326f4fedee73b562c4bf7d6f520..5f90885fe69117c429588c1fcec025a0b8934158 100644
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@@ -96,6 +96,17 @@ software version 7.5 or later must be installed on your system. See the
 discussion for the "GPU package"_Speed_gpu.html for details of how to
 check and do this.
 
+NOTE: Kokkos with CUDA currently implicitly assumes that the MPI
+library is CUDA-aware and has support for GPU-direct. This is not always
+the case, especially when using pre-compiled MPI libraries provided by
+a Linux distribution. This is not a problem when using only a single
+GPU and a single MPI rank on a desktop. When running with multiple
+MPI ranks, you may see segmentation faults without GPU-direct support.
+These can be avoided by adding the flag
+"-pk kokkos gpu/direct off"_Run_options.html
+to the LAMMPS command line or by using the command
+"package kokkos gpu/direct off"_package.html in the input file.
+
 Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
 /src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
 described above. Then do the following:
@@ -254,15 +265,19 @@ supports.
 
 [Running on GPUs:]
 
-Use the "-k" "command-line switch"_Run_options.html to specify the
-number of GPUs per node. Typically the -np setting of the mpirun
-command should set the number of MPI tasks/node to be equal to the #
-of physical GPUs on the node. You can assign multiple MPI tasks to
-the same GPU with the KOKKOS package, but this is usually only faster
-if significant portions of the input script have not been ported to
-use Kokkos. Using CUDA MPS is recommended in this scenario. As above
-for multi-core CPUs (and no GPU), if N is the number of physical
-cores/node, then the number of MPI tasks/node should not exceed N.
+Use the "-k" "command-line switch"_Run_options.html to
+specify the number of GPUs per node. Typically the -np setting of the
+mpirun command should set the number of MPI tasks/node to be equal to
+the number of physical GPUs on the node. You can assign multiple MPI
+tasks to the same GPU with the KOKKOS package, but this is usually
+only faster if significant portions of the input script have not
+been ported to use Kokkos. Using CUDA MPS is recommended in this
+scenario. Using a CUDA-aware MPI library with support for GPU-direct
+is highly recommended. GPU-direct use can be avoided by using
+"-pk kokkos gpu/direct off"_Run_options.html.
+As above for multi-core CPUs (and no GPU), if N is the number of
+physical cores/node, then the number of MPI tasks/node should not
+exceed N.
 
 -k on g Ng :pre
 
diff --git a/doc/src/compute_property_local.txt b/doc/src/compute_property_local.txt
index b0ec088cf79d009e7c3062907d73dbfe3279e6da..c4ad0afc95cd3fcf7655d5861c66a3f488683519 100644
--- a/doc/src/compute_property_local.txt
+++ b/doc/src/compute_property_local.txt
@@ -19,8 +19,8 @@ one or more attributes may be appended :l
   patom1 patom2 ptype1 ptype2
   batom1 batom2 btype
   aatom1 aatom2 aatom3 atype
-  datom1 datom2 datom3 dtype
-  iatom1 iatom2 iatom3 itype :pre
+  datom1 datom2 datom3 datom4 dtype
+  iatom1 iatom2 iatom3 iatom4 itype :pre
 
 natom1, natom2 = IDs of 2 atoms in each pair (within neighbor cutoff)
 ntype1, ntype2 = type of 2 atoms in each pair (within neighbor cutoff)
diff --git a/doc/src/package.txt b/doc/src/package.txt
index 76246c22f6266308e4f7e799a5298399a374d618..8b0581929ff10cffd2661b5268761e61a276e418 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -84,6 +84,9 @@ args = arguments specific to the style :l
       no = perform communication pack/unpack in non-KOKKOS mode
       host = perform pack/unpack on host (e.g. with OpenMP threading)
       device = perform pack/unpack on device (e.g. on GPU)
+    {gpu/direct} = {off} or {on}
+      off = do not use GPU-direct
+      on = use GPU-direct (default)
   {omp} args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
@@ -479,15 +482,15 @@ The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication. A value of {host} means
 to use the host, typically a multi-core CPU, and perform the
-packing/unpacking in parallel with threads. A value of {device} means
-to use the device, typically a GPU, to perform the packing/unpacking
-operation.
+packing/unpacking in parallel with threads. A value of {device}
+means to use the device, typically a GPU, to perform the
+packing/unpacking operation.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used. The {no} value is useful for verifying that the
-Kokkos-based {host} and {device} values are working correctly. It may
-also be the fastest choice when using Kokkos styles in MPI-only mode
-(i.e. with a thread count of 1).
+Kokkos-based {host} and {device} values are working correctly.
+It may also be the fastest choice when using Kokkos styles in
+MPI-only mode (i.e. with a thread count of 1).
 
 When running on CPUs or Xeon Phi, the {host} and {device} values work
 identically. When using GPUs, the {device} value will typically be
@@ -503,6 +506,18 @@ typically faster to let the host handle communication, by using the
 {host} value. Using {host} instead of {no} will enable use of
 multiple threads to pack/unpack communicated data.
 
+The {gpu/direct} keyword chooses whether GPU-direct will be used. When
+this keyword is set to {on}, buffers in GPU memory are passed directly
+through MPI send/receive calls. This reduces the overhead of first
+copying the data to the host CPU. However, GPU-direct is not supported
+on all systems, which can lead to segmentation faults; in that case
+a value of {off} is required. If LAMMPS can safely detect that
+GPU-direct is not available (currently only possible with OpenMPI
+v2.0.0 or later), then the {gpu/direct} keyword is automatically set
+to {off} by default. When the {gpu/direct} keyword is set to {off}
+while any of the {comm} keywords are set to {device}, the value for
+these {comm} keywords will be automatically changed to {host}.
+
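[Editor's note: for illustration, the two ways of switching GPU-direct off that the added paragraph describes look as follows. The executable name (lmp), the GPU count, and the input file (in.lj) are placeholders here, and the exact mpirun syntax depends on the MPI library. On the command line:

mpirun -np 2 lmp -k on g 2 -sf kk -pk kokkos gpu/direct off -in in.lj :pre

or, equivalently, near the top of the input script:

package kokkos gpu/direct off :pre]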
 :line
 
 The {omp} style invokes settings associated with the use of the
@@ -609,11 +624,13 @@ script or or via the "-pk intel" "command-line switch"_Run_options.html.
 
 For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
-full, newton = off, binsize = 0.0, and comm = device. These settings
-are made automatically by the required "-k on" "command-line
-switch"_Run_options.html. You can change them bu using the package
-kokkos command in your input script or via the "-pk kokkos"
-"command-line switch"_Run_options.html.
+full, newton = off, binsize = 0.0, comm = device, and gpu/direct = on.
+When LAMMPS can safely detect that GPU-direct is not available, the
+default value of gpu/direct becomes {off}.
+These settings are made automatically by the required "-k on"
+"command-line switch"_Run_options.html. You can change them by
+using the package kokkos command in your input script or via the
+"-pk kokkos" "command-line switch"_Run_options.html.
 
 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes. These settings are made automatically if
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 96d1c64d0d1aa389caca8c7ef8a635dd406fc2e9..21840b7c3ee8fd396b90e9983cd57c2d49ef64aa 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
   // if self, set recv buffer to send buffer
 
   if (sendproc[iswap] != me) {
-    if (recvnum[iswap])
-      MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE,
+    double* buf_send_pair;
+    double* buf_recv_pair;
+    if (lmp->kokkos->gpu_direct_flag) {
+      buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
+      buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
+    } else {
+      k_buf_send_pair.modify<DeviceType>();
+      k_buf_send_pair.sync<LMPHostType>();
+      buf_send_pair = k_buf_send_pair.h_view.data();
+      buf_recv_pair = k_buf_recv_pair.h_view.data();
+    }
+
+    if (recvnum[iswap]) {
+      MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE,
                 recvproc[iswap],0,world,&request);
+    }
     if (sendnum[iswap])
-      MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world);
+      MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
     if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+    if (!lmp->kokkos->gpu_direct_flag) {
+      k_buf_recv_pair.modify<LMPHostType>();
+      k_buf_recv_pair.sync<DeviceType>();
+    }
   } else k_buf_recv_pair = k_buf_send_pair;
 
   // unpack buffer
diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp
index 847fa5907a9f05cae3f8e4777097bc92029ebd09..64a9d6992f0f56a64e5cbbb3d6fb674929f5ca41 100644
--- a/src/KOKKOS/gridcomm_kokkos.cpp
+++ b/src/KOKKOS/gridcomm_kokkos.cpp
@@ -18,6 +18,7 @@
 #include "memory_kokkos.h"
 #include "error.h"
 #include "kokkos_base.h"
+#include "kokkos.h"
 
 using namespace LAMMPS_NS;
 
@@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].sendproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR,
                 swap[m].recvproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR,
               swap[m].sendproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
@@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].recvproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR,
                 swap[m].sendproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
               swap[m].recvproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 3bbff6be7e77c93930b23b1bf937c28c1a2afd04..fb6b8d8d45aa5d4e1e582c9b745ec53906aa999b 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -11,6 +11,7 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include <mpi.h>
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
@@ -25,6 +26,37 @@
 #include "error.h"
 #include "memory_kokkos.h"
 
+#ifdef KOKKOS_HAVE_CUDA
+
+// for detecting GPU-direct support:
+// the function int have_gpu_direct()
+// - returns -1 if GPU-direct support is unknown
+// - returns 0 if no GPU-direct support available
+// - returns 1 if GPU-direct support is available
+
+#define GPU_DIRECT_UNKNOWN static int have_gpu_direct() {return -1;}
+
+// OpenMPI supports detecting GPU-direct as of version 2.0.0
+#if OPEN_MPI
+
+#if (OMPI_MAJOR_VERSION >= 2)
+#include <mpi-ext.h>
+#if defined(MPIX_CUDA_AWARE_SUPPORT)
+static int have_gpu_direct() { return MPIX_Query_cuda_support(); }
+#else
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // old OpenMPI
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // unknown MPI library
+GPU_DIRECT_UNKNOWN
+#endif
+
+#endif // KOKKOS_HAVE_CUDA
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -106,13 +138,32 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   // initialize Kokkos
 
   if (me == 0) {
-    if (screen) fprintf(screen,"  using %d GPU(s)\n",ngpu);
-    if (logfile) fprintf(logfile,"  using %d GPU(s)\n",ngpu);
+    if (screen) fprintf(screen,"  will use up to %d GPU(s) per node\n",ngpu);
+    if (logfile) fprintf(logfile,"  will use up to %d GPU(s) per node\n",ngpu);
   }
 
 #ifdef KOKKOS_HAVE_CUDA
   if (ngpu <= 0)
     error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
+
+  // check and warn about GPU-direct availability when using multiple MPI tasks
+
+  int nmpi = 0;
+  MPI_Comm_size(world,&nmpi);
+  if ((nmpi > 1) && (me == 0)) {
+    if ( 1 == have_gpu_direct() ) {
+      ; // all good, nothing to warn about
+    } else if (-1 == have_gpu_direct() ) {
+      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
+                     " but cannot determine if this is the case.\n"
+                     "Try '-pk kokkos gpu/direct off' when getting segmentation faults");
+    } else if ( 0 == have_gpu_direct() ) {
+      error->warning(FLERR,"GPU-direct is NOT available, "
+                     "using '-pk kokkos gpu/direct off' by default");
+    } else {
+      ; // should never get here
+    }
+  }
 #endif
 
   Kokkos::InitArguments args;
@@ -133,6 +184,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   exchange_comm_on_host = 0;
   forward_comm_on_host = 0;
   reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
+
+#ifdef KOKKOS_HAVE_CUDA
+  // only if we can safely detect that GPU-direct is not available, change the default
+  if (0 == have_gpu_direct()) gpu_direct_flag = 0;
+#endif
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -163,6 +220,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
   exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -204,6 +262,7 @@
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
@@ -245,9 +304,26 @@ void KokkosLMP::accelerator(int narg, char **arg)
         reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct_flag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
+  // if "gpu/direct off" and "comm device", change to "comm host"
+
+  if (!gpu_direct_flag) {
+    if (exchange_comm_classic == 0 && exchange_comm_on_host == 0)
+      exchange_comm_on_host = 1;
+    if (forward_comm_classic == 0 && forward_comm_on_host == 0)
+      forward_comm_on_host = 1;
+    if (reverse_comm_classic == 0 && reverse_comm_on_host == 0)
+      reverse_comm_on_host = 1;
+  }
+
   // set newton flags
   // set neighbor binsize, same as neigh_modify command
 
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 846b7254afe5de89f3739c8ef92ba41b83c078a6..cf209c0adb97e10a7bf5b45daa4502b3ad15d46a 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -34,6 +34,7 @@ class KokkosLMP : protected Pointers {
   int num_threads,ngpu;
   int numa;
   int auto_sync;
+  int gpu_direct_flag;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();
diff --git a/src/balance.cpp b/src/balance.cpp
index 7dd13e8766ac9c16056035afe31d1e3f8e38a017..2a953caf477acc84863fa3f39e4ad54e518128cd 100644
--- a/src/balance.cpp
+++ b/src/balance.cpp
@@ -647,6 +647,21 @@ int *Balance::bisection(int sortflag)
   double *shrinklo = &shrinkall[0];
   double *shrinkhi = &shrinkall[3];
 
+  // if shrink size in any dim is zero, use box size in that dim
+
+  if (shrinklo[0] == shrinkhi[0]) {
+    shrinklo[0] = boxlo[0];
+    shrinkhi[0] = boxhi[0];
+  }
+  if (shrinklo[1] == shrinkhi[1]) {
+    shrinklo[1] = boxlo[1];
+    shrinkhi[1] = boxhi[1];
+  }
+  if (shrinklo[2] == shrinkhi[2]) {
+    shrinklo[2] = boxlo[2];
+    shrinkhi[2] = boxhi[2];
+  }
+
   // invoke RCB
   // then invert() to create list of proc assignments for my atoms
   // NOTE: (3/2017) can remove undocumented "old" option at some point
diff --git a/src/rcb.cpp b/src/rcb.cpp
index 3027920310055351370fd6bff70f61038f01b726..13e27b6fbfda836c53b381db8171f59f4b921e0d 100644
--- a/src/rcb.cpp
+++ b/src/rcb.cpp
@@ -241,9 +241,11 @@ void RCB::compute(int dimension, int n, double **x, double *wt,
   // dim_select = selected cut dimension
   // valuehalf_select = valuehalf in that dimension
   // dotmark_select = dot markings in that dimension
+  // initialize largest = -1.0 to ensure a cut in some dim is accepted
+  // e.g. if the current recursed box has size 0 in all dims
 
   int dim_select = -1;
-  double largest = 0.0;
+  double largest = -1.0;
 
   for (dim = 0; dim < dimension; dim++) {
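[Editor's note: the same host-staging pattern appears in each of the communication hunks above. When GPU-direct is off, the send buffer is synced from device to host, host pointers are handed to MPI, and the received data is synced back to the device afterwards. Below is a minimal standalone sketch of that pattern; it is not LAMMPS code. The name exchange_halo is invented, and it uses the named DualView sync/modify methods from newer Kokkos releases rather than the template forms (modify<DeviceType>() etc.) used in the patch.

#include <mpi.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_DualView.hpp>

using Buffer = Kokkos::DualView<double*>;

// Exchange n doubles with a partner rank. If gpu_direct is true,
// device pointers are handed to MPI directly (requires a CUDA-aware
// MPI library); otherwise the buffers are staged through host memory.
void exchange_halo(Buffer &send, Buffer &recv, int n, int partner,
                   MPI_Comm comm, bool gpu_direct)
{
  double *sbuf, *rbuf;
  if (gpu_direct) {
    sbuf = send.view_device().data();
    rbuf = recv.view_device().data();
  } else {
    send.modify_device();  // mark the device copy as most recent
    send.sync_host();      // copy device -> host
    sbuf = send.view_host().data();
    rbuf = recv.view_host().data();
  }

  MPI_Request req;
  MPI_Irecv(rbuf, n, MPI_DOUBLE, partner, 0, comm, &req);
  MPI_Send(sbuf, n, MPI_DOUBLE, partner, 0, comm);
  MPI_Wait(&req, MPI_STATUS_IGNORE);

  if (!gpu_direct) {
    recv.modify_host();    // received data landed in the host copy
    recv.sync_device();    // copy host -> device
  }
}

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  Kokkos::initialize(argc, argv);
  {
    int me, nprocs;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    if (nprocs == 2) {
      const int n = 8;
      Buffer send("send", n), recv("recv", n);
      // fill the device copy with this rank's id
      Kokkos::deep_copy(send.view_device(), double(me));
      // exchange with the other rank, staging through the host
      // (equivalent in spirit to "-pk kokkos gpu/direct off")
      exchange_halo(send, recv, n, 1 - me, MPI_COMM_WORLD, false);
    }
  }
  Kokkos::finalize();
  MPI_Finalize();
  return 0;
}

The gpu_direct=true path is the default behavior that the new gpu/direct keyword exists to relax: it is only safe when the MPI library can dereference device pointers, which is exactly what have_gpu_direct() in kokkos.cpp tries to detect.]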