diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt
index 14f4103aed8d6df6e782eba0e63c196c2daf380d..597ed73e5316e6eb57d53d227798eeab2cb363ca 100644
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@@ -96,6 +96,16 @@ software version 7.5 or later must be installed on your system. See
 the discussion for the "GPU package"_Speed_gpu.html for details of how
 to check and do this.
 
+NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI
+library is CUDA-aware and has support for GPU-direct. This is not always
+the case, especially when using pre-compiled MPI libraries provided by
+a Linux distribution. This is not a problem when using only a single
+GPU and a single MPI rank on a desktop. When running with multiple
+MPI ranks, you may see segmentation faults without GPU-direct support.
+These can be avoided by adding the flags '-pk kokkos gpu/direct off'
+to the LAMMPS command line or by using the command
+"package kokkos gpu/direct off"_package.html in the input file.
+
 Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
 /src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
 described above.  Then do the following:
@@ -262,9 +272,11 @@ the # of physical GPUs on the node.  You can assign multiple MPI tasks
 to the same GPU with the KOKKOS package, but this is usually only
 faster if significant portions of the input script have not been
 ported to use Kokkos. Using CUDA MPS is recommended in this
-scenario. As above for multi-core CPUs (and no GPU), if N is the
-number of physical cores/node, then the number of MPI tasks/node
-should not exceed N.
+scenario. Using a CUDA-aware MPI library with support for GPU-direct
+is highly recommended. GPU-direct use can be avoided by using "-pk kokkos gpu/direct no".
+As above for multi-core CPUs (and no GPU), if N is the number of
+physical cores/node, then the number of MPI tasks/node should not
+exceed N.
 
 -k on g Ng :pre
 
diff --git a/doc/src/package.txt b/doc/src/package.txt
index c2e7345bb74c4e25a5dc4861194c704d3017232d..3d25a64d3174aacc9218ed4d26e96cae5355ddef 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -84,6 +84,9 @@ args = arguments specific to the style :l
         no = perform communication pack/unpack in non-KOKKOS mode
         host = perform pack/unpack on host (e.g. with OpenMP threading)
         device = perform pack/unpack on device (e.g. on GPU)
+      {gpu/direct} = {off} or {on}
+        off = do not use GPU-direct
+        on = use GPU-direct (default)
   {omp} args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
@@ -480,15 +483,15 @@ The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of {host} means
 to use the host, typically a multi-core CPU, and perform the
-packing/unpacking in parallel with threads.  A value of {device} means
-to use the device, typically a GPU, to perform the packing/unpacking
-operation.
+packing/unpacking in parallel with threads.  A value of {device}
+means to use the device, typically a GPU, to perform the
+packing/unpacking operation.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used.  The {no} value is useful for verifying that the
-Kokkos-based {host} and {device} values are working correctly.  It may
-also be the fastest choice when using Kokkos styles in MPI-only mode
-(i.e. with a thread count of 1).
+Kokkos-based {host} and {device} values are working correctly. 
+It may also be the fastest choice when using Kokkos styles in
+MPI-only mode (i.e. with a thread count of 1).
 
 When running on CPUs or Xeon Phi, the {host} and {device} values work
 identically.  When using GPUs, the {device} value will typically be
@@ -504,6 +507,18 @@ typically faster to let the host handle communication, by using the
 {host} value.  Using {host} instead of {no} will enable use of
 multiple threads to pack/unpack communicated data.
 
+The {gpu/direct} keyword chooses whether GPU-direct will be used. When
+this keyword is set to {on}, buffers in GPU memory are passed directly
+through MPI send/receive calls. This reduces overhead of first copying
+the data to the host CPU. However GPU-direct is not supported on all
+systems, which can lead to segmentation faults and would require
+using a value of {off}. If LAMMPS can safely detect that GPU-direct is
+not available (currently only possible with OpenMPI v2.0.0 or later),
+then the {gpu/direct} keyword is automatically set to {off} by default.
+When the {gpu/direct} keyword is set to {off} while any of the {comm}
+keywords are set to {device}, the value for these {comm} keywords will
+be automatically changed to {host}.
+
 :line
 
 The {omp} style invokes settings associated with the use of the
@@ -610,12 +625,14 @@ is used.  If it is not used, you must invoke the package intel
 command in your input script or or via the "-pk intel" "command-line
 switch"_Section_start.html#start_6.
 
-For the KOKKOS package, the option defaults neigh = full,
-neigh/qeq = full, newton = off, binsize = 0.0, and comm = device.
-These settings are made automatically by the required "-k on" "command-line
-switch"_Section_start.html#start_6.  You can change them bu using the
-package kokkos command in your input script or via the "-pk kokkos"
-"command-line switch"_Section_start.html#start_6.
+For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
+full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on.
+When LAMMPS can safely detect, that GPU-direct is not available, the
+default value of gpu/direct becomes "off".
+These settings are made automatically by the required "-k on"
+"command-line switch"_Section_start.html#start_6. You can change them by
+using the package kokkos command in your input script or via the "-pk
+kokkos" "command-line switch"_Section_start.html#start_6.
 
 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 96d1c64d0d1aa389caca8c7ef8a635dd406fc2e9..21840b7c3ee8fd396b90e9983cd57c2d49ef64aa 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
     // if self, set recv buffer to send buffer
 
     if (sendproc[iswap] != me) {
-      if (recvnum[iswap])
-        MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE,
+      double* buf_send_pair;
+      double* buf_recv_pair;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
+        buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
+      } else {
+        k_buf_send_pair.modify<DeviceType>();
+        k_buf_send_pair.sync<LMPHostType>();
+        buf_send_pair = k_buf_send_pair.h_view.data();
+        buf_recv_pair = k_buf_recv_pair.h_view.data();
+      }
+
+      if (recvnum[iswap]) {
+        MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE,
                   recvproc[iswap],0,world,&request);
+      }
       if (sendnum[iswap])
-        MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world);
+        MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
       if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf_recv_pair.modify<LMPHostType>();
+        k_buf_recv_pair.sync<DeviceType>();
+      }
     } else k_buf_recv_pair = k_buf_send_pair;
 
     // unpack buffer
diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp
index 847fa5907a9f05cae3f8e4777097bc92029ebd09..64a9d6992f0f56a64e5cbbb3d6fb674929f5ca41 100644
--- a/src/KOKKOS/gridcomm_kokkos.cpp
+++ b/src/KOKKOS/gridcomm_kokkos.cpp
@@ -18,6 +18,7 @@
 #include "memory_kokkos.h"
 #include "error.h"
 #include "kokkos_base.h"
+#include "kokkos.h"
 
 using namespace LAMMPS_NS;
 
@@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].sendproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR,
                 swap[m].recvproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR,
                swap[m].sendproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
@@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].recvproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR,
                 swap[m].sendproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
                swap[m].recvproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 3bbff6be7e77c93930b23b1bf937c28c1a2afd04..fb6b8d8d45aa5d4e1e582c9b745ec53906aa999b 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -11,6 +11,7 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include <mpi.h>
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
@@ -25,6 +26,37 @@
 #include "error.h"
 #include "memory_kokkos.h"
 
+#ifdef KOKKOS_HAVE_CUDA
+
+// for detecting GPU-direct support:
+// the function  int have_gpu_direct()
+// - returns -1 if GPU-direct support is unknown
+// - returns  0 if no GPU-direct support available
+// - returns  1 if GPU-direct support is available
+
+#define GPU_DIRECT_UNKNOWN static int have_gpu_direct() {return -1;}
+
+// OpenMPI supports detecting GPU-direct as of version 2.0.0
+#if OPEN_MPI
+
+#if (OMPI_MAJOR_VERSION >= 2)
+#include <mpi-ext.h>
+#if defined(MPIX_CUDA_AWARE_SUPPORT)
+static int have_gpu_direct() { return MPIX_Query_cuda_support(); }
+#else
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // old OpenMPI
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // unknown MPI library
+GPU_DIRECT_UNKNOWN
+#endif
+
+#endif // KOKKOS_HAVE_CUDA
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -106,13 +138,32 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   // initialize Kokkos
 
   if (me == 0) {
-    if (screen) fprintf(screen,"  using %d GPU(s)\n",ngpu);
-    if (logfile) fprintf(logfile,"  using %d GPU(s)\n",ngpu);
+    if (screen) fprintf(screen,"  will use up to %d GPU(s) per node\n",ngpu);
+    if (logfile) fprintf(logfile,"  will use up to %d GPU(s) per node\n",ngpu);
   }
 
 #ifdef KOKKOS_HAVE_CUDA
   if (ngpu <= 0)
     error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
+
+  // check and warn about GPU-direct availability when using multiple MPI tasks
+
+  int nmpi = 0;
+  MPI_Comm_size(world,&nmpi);
+  if ((nmpi > 1) && (me == 0)) {
+    if ( 1 == have_gpu_direct() ) {
+      ; // all good, nothing to warn about
+    } else if (-1 == have_gpu_direct() ) {
+      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
+                     " but cannot determine if this is the case\n         try"
+                     " '-pk kokkos gpu/direct off' when getting segmentation faults");
+    } else if ( 0 == have_gpu_direct() ) {
+      error->warning(FLERR,"GPU-direct is NOT available, "
+                     "using '-pk kokkos gpu/direct off' by default");
+    } else {
+      ; // should never get here
+    }
+  }
 #endif
 
   Kokkos::InitArguments args;
@@ -133,6 +184,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   exchange_comm_on_host = 0;
   forward_comm_on_host = 0;
   reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
+
+#if KOKKOS_USE_CUDA
+  // only if we can safely detect, that GPU-direct is not available, change default
+  if (0 == have_gpu_direct()) gpu_direct_flag = 0;
+#endif
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -163,6 +220,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
   exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -204,6 +262,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
@@ -245,9 +304,26 @@ void KokkosLMP::accelerator(int narg, char **arg)
         reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct_flag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
+  // if "gpu/direct off" and "comm device", change to "comm host"
+
+  if (!gpu_direct_flag) {
+   if (exchange_comm_classic == 0 && exchange_comm_on_host == 0)
+     exchange_comm_on_host = 1;
+   if (forward_comm_classic == 0 && forward_comm_on_host == 0)
+     forward_comm_on_host = 1;
+   if (reverse_comm_classic == 0 && reverse_comm_on_host == 0)
+     reverse_comm_on_host = 1;
+  }
+
   // set newton flags
   // set neighbor binsize, same as neigh_modify command
 
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 846b7254afe5de89f3739c8ef92ba41b83c078a6..cf209c0adb97e10a7bf5b45daa4502b3ad15d46a 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -34,6 +34,7 @@ class KokkosLMP : protected Pointers {
   int num_threads,ngpu;
   int numa;
   int auto_sync;
+  int gpu_direct_flag;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();