diff --git a/doc/src/Speed_kokkos.txt b/doc/src/Speed_kokkos.txt
index f26b2df7e1032326f4fedee73b562c4bf7d6f520..5f90885fe69117c429588c1fcec025a0b8934158 100644
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@@ -96,6 +96,17 @@ software version 7.5 or later must be installed on your system. See
 the discussion for the "GPU package"_Speed_gpu.html for details of how
 to check and do this.
 
+NOTE: Kokkos with CUDA currently implicitly assumes that the MPI
+library is CUDA-aware and has support for GPU-direct. This is not always
+the case, especially when using pre-compiled MPI libraries provided by
+a Linux distribution. This is not a problem when using only a single
+GPU and a single MPI rank on a desktop. When running with multiple
+MPI ranks, you may see segmentation faults without GPU-direct support.
+These can be avoided by adding the flags
+"-pk kokkos gpu/direct off"_Run_options.html
+to the LAMMPS command line or by using the command
+"package kokkos gpu/direct off"_package.html in the input file.
+
 Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
 /src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
 described above.  Then do the following:
@@ -254,15 +265,19 @@ supports.
 
 [Running on GPUs:]
 
-Use the "-k" "command-line switch"_Run_options.html to specify the
-number of GPUs per node. Typically the -np setting of the mpirun
-command should set the number of MPI tasks/node to be equal to the #
-of physical GPUs on the node.  You can assign multiple MPI tasks to
-the same GPU with the KOKKOS package, but this is usually only faster
-if significant portions of the input script have not been ported to
-use Kokkos. Using CUDA MPS is recommended in this scenario. As above
-for multi-core CPUs (and no GPU), if N is the number of physical
-cores/node, then the number of MPI tasks/node should not exceed N.
+Use the "-k" "command-line switch"_Run_options.html to
+specify the number of GPUs per node. Typically the -np setting of the
+mpirun command should set the number of MPI tasks/node to be equal to
+the number of physical GPUs on the node.  You can assign multiple MPI
+tasks to the same GPU with the KOKKOS package, but this is usually
+only faster if significant portions of the input script have not
+been ported to use Kokkos. Using CUDA MPS is recommended in this
+scenario. Using a CUDA-aware MPI library with support for GPU-direct
+is highly recommended. GPU-direct use can be avoided by using
+"-pk kokkos gpu/direct off"_package.html.
+As above for multi-core CPUs (and no GPU), if N is the number of
+physical cores/node, then the number of MPI tasks/node should not
+exceed N.
 
 -k on g Ng :pre
 
diff --git a/doc/src/compute_property_local.txt b/doc/src/compute_property_local.txt
index b0ec088cf79d009e7c3062907d73dbfe3279e6da..c4ad0afc95cd3fcf7655d5861c66a3f488683519 100644
--- a/doc/src/compute_property_local.txt
+++ b/doc/src/compute_property_local.txt
@@ -19,8 +19,8 @@ one or more attributes may be appended :l
                         patom1 patom2 ptype1 ptype2
                         batom1 batom2 btype
                         aatom1 aatom2 aatom3 atype
-                        datom1 datom2 datom3 dtype
-                        iatom1 iatom2 iatom3 itype :pre
+                        datom1 datom2 datom3 datom4 dtype
+                        iatom1 iatom2 iatom3 iatom4 itype :pre
 
      natom1, natom2 = IDs of 2 atoms in each pair (within neighbor cutoff)
      ntype1, ntype2 = type of 2 atoms in each pair (within neighbor cutoff)
diff --git a/doc/src/package.txt b/doc/src/package.txt
index 76246c22f6266308e4f7e799a5298399a374d618..8b0581929ff10cffd2661b5268761e61a276e418 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -84,6 +84,9 @@ args = arguments specific to the style :l
         no = perform communication pack/unpack in non-KOKKOS mode
         host = perform pack/unpack on host (e.g. with OpenMP threading)
         device = perform pack/unpack on device (e.g. on GPU)
+      {gpu/direct} = {off} or {on}
+        off = do not use GPU-direct
+        on = use GPU-direct (default)
   {omp} args = Nthreads keyword value ...
     Nthread = # of OpenMP threads to associate with each MPI process
     zero or more keyword/value pairs may be appended
@@ -479,15 +482,15 @@ The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
 packing/unpacking data for the communication.  A value of {host} means
 to use the host, typically a multi-core CPU, and perform the
-packing/unpacking in parallel with threads.  A value of {device} means
-to use the device, typically a GPU, to perform the packing/unpacking
-operation.
+packing/unpacking in parallel with threads.  A value of {device}
+means to use the device, typically a GPU, to perform the
+packing/unpacking operation.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used.  The {no} value is useful for verifying that the
-Kokkos-based {host} and {device} values are working correctly.  It may
-also be the fastest choice when using Kokkos styles in MPI-only mode
-(i.e. with a thread count of 1).
+Kokkos-based {host} and {device} values are working correctly.
+It may also be the fastest choice when using Kokkos styles in
+MPI-only mode (i.e. with a thread count of 1).
 
 When running on CPUs or Xeon Phi, the {host} and {device} values work
 identically.  When using GPUs, the {device} value will typically be
@@ -503,6 +506,18 @@ typically faster to let the host handle communication, by using the
 {host} value.  Using {host} instead of {no} will enable use of
 multiple threads to pack/unpack communicated data.
 
+The {gpu/direct} keyword chooses whether GPU-direct will be used. When
+this keyword is set to {on}, buffers in GPU memory are passed directly
+through MPI send/receive calls. This reduces overhead of first copying
+the data to the host CPU. However GPU-direct is not supported on all
+systems, which can lead to segmentation faults and would require
+using a value of {off}. If LAMMPS can safely detect that GPU-direct is
+not available (currently only possible with OpenMPI v2.0.0 or later),
+then the {gpu/direct} keyword is automatically set to {off} by default.
+When the {gpu/direct} keyword is set to {off} while any of the {comm}
+keywords are set to {device}, the value for these {comm} keywords will
+be automatically changed to {host}.
+
 :line
 
 The {omp} style invokes settings associated with the use of the
@@ -609,11 +624,13 @@ script or or via the "-pk intel" "command-line
 switch"_Run_options.html.
 
 For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
-full, newton = off, binsize = 0.0, and comm = device.  These settings
-are made automatically by the required "-k on" "command-line
-switch"_Run_options.html.  You can change them bu using the package
-kokkos command in your input script or via the "-pk kokkos"
-"command-line switch"_Run_options.html.
+full, newton = off, binsize = 0.0, comm = device, and gpu/direct = on.
+When LAMMPS can safely detect that GPU-direct is not available, the
+default value of gpu/direct becomes "off".
+These settings are made automatically by the required "-k on"
+"command-line switch"_Run_options.html. You can change them by
+using the package kokkos command in your input script or via the
+"-pk kokkos command-line switch"_Run_options.html.
 
 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 96d1c64d0d1aa389caca8c7ef8a635dd406fc2e9..21840b7c3ee8fd396b90e9983cd57c2d49ef64aa 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -404,12 +404,30 @@ void CommKokkos::forward_comm_pair_device(Pair *pair)
     // if self, set recv buffer to send buffer
 
     if (sendproc[iswap] != me) {
-      if (recvnum[iswap])
-        MPI_Irecv(k_buf_recv_pair.view<DeviceType>().data(),nsize*recvnum[iswap],MPI_DOUBLE,
+      double* buf_send_pair;
+      double* buf_recv_pair;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
+        buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
+      } else {
+        k_buf_send_pair.modify<DeviceType>();
+        k_buf_send_pair.sync<LMPHostType>();
+        buf_send_pair = k_buf_send_pair.h_view.data();
+        buf_recv_pair = k_buf_recv_pair.h_view.data();
+      }
+
+      if (recvnum[iswap]) {
+        MPI_Irecv(buf_recv_pair,nsize*recvnum[iswap],MPI_DOUBLE,
                   recvproc[iswap],0,world,&request);
+      }
       if (sendnum[iswap])
-        MPI_Send(k_buf_send_pair.view<DeviceType>().data(),n,MPI_DOUBLE,sendproc[iswap],0,world);
+        MPI_Send(buf_send_pair,n,MPI_DOUBLE,sendproc[iswap],0,world);
       if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf_recv_pair.modify<LMPHostType>();
+        k_buf_recv_pair.sync<DeviceType>();
+      }
     } else k_buf_recv_pair = k_buf_send_pair;
 
     // unpack buffer
diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp
index 847fa5907a9f05cae3f8e4777097bc92029ebd09..64a9d6992f0f56a64e5cbbb3d6fb674929f5ca41 100644
--- a/src/KOKKOS/gridcomm_kokkos.cpp
+++ b/src/KOKKOS/gridcomm_kokkos.cpp
@@ -18,6 +18,7 @@
 #include "memory_kokkos.h"
 #include "error.h"
 #include "kokkos_base.h"
+#include "kokkos.h"
 
 using namespace LAMMPS_NS;
 
@@ -526,11 +527,28 @@ void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].sendproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nforward*swap[m].nunpack,MPI_FFT_SCALAR,
                 swap[m].recvproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nforward*swap[m].npack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nforward*swap[m].npack,MPI_FFT_SCALAR,
                swap[m].sendproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
@@ -559,11 +577,28 @@ void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
     DeviceType::fence();
 
     if (swap[m].recvproc != me) {
-      MPI_Irecv(k_buf2.view<DeviceType>().data(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
+      FFT_SCALAR* buf1;
+      FFT_SCALAR* buf2;
+      if (lmp->kokkos->gpu_direct_flag) {
+        buf1 = k_buf1.view<DeviceType>().data();
+        buf2 = k_buf2.view<DeviceType>().data();
+      } else {
+        k_buf1.modify<DeviceType>();
+        k_buf1.sync<LMPHostType>();
+        buf1 = k_buf1.h_view.data();
+        buf2 = k_buf2.h_view.data();
+      }
+
+      MPI_Irecv(buf2,nreverse*swap[m].npack,MPI_FFT_SCALAR,
                 swap[m].sendproc,0,gridcomm,&request);
-      MPI_Send(k_buf1.view<DeviceType>().data(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
+      MPI_Send(buf1,nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
                swap[m].recvproc,0,gridcomm);
       MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_direct_flag) {
+        k_buf2.modify<LMPHostType>();
+        k_buf2.sync<DeviceType>();
+      }
     }
 
     kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 3bbff6be7e77c93930b23b1bf937c28c1a2afd04..fb6b8d8d45aa5d4e1e582c9b745ec53906aa999b 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -11,6 +11,7 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include <mpi.h>
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
@@ -25,6 +26,37 @@
 #include "error.h"
 #include "memory_kokkos.h"
 
+#ifdef KOKKOS_HAVE_CUDA
+
+// for detecting GPU-direct support:
+// the function  int have_gpu_direct()
+// - returns -1 if GPU-direct support is unknown
+// - returns  0 if no GPU-direct support available
+// - returns  1 if GPU-direct support is available
+
+#define GPU_DIRECT_UNKNOWN static int have_gpu_direct() {return -1;}
+
+// OpenMPI supports detecting GPU-direct as of version 2.0.0
+#if OPEN_MPI
+
+#if (OMPI_MAJOR_VERSION >= 2)
+#include <mpi-ext.h>
+#if defined(MPIX_CUDA_AWARE_SUPPORT)
+static int have_gpu_direct() { return MPIX_Query_cuda_support(); }
+#else
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // old OpenMPI
+GPU_DIRECT_UNKNOWN
+#endif
+
+#else // unknown MPI library
+GPU_DIRECT_UNKNOWN
+#endif
+
+#endif // KOKKOS_HAVE_CUDA
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -106,13 +138,32 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   // initialize Kokkos
 
   if (me == 0) {
-    if (screen) fprintf(screen,"  using %d GPU(s)\n",ngpu);
-    if (logfile) fprintf(logfile,"  using %d GPU(s)\n",ngpu);
+    if (screen) fprintf(screen,"  will use up to %d GPU(s) per node\n",ngpu);
+    if (logfile) fprintf(logfile,"  will use up to %d GPU(s) per node\n",ngpu);
   }
 
 #ifdef KOKKOS_HAVE_CUDA
   if (ngpu <= 0)
     error->all(FLERR,"Kokkos has been compiled for CUDA but no GPUs are requested");
+
+  // check and warn about GPU-direct availability when using multiple MPI tasks
+
+  int nmpi = 0;
+  MPI_Comm_size(world,&nmpi);
+  if ((nmpi > 1) && (me == 0)) {
+    if ( 1 == have_gpu_direct() ) {
+      ; // all good, nothing to warn about
+    } else if (-1 == have_gpu_direct() ) {
+      error->warning(FLERR,"Kokkos with CUDA assumes GPU-direct is available,"
+                     " but cannot determine if this is the case\n         try"
+                     " '-pk kokkos gpu/direct off' when getting segmentation faults");
+    } else if ( 0 == have_gpu_direct() ) {
+      error->warning(FLERR,"GPU-direct is NOT available, "
+                     "using '-pk kokkos gpu/direct off' by default");
+    } else {
+      ; // should never get here
+    }
+  }
 #endif
 
   Kokkos::InitArguments args;
@@ -133,6 +184,12 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   exchange_comm_on_host = 0;
   forward_comm_on_host = 0;
   reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
+
+#ifdef KOKKOS_HAVE_CUDA
+  // only if we can safely detect, that GPU-direct is not available, change default
+  if (0 == have_gpu_direct()) gpu_direct_flag = 0;
+#endif
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -163,6 +220,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
   double binsize = 0.0;
   exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
   exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
+  gpu_direct_flag = 1;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -204,6 +262,7 @@ void KokkosLMP::accelerator(int narg, char **arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
@@ -245,9 +304,26 @@ void KokkosLMP::accelerator(int narg, char **arg)
         reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"gpu/direct") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"off") == 0) gpu_direct_flag = 0;
+      else if (strcmp(arg[iarg+1],"on") == 0) gpu_direct_flag = 1;
+      else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
+  // if "gpu/direct off" and "comm device", change to "comm host"
+
+  if (!gpu_direct_flag) {
+   if (exchange_comm_classic == 0 && exchange_comm_on_host == 0)
+     exchange_comm_on_host = 1;
+   if (forward_comm_classic == 0 && forward_comm_on_host == 0)
+     forward_comm_on_host = 1;
+   if (reverse_comm_classic == 0 && reverse_comm_on_host == 0)
+     reverse_comm_on_host = 1;
+  }
+
   // set newton flags
   // set neighbor binsize, same as neigh_modify command
 
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 846b7254afe5de89f3739c8ef92ba41b83c078a6..cf209c0adb97e10a7bf5b45daa4502b3ad15d46a 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -34,6 +34,7 @@ class KokkosLMP : protected Pointers {
   int num_threads,ngpu;
   int numa;
   int auto_sync;
+  int gpu_direct_flag;
 
   KokkosLMP(class LAMMPS *, int, char **);
   ~KokkosLMP();
diff --git a/src/balance.cpp b/src/balance.cpp
index 7dd13e8766ac9c16056035afe31d1e3f8e38a017..2a953caf477acc84863fa3f39e4ad54e518128cd 100644
--- a/src/balance.cpp
+++ b/src/balance.cpp
@@ -647,6 +647,21 @@ int *Balance::bisection(int sortflag)
   double *shrinklo = &shrinkall[0];
   double *shrinkhi = &shrinkall[3];
 
+  // if shrink size in any dim is zero, use box size in that dim
+
+  if (shrinklo[0] == shrinkhi[0]) {
+    shrinklo[0] = boxlo[0];
+    shrinkhi[0] = boxhi[0];
+  }
+  if (shrinklo[1] == shrinkhi[1]) {
+    shrinklo[1] = boxlo[1];
+    shrinkhi[1] = boxhi[1];
+  }
+  if (shrinklo[2] == shrinkhi[2]) {
+    shrinklo[2] = boxlo[2];
+    shrinkhi[2] = boxhi[2];
+  }
+
   // invoke RCB
   // then invert() to create list of proc assignments for my atoms
   // NOTE: (3/2017) can remove undocumented "old" option at some point
diff --git a/src/rcb.cpp b/src/rcb.cpp
index 3027920310055351370fd6bff70f61038f01b726..13e27b6fbfda836c53b381db8171f59f4b921e0d 100644
--- a/src/rcb.cpp
+++ b/src/rcb.cpp
@@ -241,9 +241,11 @@ void RCB::compute(int dimension, int n, double **x, double *wt,
     // dim_select = selected cut dimension
     // valuehalf_select = valuehalf in that dimension
     // dotmark_select = dot markings in that dimension
+    // initialize largest = -1.0 to insure a cut in some dim is accepted
+    //   e.g. if current recursed box is size 0 in all dims
 
     int dim_select = -1;
-    double largest = 0.0;
+    double largest = -1.0;
 
     for (dim = 0; dim < dimension; dim++) {