diff --git a/doc/src/Manual_build.txt b/doc/src/Manual_build.txt
index 747a55a7296d0ab206c0119ecb7a85dd5e8e7a36..a6b881cb79708c37760dc482cac2a7017342d7df 100644
--- a/doc/src/Manual_build.txt
+++ b/doc/src/Manual_build.txt
@@ -81,7 +81,7 @@ sudo yum install python3-virtualenv :pre
 
 Fedora (since version 22) :h4
 
-sudo dnf install python3-virtualenv pre
+sudo dnf install python3-virtualenv :pre
 
 MacOS X :h4
 
diff --git a/doc/src/Packages_details.txt b/doc/src/Packages_details.txt
index 5ab85a80c842c8a0a1672af8140e27e0be33d0e8..892774be383f379ade01ca6265e69ea6b6f4bcfc 100644
--- a/doc/src/Packages_details.txt
+++ b/doc/src/Packages_details.txt
@@ -382,6 +382,11 @@ switches"_Run_options.html.  Also see the "GPU"_#PKG-GPU, "OPT"_#PKG-OPT,
 have styles optimized for CPUs, KNLs, and GPUs.
 
 You must have a C++11 compatible compiler to use this package.
+KOKKOS makes extensive use of advanced C++ features, which can
+expose compiler bugs, especially when compiling for maximum
+performance at high optimization levels. Please see the file
+lib/kokkos/README for a list of compilers and their respective
+platforms, that are known to work.
 
 [Authors:] The KOKKOS package was created primarily by Christian Trott
 and Stan Moore (Sandia), with contributions from other folks as well.
diff --git a/doc/src/Packages_standard.txt b/doc/src/Packages_standard.txt
index a79caae141c68a4a0f5bcd6383658e20c3a307a9..55d0d616f417a5511898b98f61d9b7b0b348edd9 100644
--- a/doc/src/Packages_standard.txt
+++ b/doc/src/Packages_standard.txt
@@ -25,42 +25,42 @@ refers to the examples/USER/atc directory.  The "Library" column
 indicates whether an extra library is needed to build and use the
 package:
 
-dash = no library
+no  = no library
 sys = system library: you likely have it on your machine
 int = internal library: provided with LAMMPS, but you may need to build it
 ext = external library: you will need to download and install it on your machine :ul
 
 Package, Description, Doc page, Example, Library
-"ASPHERE"_Packages_details.html#PKG-ASPHERE, aspherical particle models, "Howto spherical"_Howto_spherical.html, ellipse, -
-"BODY"_Packages_details.html#PKG-BODY, body-style particles, "Howto body"_Howto_body.html, body, -
-"CLASS2"_Packages_details.html#PKG-CLASS2, class 2 force fields, "pair_style lj/class2"_pair_class2.html, -, -
-"COLLOID"_Packages_details.html#PKG-COLLOID, colloidal particles, "atom_style colloid"_atom_style.html, colloid, -
-"COMPRESS"_Packages_details.html#PKG-COMPRESS, I/O compression, "dump */gz"_dump.html, -, sys
-"CORESHELL"_Packages_details.html#PKG-CORESHELL, adiabatic core/shell model, "Howto coreshell"_Howto_coreshell.html, coreshell, -
-"DIPOLE"_Packages_details.html#PKG-DIPOLE, point dipole particles, "pair_style dipole/cut"_pair_dipole.html, dipole, -
+"ASPHERE"_Packages_details.html#PKG-ASPHERE, aspherical particle models, "Howto spherical"_Howto_spherical.html, ellipse, no
+"BODY"_Packages_details.html#PKG-BODY, body-style particles, "Howto body"_Howto_body.html, body, no
+"CLASS2"_Packages_details.html#PKG-CLASS2, class 2 force fields, "pair_style lj/class2"_pair_class2.html, n/a, no
+"COLLOID"_Packages_details.html#PKG-COLLOID, colloidal particles, "atom_style colloid"_atom_style.html, colloid, no
+"COMPRESS"_Packages_details.html#PKG-COMPRESS, I/O compression, "dump */gz"_dump.html, n/a, sys
+"CORESHELL"_Packages_details.html#PKG-CORESHELL, adiabatic core/shell model, "Howto coreshell"_Howto_coreshell.html, coreshell, no
+"DIPOLE"_Packages_details.html#PKG-DIPOLE, point dipole particles, "pair_style dipole/cut"_pair_dipole.html, dipole, no
 "GPU"_Packages_details.html#PKG-GPU, GPU-enabled styles, "Section gpu"_Speed_gpu.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, int
-"GRANULAR"_Packages_details.html#PKG-GRANULAR, granular systems, "Howto granular"_Howto_granular.html, pour, -
+"GRANULAR"_Packages_details.html#PKG-GRANULAR, granular systems, "Howto granular"_Howto_granular.html, pour, no
 "KIM"_Packages_details.html#PKG-KIM, OpenKIM wrapper, "pair_style kim"_pair_kim.html, kim, ext
-"KOKKOS"_Packages_details.html#PKG-KOKKOS, Kokkos-enabled styles, "Speed kokkos"_Speed_kokkos.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, -
-"KSPACE"_Packages_details.html#PKG-KSPACE, long-range Coulombic solvers, "kspace_style"_kspace_style.html, peptide, -
+"KOKKOS"_Packages_details.html#PKG-KOKKOS, Kokkos-enabled styles, "Speed kokkos"_Speed_kokkos.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
+"KSPACE"_Packages_details.html#PKG-KSPACE, long-range Coulombic solvers, "kspace_style"_kspace_style.html, peptide, no
 "LATTE"_Packages_details.html#PKG-LATTE, quantum DFTB forces via LATTE, "fix latte"_fix_latte.html, latte, ext
-"MANYBODY"_Packages_details.html#PKG-MANYBODY, many-body potentials, "pair_style tersoff"_pair_tersoff.html, shear, -
-"MC"_Packages_details.html#PKG-MC, Monte Carlo options, "fix gcmc"_fix_gcmc.html, -, -
+"MANYBODY"_Packages_details.html#PKG-MANYBODY, many-body potentials, "pair_style tersoff"_pair_tersoff.html, shear, no
+"MC"_Packages_details.html#PKG-MC, Monte Carlo options, "fix gcmc"_fix_gcmc.html, n/a, no
 "MEAM"_Packages_details.html#PKG-MEAM, modified EAM potential, "pair_style meam"_pair_meam.html, meam, int
-"MISC"_Packages_details.html#PKG-MISC, miscellanous single-file commands, -, -, -
-"MOLECULE"_Packages_details.html#PKG-MOLECULE, molecular system force fields, "Howto bioFF"_Howto_bioFF.html, peptide, -
-"MPIIO"_Packages_details.html#PKG-MPIIO, MPI parallel I/O dump and restart, "dump"_dump.html, -, -
+"MISC"_Packages_details.html#PKG-MISC, miscellanous single-file commands, n/a, no, no
+"MOLECULE"_Packages_details.html#PKG-MOLECULE, molecular system force fields, "Howto bioFF"_Howto_bioFF.html, peptide, no
+"MPIIO"_Packages_details.html#PKG-MPIIO, MPI parallel I/O dump and restart, "dump"_dump.html, n/a, no
 "MSCG"_Packages_details.html#PKG-MSCG, multi-scale coarse-graining wrapper, "fix mscg"_fix_mscg.html, mscg, ext
-"OPT"_Packages_details.html#PKG-OPT, optimized pair styles, "Speed opt"_Speed_opt.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, -
-"PERI"_Packages_details.html#PKG-PERI, Peridynamics models, "pair_style peri"_pair_peri.html, peri, -
+"OPT"_Packages_details.html#PKG-OPT, optimized pair styles, "Speed opt"_Speed_opt.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
+"PERI"_Packages_details.html#PKG-PERI, Peridynamics models, "pair_style peri"_pair_peri.html, peri, no
 "POEMS"_Packages_details.html#PKG-POEMS, coupled rigid body motion, "fix poems"_fix_poems.html, rigid, int
 "PYTHON"_Packages_details.html#PKG-PYTHON, embed Python code in an input script, "python"_python.html, python, sys
-"QEQ"_Packages_details.html#PKG-QEQ, QEq charge equilibration, "fix qeq"_fix_qeq.html, qeq, -
+"QEQ"_Packages_details.html#PKG-QEQ, QEq charge equilibration, "fix qeq"_fix_qeq.html, qeq, no
 "REAX"_Packages_details.html#PKG-REAX, ReaxFF potential (Fortran), "pair_style reax"_pair_reax.html, reax, int
-"REPLICA"_Packages_details.html#PKG-REPLICA2, multi-replica methods, "Howto replica"_Howto_replica.html, tad, -
-"RIGID"_Packages_details.html#PKG-RIGID, rigid bodies and constraints, "fix rigid"_fix_rigid.html, rigid, -
-"SHOCK"_Packages_details.html#PKG-SHOCK, shock loading methods, "fix msst"_fix_msst.html, -, -
-"SNAP"_Packages_details.html#PKG-SNAP, quantum-fitted potential, "pair_style snap"_pair_snap.html, snap, -
-"SPIN"_Packages_details.html#PKG-SPIN, magnetic atomic spin dynamics, "Howto spins"_Howto_spins.html, SPIN, -
-"SRD"_Packages_details.html#PKG-SRD, stochastic rotation dynamics, "fix srd"_fix_srd.html, srd, -
-"VORONOI"_Packages_details.html#PKG-VORONOI, Voronoi tesselation, "compute voronoi/atom"_compute_voronoi_atom.html, -, ext :tb(ea=c,ca1=l)
+"REPLICA"_Packages_details.html#PKG-REPLICA2, multi-replica methods, "Howto replica"_Howto_replica.html, tad, no
+"RIGID"_Packages_details.html#PKG-RIGID, rigid bodies and constraints, "fix rigid"_fix_rigid.html, rigid, no
+"SHOCK"_Packages_details.html#PKG-SHOCK, shock loading methods, "fix msst"_fix_msst.html, n/a, no
+"SNAP"_Packages_details.html#PKG-SNAP, quantum-fitted potential, "pair_style snap"_pair_snap.html, snap, no
+"SPIN"_Packages_details.html#PKG-SPIN, magnetic atomic spin dynamics, "Howto spins"_Howto_spins.html, SPIN, no
+"SRD"_Packages_details.html#PKG-SRD, stochastic rotation dynamics, "fix srd"_fix_srd.html, srd, no
+"VORONOI"_Packages_details.html#PKG-VORONOI, Voronoi tesselation, "compute voronoi/atom"_compute_voronoi_atom.html, n/a, ext :tb(ea=c,ca1=l)
diff --git a/doc/src/Packages_user.txt b/doc/src/Packages_user.txt
index 7157ebead0ffaef0ec03508272f4a58b698719da..c1a52fd0d0d864493d49f849aa5c531fcb1cd40f 100644
--- a/doc/src/Packages_user.txt
+++ b/doc/src/Packages_user.txt
@@ -32,7 +32,7 @@ refers to the examples/USER/atc directory.  The "Library" column
 indicates whether an extra library is needed to build and use the
 package:
 
-dash = no library
+no  = no library
 sys = system library: you likely have it on your machine
 int = internal library: provided with LAMMPS, but you may need to build it
 ext = external library: you will need to download and install it on your machine :ul
@@ -40,35 +40,35 @@ ext = external library: you will need to download and install it on your machine
 Package, Description, Doc page, Example, Library
 "USER-ATC"_Packages_details.html#PKG-USER-ATC, atom-to-continuum coupling, "fix atc"_fix_atc.html, USER/atc, int
 "USER-AWPMD"_Packages_details.html#PKG-USER-AWPMD, wave-packet MD, "pair_style awpmd/cut"_pair_awpmd.html, USER/awpmd, int
-"USER-BOCS"_Packages_details.html#PKG-USER-BOCS, BOCS bottom up coarse graining, "fix bocs"_fix_bocs.html, USER/bocs, -
-"USER-CGDNA"_Packages_details.html#PKG-USER-CGDNA, coarse-grained DNA force fields, src/USER-CGDNA/README, USER/cgdna, -
-"USER-CGSDK"_Packages_details.html#PKG-USER-CGSDK, SDK coarse-graining model, "pair_style lj/sdk"_pair_sdk.html, USER/cgsdk, -
+"USER-BOCS"_Packages_details.html#PKG-USER-BOCS, BOCS bottom up coarse graining, "fix bocs"_fix_bocs.html, USER/bocs, no
+"USER-CGDNA"_Packages_details.html#PKG-USER-CGDNA, coarse-grained DNA force fields, src/USER-CGDNA/README, USER/cgdna, no
+"USER-CGSDK"_Packages_details.html#PKG-USER-CGSDK, SDK coarse-graining model, "pair_style lj/sdk"_pair_sdk.html, USER/cgsdk, no
 "USER-COLVARS"_Packages_details.html#PKG-USER-COLVARS, collective variables library, "fix colvars"_fix_colvars.html, USER/colvars, int
-"USER-DIFFRACTION"_Packages_details.html#PKG-USER-DIFFRACTION, virtual x-ray and electron diffraction,"compute xrd"_compute_xrd.html, USER/diffraction, -
-"USER-DPD"_Packages_details.html#PKG-USER-DPD, reactive dissipative particle dynamics, src/USER-DPD/README, USER/dpd, -
-"USER-DRUDE"_Packages_details.html#PKG-USER-DRUDE, Drude oscillators, "Howto drude"_Howto_drude.html, USER/drude, -
-"USER-EFF"_Packages_details.html#PKG-USER-EFF, electron force field,"pair_style eff/cut"_pair_eff.html, USER/eff, -
-"USER-FEP"_Packages_details.html#PKG-USER-FEP, free energy perturbation,"compute fep"_compute_fep.html, USER/fep, -
-"USER-H5MD"_Packages_details.html#PKG-USER-H5MD, dump output via HDF5,"dump h5md"_dump_h5md.html, -, ext
-"USER-INTEL"_Packages_details.html#PKG-USER-INTEL, optimized Intel CPU and KNL styles,"Speed intel"_Speed_intel.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, -
-"USER-LB"_Packages_details.html#PKG-USER-LB, Lattice Boltzmann fluid,"fix lb/fluid"_fix_lb_fluid.html, USER/lb, -
-"USER-MANIFOLD"_Packages_details.html#PKG-USER-MANIFOLD, motion on 2d surfaces,"fix manifoldforce"_fix_manifoldforce.html, USER/manifold, -
-"USER-MEAMC"_Packages_details.html#PKG-USER-MEAMC, modified EAM potential (C++), "pair_style meam/c"_pair_meam.html, meam, -
-"USER-MESO"_Packages_details.html#PKG-USER-MESO, mesoscale DPD models, "pair_style edpd"_pair_meso.html, USER/meso, -
-"USER-MGPT"_Packages_details.html#PKG-USER-MGPT, fast MGPT multi-ion potentials, "pair_style mgpt"_pair_mgpt.html, USER/mgpt, -
-"USER-MISC"_Packages_details.html#PKG-USER-MISC, single-file contributions, USER-MISC/README, USER/misc, -
-"USER-MOFFF"_Packages_details.html#PKG-USER-MOFFF, styles for "MOF-FF"_MOFplus force field, "pair_style buck6d/coul/gauss"_pair_buck6d_coul_gauss.html, USER/mofff, -
-"USER-MOLFILE"_Packages_details.html#PKG-USER-MOLFILE, "VMD"_vmd_home molfile plug-ins,"dump molfile"_dump_molfile.html, -, ext
-"USER-NETCDF"_Packages_details.html#PKG-USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, -, ext
-"USER-OMP"_Packages_details.html#PKG-USER-OMP, OpenMP-enabled styles,"Speed omp"_Speed_omp.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, -
-"USER-PHONON"_Packages_details.html#PKG-USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, -
+"USER-DIFFRACTION"_Packages_details.html#PKG-USER-DIFFRACTION, virtual x-ray and electron diffraction,"compute xrd"_compute_xrd.html, USER/diffraction, no
+"USER-DPD"_Packages_details.html#PKG-USER-DPD, reactive dissipative particle dynamics, src/USER-DPD/README, USER/dpd, no
+"USER-DRUDE"_Packages_details.html#PKG-USER-DRUDE, Drude oscillators, "Howto drude"_Howto_drude.html, USER/drude, no
+"USER-EFF"_Packages_details.html#PKG-USER-EFF, electron force field,"pair_style eff/cut"_pair_eff.html, USER/eff, no
+"USER-FEP"_Packages_details.html#PKG-USER-FEP, free energy perturbation,"compute fep"_compute_fep.html, USER/fep, no
+"USER-H5MD"_Packages_details.html#PKG-USER-H5MD, dump output via HDF5,"dump h5md"_dump_h5md.html, n/a, ext
+"USER-INTEL"_Packages_details.html#PKG-USER-INTEL, optimized Intel CPU and KNL styles,"Speed intel"_Speed_intel.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
+"USER-LB"_Packages_details.html#PKG-USER-LB, Lattice Boltzmann fluid,"fix lb/fluid"_fix_lb_fluid.html, USER/lb, no
+"USER-MANIFOLD"_Packages_details.html#PKG-USER-MANIFOLD, motion on 2d surfaces,"fix manifoldforce"_fix_manifoldforce.html, USER/manifold, no
+"USER-MEAMC"_Packages_details.html#PKG-USER-MEAMC, modified EAM potential (C++), "pair_style meam/c"_pair_meam.html, meam, no
+"USER-MESO"_Packages_details.html#PKG-USER-MESO, mesoscale DPD models, "pair_style edpd"_pair_meso.html, USER/meso, no
+"USER-MGPT"_Packages_details.html#PKG-USER-MGPT, fast MGPT multi-ion potentials, "pair_style mgpt"_pair_mgpt.html, USER/mgpt, no
+"USER-MISC"_Packages_details.html#PKG-USER-MISC, single-file contributions, USER-MISC/README, USER/misc, no
+"USER-MOFFF"_Packages_details.html#PKG-USER-MOFFF, styles for "MOF-FF"_MOFplus force field, "pair_style buck6d/coul/gauss"_pair_buck6d_coul_gauss.html, USER/mofff, no
+"USER-MOLFILE"_Packages_details.html#PKG-USER-MOLFILE, "VMD"_vmd_home molfile plug-ins,"dump molfile"_dump_molfile.html, n/a, ext
+"USER-NETCDF"_Packages_details.html#PKG-USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, n/a, ext
+"USER-OMP"_Packages_details.html#PKG-USER-OMP, OpenMP-enabled styles,"Speed omp"_Speed_omp.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
+"USER-PHONON"_Packages_details.html#PKG-USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, no
 "USER-QMMM"_Packages_details.html#PKG-USER-QMMM, QM/MM coupling,"fix qmmm"_fix_qmmm.html, USER/qmmm, ext
-"USER-QTB"_Packages_details.html#PKG-USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, -
+"USER-QTB"_Packages_details.html#PKG-USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, no
 "USER-QUIP"_Packages_details.html#PKG-USER-QUIP, QUIP/libatoms interface,"pair_style quip"_pair_quip.html, USER/quip, ext
-"USER-REAXC"_Packages_details.html#PKG-USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, -
+"USER-REAXC"_Packages_details.html#PKG-USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, no
 "USER-SMD"_Packages_details.html#PKG-USER-SMD, smoothed Mach dynamics,"SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, ext
-"USER-SMTBQ"_Packages_details.html#PKG-USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, -
-"USER-SPH"_Packages_details.html#PKG-USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, -
-"USER-TALLY"_Packages_details.html#PKG-USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, -
-"USER-UEF"_Packages_details.html#PKG-USER-UEF, extensional flow,"fix nvt/uef"_fix_nh_uef.html, USER/uef, -
-"USER-VTK"_Packages_details.html#PKG-USER-VTK, dump output via VTK, "compute vtk"_dump_vtk.html, -, ext :tb(ea=c,ca1=l)
+"USER-SMTBQ"_Packages_details.html#PKG-USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, no
+"USER-SPH"_Packages_details.html#PKG-USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, no
+"USER-TALLY"_Packages_details.html#PKG-USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, no
+"USER-UEF"_Packages_details.html#PKG-USER-UEF, extensional flow,"fix nvt/uef"_fix_nh_uef.html, USER/uef, no
+"USER-VTK"_Packages_details.html#PKG-USER-VTK, dump output via VTK, "compute vtk"_dump_vtk.html, n/a, ext :tb(ea=c,ca1=l)
diff --git a/doc/src/Speed_compare.txt b/doc/src/Speed_compare.txt
index 1a17b39c79b672e43062770253ae5e82fc1f998d..c93407515e5cebe6b8e278c62ea36b731c2b121b 100644
--- a/doc/src/Speed_compare.txt
+++ b/doc/src/Speed_compare.txt
@@ -9,65 +9,108 @@ Documentation"_ld - "LAMMPS Commands"_lc :c
 
 Comparison of various accelerator packages :h3
 
-NOTE: this section still needs to be re-worked with additional KOKKOS
-and USER-INTEL information.
-
 The next section compares and contrasts the various accelerator
 options, since there are multiple ways to perform OpenMP threading,
-run on GPUs, and run on Intel Xeon Phi coprocessors.
+run on GPUs, optimize for vector units on CPUs and run on Intel
+Xeon Phi (co-)processors.
 
-All 3 of these packages accelerate a LAMMPS calculation using NVIDIA
-hardware, but they do it in different ways.
+All of these packages can accelerate a LAMMPS calculation taking
+advantage of hardware features, but they do it in different ways
+and acceleration is not always guaranteed.
 
 As a consequence, for a particular simulation on specific hardware,
-one package may be faster than the other.  We give guidelines below,
-but the best way to determine which package is faster for your input
-script is to try both of them on your machine.  See the benchmarking
+one package may be faster than the other.  We give some guidelines
+below, but the best way to determine which package is faster for your
+input script is to try multiple of them on your machine and experiment
+with available performance tuning settings.  See the benchmarking
 section below for examples where this has been done.
 
 [Guidelines for using each package optimally:]
 
-The GPU package allows you to assign multiple CPUs (cores) to a single
-GPU (a common configuration for "hybrid" nodes that contain multicore
-CPU(s) and GPU(s)) and works effectively in this mode. :ulb,l
-
-The GPU package moves per-atom data (coordinates, forces)
-back-and-forth between the CPU and GPU every timestep.  The
-KOKKOS/CUDA package only does this on timesteps when a CPU calculation
-is required (e.g. to invoke a fix or compute that is non-GPU-ized).
-Hence, if you can formulate your input script to only use GPU-ized
-fixes and computes, and avoid doing I/O too often (thermo output, dump
-file snapshots, restart files), then the data transfer cost of the
-KOKKOS/CUDA package can be very low, causing it to run faster than the
-GPU package. :l
-
-The GPU package is often faster than the KOKKOS/CUDA package, if the
-number of atoms per GPU is smaller.  The crossover point, in terms of
-atoms/GPU at which the KOKKOS/CUDA package becomes faster depends
-strongly on the pair style.  For example, for a simple Lennard Jones
+Both, the GPU and the KOKKOS package allows you to assign multiple
+MPI ranks (= CPU cores) to the same GPU. For the GPU package, this
+can lead to a speedup through better utilization of the GPU (by
+overlapping computation and data transfer) and more efficient
+computation of the non-GPU accelerated parts of LAMMPS through MPI
+parallelization, as all system data is maintained and updated on
+the host. For KOKKOS, there is less to no benefit from this, due
+to its different memory management model, which tries to retain
+data on the GPU.
+ :ulb,l
+
+The GPU package moves per-atom data (coordinates, forces, and
+(optionally) neighbor list data, if not computed on the GPU) between
+the CPU and GPU at every timestep.  The KOKKOS/CUDA package only does
+this on timesteps when a CPU calculation is required (e.g. to invoke
+a fix or compute that is non-GPU-ized). Hence, if you can formulate
+your input script to only use GPU-ized fixes and computes, and avoid
+doing I/O too often (thermo output, dump file snapshots, restart files),
+then the data transfer cost of the KOKKOS/CUDA package can be very low,
+causing it to run faster than the GPU package. :l
+
+The GPU package is often faster than the KOKKOS/CUDA package, when the
+number of atoms per GPU is on the smaller side.  The crossover point,
+in terms of atoms/GPU at which the KOKKOS/CUDA package becomes faster
+depends strongly on the pair style.  For example, for a simple Lennard Jones
 system the crossover (in single precision) is often about 50K-100K
 atoms per GPU.  When performing double precision calculations the
 crossover point can be significantly smaller. :l
 
-Both packages compute bonded interactions (bonds, angles, etc) on the
-CPU.  If the GPU package is running with several MPI processes
+Both KOKKOS and GPU package compute bonded interactions (bonds, angles,
+etc) on the CPU.  If the GPU package is running with several MPI processes
 assigned to one GPU, the cost of computing the bonded interactions is
-spread across more CPUs and hence the GPU package can run faster. :l
-
-When using the GPU package with multiple CPUs assigned to one GPU, its
-performance depends to some extent on high bandwidth between the CPUs
-and the GPU.  Hence its performance is affected if full 16 PCIe lanes
-are not available for each GPU.  In HPC environments this can be the
-case if S2050/70 servers are used, where two devices generally share
-one PCIe 2.0 16x slot.  Also many multi-GPU mainboards do not provide
-full 16 lanes to each of the PCIe 2.0 16x slots. :l
+spread across more CPUs and hence the GPU package can run faster in these
+cases. :l
+
+When using LAMMPS with multiple MPI ranks assigned to the same GPU, its
+performance depends to some extent on the available bandwidth between
+the CPUs and the GPU. This can differ significantly based on the
+available bus technology, capability of the host CPU and mainboard,
+the wiring of the buses and whether switches are used to increase the
+number of available bus slots, or if GPUs are housed in an external
+enclosure.  This can become quite complex. :l
+
+To achieve significant acceleration through GPUs, both KOKKOS and GPU
+package require capable GPUs with fast on-device memory and efficient
+data transfer rates. This requests capable upper mid-level to high-end
+(desktop) GPUs. Using lower performance GPUs (e.g. on laptops) may
+result in a slowdown instead. :l
+
+For the GPU package, specifically when running in parallel with MPI,
+if it often more efficient to exclude the PPPM kspace style from GPU
+acceleration and instead run it - concurrently with a GPU accelerated
+pair style - on the CPU. This can often be easily achieved with placing
+a {suffix off} command before and a {suffix on} command after the
+{kspace_style pppm} command. :l
+
+The KOKKOS/OpenMP and USER-OMP package have different thread management
+strategies, which should result in USER-OMP being more efficient for a
+small number of threads with increasing overhead as the number of threads
+per MPI rank grows. The KOKKOS/OpenMP kernels have less overhead in that
+case, but have lower performance with few threads. :l
+
+The USER-INTEL package contains many options and settings for achieving
+additional performance on Intel hardware (CPU and accelerator cards), but
+to unlock this potential, an Intel compiler is required. The package code
+will compile with GNU gcc, but it will not be as efficient. :l
 :ule
 
-[Differences between the two packages:]
+[Differences between the GPU and KOKKOS packages:]
 
-The GPU package accelerates only pair force, neighbor list, and PPPM
-calculations. :ulb,l
+The GPU package accelerates only pair force, neighbor list, and (parts
+of) PPPM calculations. The KOKKOS package attempts to run most of the
+calculation on the GPU, but can transparently support non-accelerated
+code (with a performance penalty due to having data transfers between
+host and GPU). :ulb,l
 
 The GPU package requires neighbor lists to be built on the CPU when using
 exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
+
+The GPU package can be compiled for CUDA or OpenCL and thus supports
+both, Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically
+resulting in equal or better performance over OpenCL. :l
+
+OpenCL in the GPU package does theoretically also support Intel CPUs or
+Intel Xeon Phi, but the native support for those in KOKKOS (or USER-INTEL)
+is superior. :l
 :ule
diff --git a/src/USER-COLVARS/fix_colvars.h b/src/USER-COLVARS/fix_colvars.h
index 509eca5de35892b3094cf5749b7d6d088d7b0d18..3029ba9db541e095af265657d5f940998cce224f 100644
--- a/src/USER-COLVARS/fix_colvars.h
+++ b/src/USER-COLVARS/fix_colvars.h
@@ -34,7 +34,6 @@ FixStyle(colvars,FixColvars)
 #define LMP_FIX_COLVARS_H
 
 #include "fix.h"
-#include <vector>
 
 // forward declaration
 class colvarproxy_lammps;
@@ -77,13 +76,6 @@ class FixColvars : public Fix {
   int   num_coords;    // total number of atoms controlled by this fix
   tagint *taglist;     // list of all atom IDs referenced by colvars.
 
-  // TODO get rid of these
-  // std::vector<cvm::atom_pos> *coords; // coordinates of colvar atoms
-  // std::vector<cvm::rvector> *forces; // received forces of colvar atoms
-  // std::vector<cvm::rvector> *oforce; // old total forces of colvar atoms
-  // std::vector<cvm::real> *masses;
-  // std::vector<cvm::real> *charges;
-
   int   nmax;          // size of atom communication buffer.
   int   size_one;      // bytes per atom in communication buffer.
   struct commdata *comm_buf; // communication buffer