From cbc5b8bdae1d97960116bf73c82115b7f52666dd Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Tue, 31 May 2016 15:30:30 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15076
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/Makefile                                  |    2 +-
 src/USER-CUDA/Install.sh                      |  202 --
 src/USER-CUDA/README                          |   21 -
 src/USER-CUDA/atom_vec_angle_cuda.cpp         |  467 ---
 src/USER-CUDA/atom_vec_angle_cuda.h           |   69 -
 src/USER-CUDA/atom_vec_atomic_cuda.cpp        |  394 ---
 src/USER-CUDA/atom_vec_atomic_cuda.h          |   81 -
 src/USER-CUDA/atom_vec_charge_cuda.cpp        |  394 ---
 src/USER-CUDA/atom_vec_charge_cuda.h          |   69 -
 src/USER-CUDA/atom_vec_full_cuda.cpp          |  508 ---
 src/USER-CUDA/atom_vec_full_cuda.h            |   69 -
 src/USER-CUDA/comm_cuda.cpp                   | 1375 --------
 src/USER-CUDA/comm_cuda.h                     |   69 -
 src/USER-CUDA/compute_pe_cuda.cpp             |   61 -
 src/USER-CUDA/compute_pe_cuda.h               |   59 -
 src/USER-CUDA/compute_pressure_cuda.cpp       |   97 -
 src/USER-CUDA/compute_pressure_cuda.h         |   63 -
 src/USER-CUDA/compute_temp_cuda.cpp           |  215 --
 src/USER-CUDA/compute_temp_cuda.h             |   76 -
 src/USER-CUDA/compute_temp_partial_cuda.cpp   |  360 --
 src/USER-CUDA/compute_temp_partial_cuda.h     |   84 -
 src/USER-CUDA/cuda.cpp                        | 1067 ------
 src/USER-CUDA/cuda_data.h                     |  796 -----
 src/USER-CUDA/cuda_modify_flags.h             |   45 -
 src/USER-CUDA/cuda_neigh_list.cpp             |  184 --
 src/USER-CUDA/cuda_neigh_list.h               |   83 -
 src/USER-CUDA/domain_cuda.cpp                 |  345 --
 src/USER-CUDA/domain_cuda.h                   |   41 -
 src/USER-CUDA/fft3d_cuda.cpp                  |  609 ----
 src/USER-CUDA/fft3d_cuda.h                    |  148 -
 src/USER-CUDA/fft3d_wrap_cuda.cpp             |  111 -
 src/USER-CUDA/fft3d_wrap_cuda.h               |   68 -
 src/USER-CUDA/fix_addforce_cuda.cpp           |  193 --
 src/USER-CUDA/fix_addforce_cuda.h             |   64 -
 src/USER-CUDA/fix_aveforce_cuda.cpp           |  262 --
 src/USER-CUDA/fix_aveforce_cuda.h             |   68 -
 src/USER-CUDA/fix_enforce2d_cuda.cpp          |  171 -
 src/USER-CUDA/fix_enforce2d_cuda.h            |   55 -
 src/USER-CUDA/fix_freeze_cuda.cpp             |  137 -
 src/USER-CUDA/fix_freeze_cuda.h               |   57 -
 src/USER-CUDA/fix_gravity_cuda.cpp            |  180 -
 src/USER-CUDA/fix_gravity_cuda.h              |   60 -
 src/USER-CUDA/fix_nh_cuda.cpp                 | 2072 ------------
 src/USER-CUDA/fix_nh_cuda.h                   |  126 -
 src/USER-CUDA/fix_npt_cuda.cpp                |   75 -
 src/USER-CUDA/fix_npt_cuda.h                  |   36 -
 src/USER-CUDA/fix_nve_cuda.cpp                |  157 -
 src/USER-CUDA/fix_nve_cuda.h                  |   63 -
 src/USER-CUDA/fix_nvt_cuda.cpp                |   52 -
 src/USER-CUDA/fix_nvt_cuda.h                  |   36 -
 src/USER-CUDA/fix_set_force_cuda.cpp          |  184 --
 src/USER-CUDA/fix_set_force_cuda.h            |   63 -
 src/USER-CUDA/fix_shake_cuda.cpp              | 2885 -----------------
 src/USER-CUDA/fix_shake_cuda.h                |  130 -
 src/USER-CUDA/fix_temp_berendsen_cuda.cpp     |  219 --
 src/USER-CUDA/fix_temp_berendsen_cuda.h       |   58 -
 src/USER-CUDA/fix_temp_rescale_cuda.cpp       |  224 --
 src/USER-CUDA/fix_temp_rescale_cuda.h         |   61 -
 src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp |  237 --
 src/USER-CUDA/fix_temp_rescale_limit_cuda.h   |   61 -
 src/USER-CUDA/fix_viscous_cuda.cpp            |  105 -
 src/USER-CUDA/fix_viscous_cuda.h              |   55 -
 src/USER-CUDA/modify_cuda.cpp                 |  437 ---
 src/USER-CUDA/modify_cuda.h                   |   83 -
 src/USER-CUDA/neigh_full_cuda.cpp             |  307 --
 src/USER-CUDA/neighbor_cuda.cpp               |  240 --
 src/USER-CUDA/neighbor_cuda.h                 |   39 -
 src/USER-CUDA/pair_born_coul_long_cuda.cpp    |  183 --
 src/USER-CUDA/pair_born_coul_long_cuda.h      |   57 -
 src/USER-CUDA/pair_buck_coul_cut_cuda.cpp     |  170 -
 src/USER-CUDA/pair_buck_coul_cut_cuda.h       |   57 -
 src/USER-CUDA/pair_buck_coul_long_cuda.cpp    |  181 --
 src/USER-CUDA/pair_buck_coul_long_cuda.h      |   57 -
 src/USER-CUDA/pair_buck_cuda.cpp              |  166 -
 src/USER-CUDA/pair_buck_cuda.h                |   57 -
 src/USER-CUDA/pair_eam_alloy_cuda.cpp         |  326 --
 src/USER-CUDA/pair_eam_alloy_cuda.h           |   44 -
 src/USER-CUDA/pair_eam_cuda.cpp               |  265 --
 src/USER-CUDA/pair_eam_cuda.h                 |   80 -
 src/USER-CUDA/pair_eam_fs_cuda.cpp            |  335 --
 src/USER-CUDA/pair_eam_fs_cuda.h              |   44 -
 src/USER-CUDA/pair_gran_hooke_cuda.cpp        |  250 --
 src/USER-CUDA/pair_gran_hooke_cuda.h          |   57 -
 src/USER-CUDA/pair_lj96_cut_cuda.cpp          |  179 -
 src/USER-CUDA/pair_lj96_cut_cuda.h            |   57 -
 .../pair_lj_charmm_coul_charmm_cuda.cpp       |  188 --
 .../pair_lj_charmm_coul_charmm_cuda.h         |   63 -
 ...ir_lj_charmm_coul_charmm_implicit_cuda.cpp |  183 --
 ...pair_lj_charmm_coul_charmm_implicit_cuda.h |   62 -
 .../pair_lj_charmm_coul_long_cuda.cpp         |  196 --
 src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h |   62 -
 .../pair_lj_class2_coul_cut_cuda.cpp          |  162 -
 src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h  |   57 -
 .../pair_lj_class2_coul_long_cuda.cpp         |  175 -
 src/USER-CUDA/pair_lj_class2_coul_long_cuda.h |   57 -
 src/USER-CUDA/pair_lj_class2_cuda.cpp         |  167 -
 src/USER-CUDA/pair_lj_class2_cuda.h           |   57 -
 src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp   |  162 -
 src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h     |   57 -
 src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp |  163 -
 src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h   |   57 -
 src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp  |  216 --
 src/USER-CUDA/pair_lj_cut_coul_long_cuda.h    |   57 -
 src/USER-CUDA/pair_lj_cut_cuda.cpp            |  179 -
 src/USER-CUDA/pair_lj_cut_cuda.h              |   57 -
 .../pair_lj_cut_experimental_cuda.cpp         |  178 -
 src/USER-CUDA/pair_lj_cut_experimental_cuda.h |   57 -
 src/USER-CUDA/pair_lj_expand_cuda.cpp         |  180 -
 src/USER-CUDA/pair_lj_expand_cuda.h           |   57 -
 .../pair_lj_gromacs_coul_gromacs_cuda.cpp     |  194 --
 .../pair_lj_gromacs_coul_gromacs_cuda.h       |   68 -
 src/USER-CUDA/pair_lj_gromacs_cuda.cpp        |  177 -
 src/USER-CUDA/pair_lj_gromacs_cuda.h          |   68 -
 src/USER-CUDA/pair_lj_sdk_coul_long_cuda.cpp  |  193 --
 src/USER-CUDA/pair_lj_sdk_coul_long_cuda.h    |   59 -
 src/USER-CUDA/pair_lj_sdk_cuda.cpp            |  184 --
 src/USER-CUDA/pair_lj_sdk_cuda.h              |   65 -
 src/USER-CUDA/pair_lj_smooth_cuda.cpp         |  177 -
 src/USER-CUDA/pair_lj_smooth_cuda.h           |   68 -
 src/USER-CUDA/pair_morse_cuda.cpp             |  177 -
 src/USER-CUDA/pair_morse_cuda.h               |   57 -
 src/USER-CUDA/pair_sw_cuda.cpp                |  207 --
 src/USER-CUDA/pair_sw_cuda.h                  |   66 -
 src/USER-CUDA/pair_tersoff_cuda.cpp           |  204 --
 src/USER-CUDA/pair_tersoff_cuda.h             |   66 -
 src/USER-CUDA/pair_tersoff_zbl_cuda.cpp       |  220 --
 src/USER-CUDA/pair_tersoff_zbl_cuda.h         |   53 -
 src/USER-CUDA/pppm_cuda.cpp                   | 1420 --------
 src/USER-CUDA/pppm_cuda.h                     |  113 -
 src/USER-CUDA/pppm_old.cpp                    | 2839 ----------------
 src/USER-CUDA/pppm_old.h                      |  271 --
 src/USER-CUDA/user_cuda.h                     |  159 -
 src/USER-CUDA/verlet_cuda.cpp                 | 1230 -------
 src/USER-CUDA/verlet_cuda.h                   |   63 -
 134 files changed, 1 insertion(+), 31295 deletions(-)
 delete mode 100755 src/USER-CUDA/Install.sh
 delete mode 100644 src/USER-CUDA/README
 delete mode 100644 src/USER-CUDA/atom_vec_angle_cuda.cpp
 delete mode 100644 src/USER-CUDA/atom_vec_angle_cuda.h
 delete mode 100644 src/USER-CUDA/atom_vec_atomic_cuda.cpp
 delete mode 100644 src/USER-CUDA/atom_vec_atomic_cuda.h
 delete mode 100644 src/USER-CUDA/atom_vec_charge_cuda.cpp
 delete mode 100644 src/USER-CUDA/atom_vec_charge_cuda.h
 delete mode 100644 src/USER-CUDA/atom_vec_full_cuda.cpp
 delete mode 100644 src/USER-CUDA/atom_vec_full_cuda.h
 delete mode 100644 src/USER-CUDA/comm_cuda.cpp
 delete mode 100644 src/USER-CUDA/comm_cuda.h
 delete mode 100644 src/USER-CUDA/compute_pe_cuda.cpp
 delete mode 100644 src/USER-CUDA/compute_pe_cuda.h
 delete mode 100644 src/USER-CUDA/compute_pressure_cuda.cpp
 delete mode 100644 src/USER-CUDA/compute_pressure_cuda.h
 delete mode 100644 src/USER-CUDA/compute_temp_cuda.cpp
 delete mode 100644 src/USER-CUDA/compute_temp_cuda.h
 delete mode 100644 src/USER-CUDA/compute_temp_partial_cuda.cpp
 delete mode 100644 src/USER-CUDA/compute_temp_partial_cuda.h
 delete mode 100644 src/USER-CUDA/cuda.cpp
 delete mode 100644 src/USER-CUDA/cuda_data.h
 delete mode 100644 src/USER-CUDA/cuda_modify_flags.h
 delete mode 100644 src/USER-CUDA/cuda_neigh_list.cpp
 delete mode 100644 src/USER-CUDA/cuda_neigh_list.h
 delete mode 100644 src/USER-CUDA/domain_cuda.cpp
 delete mode 100644 src/USER-CUDA/domain_cuda.h
 delete mode 100644 src/USER-CUDA/fft3d_cuda.cpp
 delete mode 100644 src/USER-CUDA/fft3d_cuda.h
 delete mode 100644 src/USER-CUDA/fft3d_wrap_cuda.cpp
 delete mode 100644 src/USER-CUDA/fft3d_wrap_cuda.h
 delete mode 100644 src/USER-CUDA/fix_addforce_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_addforce_cuda.h
 delete mode 100644 src/USER-CUDA/fix_aveforce_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_aveforce_cuda.h
 delete mode 100644 src/USER-CUDA/fix_enforce2d_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_enforce2d_cuda.h
 delete mode 100644 src/USER-CUDA/fix_freeze_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_freeze_cuda.h
 delete mode 100644 src/USER-CUDA/fix_gravity_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_gravity_cuda.h
 delete mode 100644 src/USER-CUDA/fix_nh_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_nh_cuda.h
 delete mode 100644 src/USER-CUDA/fix_npt_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_npt_cuda.h
 delete mode 100644 src/USER-CUDA/fix_nve_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_nve_cuda.h
 delete mode 100644 src/USER-CUDA/fix_nvt_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_nvt_cuda.h
 delete mode 100644 src/USER-CUDA/fix_set_force_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_set_force_cuda.h
 delete mode 100644 src/USER-CUDA/fix_shake_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_shake_cuda.h
 delete mode 100644 src/USER-CUDA/fix_temp_berendsen_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_temp_berendsen_cuda.h
 delete mode 100644 src/USER-CUDA/fix_temp_rescale_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_temp_rescale_cuda.h
 delete mode 100644 src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_temp_rescale_limit_cuda.h
 delete mode 100644 src/USER-CUDA/fix_viscous_cuda.cpp
 delete mode 100644 src/USER-CUDA/fix_viscous_cuda.h
 delete mode 100644 src/USER-CUDA/modify_cuda.cpp
 delete mode 100644 src/USER-CUDA/modify_cuda.h
 delete mode 100644 src/USER-CUDA/neigh_full_cuda.cpp
 delete mode 100644 src/USER-CUDA/neighbor_cuda.cpp
 delete mode 100644 src/USER-CUDA/neighbor_cuda.h
 delete mode 100644 src/USER-CUDA/pair_born_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_born_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_buck_coul_cut_cuda.h
 delete mode 100644 src/USER-CUDA/pair_buck_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_buck_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_buck_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_buck_cuda.h
 delete mode 100644 src/USER-CUDA/pair_eam_alloy_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_eam_alloy_cuda.h
 delete mode 100644 src/USER-CUDA/pair_eam_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_eam_cuda.h
 delete mode 100644 src/USER-CUDA/pair_eam_fs_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_eam_fs_cuda.h
 delete mode 100644 src/USER-CUDA/pair_gran_hooke_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_gran_hooke_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj96_cut_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj96_cut_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_class2_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_class2_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_cut_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_cut_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_cut_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_cut_experimental_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_expand_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_expand_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_gromacs_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_gromacs_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_sdk_coul_long_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_sdk_coul_long_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_sdk_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_sdk_cuda.h
 delete mode 100644 src/USER-CUDA/pair_lj_smooth_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_lj_smooth_cuda.h
 delete mode 100644 src/USER-CUDA/pair_morse_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_morse_cuda.h
 delete mode 100644 src/USER-CUDA/pair_sw_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_sw_cuda.h
 delete mode 100644 src/USER-CUDA/pair_tersoff_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_tersoff_cuda.h
 delete mode 100644 src/USER-CUDA/pair_tersoff_zbl_cuda.cpp
 delete mode 100644 src/USER-CUDA/pair_tersoff_zbl_cuda.h
 delete mode 100644 src/USER-CUDA/pppm_cuda.cpp
 delete mode 100644 src/USER-CUDA/pppm_cuda.h
 delete mode 100755 src/USER-CUDA/pppm_old.cpp
 delete mode 100644 src/USER-CUDA/pppm_old.h
 delete mode 100644 src/USER-CUDA/user_cuda.h
 delete mode 100644 src/USER-CUDA/verlet_cuda.cpp
 delete mode 100644 src/USER-CUDA/verlet_cuda.h

diff --git a/src/Makefile b/src/Makefile
index 16d3e5326c..85b212d539 100755
--- a/src/Makefile
+++ b/src/Makefile
@@ -46,7 +46,7 @@ PACKAGE = asphere body class2 colloid compress coreshell dipole gpu \
 	  kokkos kspace manybody mc meam misc molecule mpiio opt peri poems \
 	  python qeq reax replica rigid shock snap srd voronoi
 
-PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars user-cuda \
+PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \
 	   user-diffraction user-dpd user-drude user-eff user-fep user-h5md \
 	   user-intel user-lb user-manifold user-mgpt \
 	   user-misc user-molfile user-omp user-phonon user-qmmm user-qtb \
diff --git a/src/USER-CUDA/Install.sh b/src/USER-CUDA/Install.sh
deleted file mode 100755
index 96345160a6..0000000000
--- a/src/USER-CUDA/Install.sh
+++ /dev/null
@@ -1,202 +0,0 @@
-# Install/unInstall package files in LAMMPS
-# mode = 0/1/2 for uninstall/install/update
-
-mode=$1
-
-# arg1 = file, arg2 = file it depends on
-
-action () {
-  if (test $mode = 0) then
-    rm -f ../$1
-  elif (! cmp -s $1 ../$1) then
-    if (test -z "$2" || test -e ../$2) then
-      cp $1 ..
-      if (test $mode = 2) then
-        echo "  updating src/$1"
-      fi
-    fi
-  elif (test -n "$2") then
-    if (test ! -e ../$2) then
-      rm -f ../$1
-    fi
-  fi
-}
-
-# force rebuild of files with LMP_USER_CUDA switch
-
-touch ../accelerator_cuda.h
-
-# list of files with optional dependencies
-
-action atom_vec_angle_cuda.cpp atom_vec_angle.cpp
-action atom_vec_angle_cuda.h atom_vec_angle.cpp
-action atom_vec_atomic_cuda.cpp
-action atom_vec_atomic_cuda.h
-action atom_vec_charge_cuda.cpp
-action atom_vec_charge_cuda.h
-action atom_vec_full_cuda.cpp atom_vec_full.cpp
-action atom_vec_full_cuda.h atom_vec_full.cpp
-action comm_cuda.cpp
-action comm_cuda.h
-action compute_pe_cuda.cpp
-action compute_pe_cuda.h
-action compute_pressure_cuda.cpp
-action compute_pressure_cuda.h
-action compute_temp_cuda.cpp
-action compute_temp_cuda.h
-action compute_temp_partial_cuda.cpp
-action compute_temp_partial_cuda.h
-action cuda.cpp
-action cuda_data.h
-action cuda_modify_flags.h
-action cuda_neigh_list.cpp
-action cuda_neigh_list.h
-action domain_cuda.cpp
-action domain_cuda.h
-action fft3d_cuda.cpp pppm.cpp
-action fft3d_cuda.h pppm.cpp
-action fft3d_wrap_cuda.cpp pppm.cpp
-action fft3d_wrap_cuda.h pppm.cpp
-action fix_addforce_cuda.cpp
-action fix_addforce_cuda.h
-action fix_aveforce_cuda.cpp
-action fix_aveforce_cuda.h
-action fix_enforce2d_cuda.cpp
-action fix_enforce2d_cuda.h
-action fix_freeze_cuda.cpp fix_freeze.cpp
-action fix_freeze_cuda.h fix_freeze.cpp
-action fix_gravity_cuda.cpp
-action fix_gravity_cuda.h
-action fix_nh_cuda.cpp
-action fix_nh_cuda.h
-action fix_npt_cuda.cpp
-action fix_npt_cuda.h
-action fix_nve_cuda.cpp
-action fix_nve_cuda.h
-action fix_nvt_cuda.cpp
-action fix_nvt_cuda.h
-action fix_set_force_cuda.cpp
-action fix_set_force_cuda.h
-action fix_shake_cuda.cpp
-action fix_shake_cuda.h
-action fix_temp_berendsen_cuda.cpp
-action fix_temp_berendsen_cuda.h
-action fix_temp_rescale_cuda.cpp
-action fix_temp_rescale_cuda.h
-action fix_temp_rescale_limit_cuda.cpp
-action fix_temp_rescale_limit_cuda.h
-action fix_viscous_cuda.cpp
-action fix_viscous_cuda.h
-action modify_cuda.cpp
-action modify_cuda.h
-action neigh_full_cuda.cpp
-action neighbor_cuda.cpp
-action neighbor_cuda.h
-action pair_born_coul_long_cuda.cpp pair_born_coul_long.cpp
-action pair_born_coul_long_cuda.h pair_born_coul_long.cpp
-action pair_buck_coul_cut_cuda.cpp
-action pair_buck_coul_cut_cuda.h
-action pair_buck_coul_long_cuda.cpp pair_buck_coul_long.cpp
-action pair_buck_coul_long_cuda.h pair_buck_coul_long.cpp
-action pair_buck_cuda.cpp
-action pair_buck_cuda.h
-action pair_eam_alloy_cuda.cpp pair_eam_alloy.cpp
-action pair_eam_alloy_cuda.h pair_eam_alloy.cpp
-action pair_eam_cuda.cpp pair_eam.cpp
-action pair_eam_cuda.h pair_eam.cpp
-action pair_eam_fs_cuda.cpp pair_eam_fs.cpp
-action pair_eam_fs_cuda.h pair_eam_fs.cpp
-action pair_gran_hooke_cuda.cpp pair_gran_hooke.cpp
-action pair_gran_hooke_cuda.h pair_gran_hooke.cpp
-action pair_lj96_cut_cuda.cpp
-action pair_lj96_cut_cuda.h
-action pair_lj_charmm_coul_charmm_cuda.cpp pair_lj_charmm_coul_charmm.cpp
-action pair_lj_charmm_coul_charmm_cuda.h pair_lj_charmm_coul_charmm.cpp
-action pair_lj_charmm_coul_charmm_implicit_cuda.cpp pair_lj_charmm_coul_charmm_implicit.cpp
-action pair_lj_charmm_coul_charmm_implicit_cuda.h pair_lj_charmm_coul_charmm_implicit.cpp
-action pair_lj_charmm_coul_long_cuda.cpp pair_lj_charmm_coul_long.cpp
-action pair_lj_charmm_coul_long_cuda.h pair_lj_charmm_coul_long.cpp
-action pair_lj_class2_coul_cut_cuda.cpp pair_lj_class2_coul_cut.cpp
-action pair_lj_class2_coul_cut_cuda.h pair_lj_class2_coul_cut.cpp
-action pair_lj_class2_coul_long_cuda.cpp pair_lj_class2_coul_long.cpp
-action pair_lj_class2_coul_long_cuda.h pair_lj_class2_coul_long.cpp
-action pair_lj_class2_cuda.cpp pair_lj_class2.cpp
-action pair_lj_class2_cuda.h pair_lj_class2.cpp
-action pair_lj_cut_coul_cut_cuda.cpp
-action pair_lj_cut_coul_cut_cuda.h
-action pair_lj_cut_coul_debye_cuda.cpp
-action pair_lj_cut_coul_debye_cuda.h
-action pair_lj_cut_coul_long_cuda.cpp pair_lj_cut_coul_long.cpp
-action pair_lj_cut_coul_long_cuda.h pair_lj_cut_coul_long.cpp
-action pair_lj_cut_cuda.cpp
-action pair_lj_cut_cuda.h
-action pair_lj_cut_experimental_cuda.cpp
-action pair_lj_cut_experimental_cuda.h
-action pair_lj_expand_cuda.cpp
-action pair_lj_expand_cuda.h
-action pair_lj_gromacs_coul_gromacs_cuda.cpp
-action pair_lj_gromacs_coul_gromacs_cuda.h
-action pair_lj_gromacs_cuda.cpp
-action pair_lj_gromacs_cuda.h
-action pair_lj_sdk_coul_long_cuda.cpp pair_lj_sdk_coul_long.cpp
-action pair_lj_sdk_coul_long_cuda.h pair_lj_sdk_coul_long.cpp
-action pair_lj_sdk_cuda.cpp pair_lj_sdk.cpp
-action pair_lj_sdk_cuda.h pair_lj_sdk.cpp
-action pair_lj_smooth_cuda.cpp
-action pair_lj_smooth_cuda.h
-action pair_morse_cuda.cpp
-action pair_morse_cuda.h
-action pair_sw_cuda.cpp pair_sw.cpp
-action pair_sw_cuda.h pair_sw.cpp
-action pair_tersoff_cuda.cpp pair_tersoff.cpp
-action pair_tersoff_cuda.h pair_tersoff.cpp
-action pair_tersoff_zbl_cuda.cpp pair_tersoff_zbl.cpp
-action pair_tersoff_zbl_cuda.h pair_tersoff_zbl.cpp
-action pppm_cuda.cpp pppm.cpp
-action pppm_cuda.h pppm.cpp
-action pppm_old.cpp pppm.cpp
-action pppm_old.h pppm.cpp
-action user_cuda.h
-action verlet_cuda.cpp
-action verlet_cuda.h
-
-# edit 2 Makefile.package files to include/exclude package info
-
-if (test $1 = 1) then
-
-  if (test -e ../Makefile.package) then
-    sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
-    sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
-    sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -DLMP_USER_CUDA |' ../Makefile.package
-    sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda |' ../Makefile.package
-    sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSINC =[ \t]*|&$(user-cuda_SYSINC) |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(user-cuda_SYSLIB) |' ../Makefile.package
-    sed -i -e 's|^PKG_SYSPATH =[ \t]*|&$(user-cuda_SYSPATH) |' ../Makefile.package
-  fi
-
-  if (test -e ../Makefile.package.settings) then
-    sed -i -e '/^include.*cuda.*$/d' ../Makefile.package.settings
-    # multiline form needed for BSD sed on Macs
-    sed -i -e '4 i \
-include ..\/..\/lib\/cuda\/Makefile.lammps
-' ../Makefile.package.settings
-
-  fi
-
-elif (test $1 = 0) then
-  # need to delete a bunch of depenency files because they indirectly depend on user_cuda.h
-  for f in input.d output.d pair.d fix_omp.d
-  do \
-    rm -f ../Obj_*/$f
-  done
-  if (test -e ../Makefile.package) then
-    sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
-    sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
-  fi
-
-  if (test -e ../Makefile.package.settings) then
-    sed -i -e '/^include.*cuda.*$/d' ../Makefile.package.settings
-  fi
-
-fi
diff --git a/src/USER-CUDA/README b/src/USER-CUDA/README
deleted file mode 100644
index b9d2c07f8a..0000000000
--- a/src/USER-CUDA/README
+++ /dev/null
@@ -1,21 +0,0 @@
-This package provides acceleration of various LAMMPS pair styles, fix
-styles, compute styles, and long-range Coulombics via PPPM for NVIDIA
-GPUs.
- 
-See this section of the manual to get started:
-
-doc/Section_accelerate.html, sub-section 5.4
-
-There are example scripts for using this package in
-examples/USER/cuda.
-
-This package uses an external library in lib/cuda which must be
-compiled before making LAMMPS.  See the lib/cuda/README file and the
-LAMMPS manual for information on building LAMMPS with external
-libraries.  The settings in the Makefile.lammps file in that directory
-must be correct for LAMMPS to build correctly with this package
-installed.
-
-The person who created this package is Christian Trott at the
-University of Technology Ilmenau, Germany (christian.trott at
-tu-ilmenau.de).  Contact him directly if you have questions.
diff --git a/src/USER-CUDA/atom_vec_angle_cuda.cpp b/src/USER-CUDA/atom_vec_angle_cuda.cpp
deleted file mode 100644
index c393d58824..0000000000
--- a/src/USER-CUDA/atom_vec_angle_cuda.cpp
+++ /dev/null
@@ -1,467 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "atom_vec_angle_cuda.h"
-#include "comm_cuda_cu.h"
-#include "atom_vec_angle_cuda_cu.h"
-#include "atom.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "universe.h"
-#include "comm.h"
-
-using namespace LAMMPS_NS;
-
-#define BUFFACTOR 1.5
-#define BUFEXTRA 1000
-#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image molecule
-
-#define BUF_CFLOAT double
-/* ---------------------------------------------------------------------- */
-
-AtomVecAngleCuda::AtomVecAngleCuda(LAMMPS *lmp) : AtomVecAngle(lmp)
-{
-   cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-   maxsend=0;
-   cudable=true;
-   cuda_init_done=false;
-   max_nsend=0;
-   cu_copylist=NULL;
-   copylist=NULL;
-   copylist2=NULL;
-}
-
-void AtomVecAngleCuda::grow_copylist(int new_max_nsend)
-{
-  max_nsend=new_max_nsend;
-  delete cu_copylist;
-  delete [] copylist2;
-  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
-  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
-  copylist2 = new int[max_nsend];
-  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
-}
-
-void AtomVecAngleCuda::grow_send(int n,double** buf_send,int flag)  //need to be able to grow the comm send_buffer since the array sahll be copied from the gpu in whole
-{
-  int old_maxsend=*maxsend+BUFEXTRA;
-  *maxsend = static_cast<int> (BUFFACTOR * n);
-  if (flag)
-  {
-    if(cuda->pinned)
-    {
-      double* tmp = new double[old_maxsend];
-      memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double));
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-      memcpy(*buf_send,tmp,old_maxsend*sizeof(double));
-      delete [] tmp;
-    }
-    else
-    {
-     *buf_send = (double *)
-      memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double),
-                       "comm:buf_send");
-    }
-  }
-  else {
-   if(cuda->pinned)
-    {
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-    }
-    else
-    {
-      memory->sfree(*buf_send);
-      *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double),
-                                          "comm:buf_send");
-    }
-  }
-}
-
-void AtomVecAngleCuda::grow_both(int n)
-{
-  if(cuda->finished_setup)
-  {
-    cuda->cu_special->upload();
-    cuda->cu_nspecial->upload();
-    cuda->downloadAll();
-  }
-  AtomVecAngle::grow(n);
-  if(cuda->finished_setup)
-  {
-    cuda->checkResize();
-    cuda->uploadAll();
-  }
-}
-
-int AtomVecAngleCuda::pack_comm(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAngle::pack_comm(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-int AtomVecAngleCuda::pack_comm_vel(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAngle::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAngleCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAngle::unpack_comm(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
-}
-
-void AtomVecAngleCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAngle::unpack_comm_vel(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAngleCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAngle::pack_reverse(n,first,buf);
-
-  int i,m,last;
-  cuda->cu_f->download();
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    buf[m++] = f[i][0];
-    buf[m++] = f[i][1];
-    buf[m++] = f[i][2];
-  }
-  cuda->cu_f->upload();
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAngleCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          {AtomVecAngle::unpack_reverse(n,list,buf); return;}
-
-  int i,j,m;
-
-  m = 0;
-  cuda->cu_f->download();
-  for (i = 0; i < n; i++) {
-    j = list[i];
-    f[j][0] += buf[m++];
-    f[j][1] += buf[m++];
-    f[j][2] += buf[m++];
-  }
-  cuda->cu_f->upload();
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAngleCuda::pack_border(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAngle::pack_border(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecAngleCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-  return m;
-}
-
-int AtomVecAngleCuda::pack_border_vel(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAngle::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecAngleCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAngleCuda::unpack_border(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAngle::unpack_border(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecAngleCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-void AtomVecAngleCuda::unpack_border_vel(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAngle::unpack_border_vel(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecAngleCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-/* ----------------------------------------------------------------------
-   pack data for atom I for sending to another proc
-   xyz must be 1st 3 values, so comm::exchange() can test on them
-------------------------------------------------------------------------- */
-
-
-int AtomVecAngleCuda::pack_exchange(int dim, double *buf)
-{
-  if(cuda->oncpu)
-          return AtomVecAngle::pack_exchange(dim,buf);
-
-  if(not cuda_init_done||domain->box_change)
-  {
-          Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
-          cuda_init_done=true;
-  }
-  double** buf_pointer=(double**) buf;
-  if(*maxsend<atom->nghost || *buf_pointer==NULL)
-  {
-          grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
-          *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
-  }
-
-  if(max_nsend==0) grow_copylist(200);
-
-  int nsend_atoms = Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-  if(nsend_atoms*NCUDAEXCHANGE>*maxsend)
-  {
-          grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
-          Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-  }
-
-  int nlocal=atom->nlocal-nsend_atoms;
-
-  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i>=nlocal) copylist2[i-nlocal]=-1;
-  }
-
-  int actpos=0;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i<nlocal)
-          {
-            while(copylist2[actpos]==-1) actpos++;
-              copylist[j-1]=nlocal+actpos;
-            actpos++;
-          }
-  }
-  cu_copylist->upload();
-
-  cuda->shared_data.atom.nlocal=nlocal;
-
-  int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
-
-  my_times time1,time2;
-  my_gettime(CLOCK_REALTIME,&time1);
-
-  double* buf_p=*buf_pointer;
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    int i=static_cast <int> (buf_p[j+1]);
-    int nextra=0;
-    int k;
-    buf_p[m++] = num_bond[i];
-    for (k = 0; k < num_bond[i]; k++) {
-      buf_p[m++] = bond_type[i][k];
-      buf_p[m++] = bond_atom[i][k];
-    }
-    nextra+=2*num_bond[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = num_angle[i];
-    for (k = 0; k < num_angle[i]; k++) {
-      buf_p[m++] = angle_type[i][k];
-      buf_p[m++] = angle_atom1[i][k];
-      buf_p[m++] = angle_atom2[i][k];
-      buf_p[m++] = angle_atom3[i][k];
-    }
-    nextra+=4*num_angle[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = nspecial[i][0];
-    buf_p[m++] = nspecial[i][1];
-    buf_p[m++] = nspecial[i][2];
-    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
-    nextra+=nspecial[i][2]+3;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    if (atom->nextra_grow)
-      for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      {
-        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
-        m+=dm;
-            nextra+=dm;
-            if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i,1);
-            if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-      }
-
-    if(i<nlocal)AtomVecAngle::copy(copylist[j],i,1);
-    (*buf_pointer)[j+1] = nextra;
-  }
-
-          my_gettime(CLOCK_REALTIME,&time2);
-          cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-  (*buf_pointer)[0] = nsend_atoms;
-  atom->nlocal-=nsend_atoms;
-  cuda->shared_data.atom.update_nlocal=2;
- //printf("End Pack Exchange\n");
-  if(m==1) return 0;
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAngleCuda::unpack_exchange(double *buf)
-{
-// printf("Begin UnPack Exchange\n");
-  if(cuda->oncpu)
-          return AtomVecAngle::unpack_exchange(buf);
-
-  int dim=cuda->shared_data.exchange_dim;
-  if(domain->box_change)
-  Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
-
-  int mfirst=0;
-  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
-  {
-  int nlocal = atom->nlocal;
-  int nsend_atoms=static_cast<int> (buf[0]);
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-
-  if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); //ensure there is enough space on device to unpack data
-  int naccept = Cuda_AtomVecAngleCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
-  cu_copylist->download();
-  int m = nsend_atoms*NCUDAEXCHANGE + 1;
-  nlocal+=naccept;
-
-  my_times time1,time2;
-  my_gettime(CLOCK_REALTIME,&time1);
-
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    if(copylist[j]>-1)
-    {
-           int k;
-          int i=copylist[j];
-      num_bond[i] = static_cast<int> (buf[m++]);
-      for (k = 0; k < num_bond[i]; k++) {
-            bond_type[i][k] = static_cast<int> (buf[m++]);
-            bond_atom[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            num_angle[i] = static_cast<int> (buf[m++]);
-            for (k = 0; k < num_angle[i]; k++) {
-            angle_type[i][k] = static_cast<int> (buf[m++]);
-            angle_atom1[i][k] = static_cast<int> (buf[m++]);
-            angle_atom2[i][k] = static_cast<int> (buf[m++]);
-            angle_atom3[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            nspecial[i][0] = static_cast<int> (buf[m++]);
-            nspecial[i][1] = static_cast<int> (buf[m++]);
-            nspecial[i][2] = static_cast<int> (buf[m++]);
-            for (k = 0; k < nspecial[i][2]; k++)
-            special[i][k] = static_cast<int> (buf[m++]);
-
-            if (atom->nextra_grow)
-        for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-                                      m += modify->fix[atom->extra_grow[iextra]]->
-                                        unpack_exchange(i,&buf[m]);
-
-    }
-    else
-    m+=static_cast <int> (buf[j+1]);
-  }
-
-          my_gettime(CLOCK_REALTIME,&time2);
-          cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-  cuda->shared_data.atom.nlocal=nlocal;
-  cuda->shared_data.atom.update_nlocal=2;
-  atom->nlocal=nlocal;
-  mfirst+=m;
-  buf=&buf[m];
-  }
-  return mfirst;
-}
diff --git a/src/USER-CUDA/atom_vec_angle_cuda.h b/src/USER-CUDA/atom_vec_angle_cuda.h
deleted file mode 100644
index 13913da1c0..0000000000
--- a/src/USER-CUDA/atom_vec_angle_cuda.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef ATOM_CLASS
-
-AtomStyle(angle/cuda,AtomVecAngleCuda)
-
-#else
-
-#ifndef LMP_ATOM_VEC_ANGLE_CUDA_H
-#define LMP_ATOM_VEC_ANGLE_CUDA_H
-
-#include "atom_vec_angle.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class AtomVecAngleCuda : public AtomVecAngle {
- public:
-  AtomVecAngleCuda(class LAMMPS *);
-  virtual ~AtomVecAngleCuda() {}
-  void grow_copylist(int n);
-  void grow_send(int n,double** buf_send,int flag);
-  void grow_both(int n);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
- private:
-  class Cuda *cuda;
-  bool cuda_init_done;
-  int* copylist;
-  int* copylist2;
-  cCudaData<int, int, xx >* cu_copylist;
-  int max_nsend;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.cpp b/src/USER-CUDA/atom_vec_atomic_cuda.cpp
deleted file mode 100644
index c54f7d3127..0000000000
--- a/src/USER-CUDA/atom_vec_atomic_cuda.cpp
+++ /dev/null
@@ -1,394 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "atom_vec_atomic_cuda.h"
-#include "comm_cuda_cu.h"
-#include "atom_vec_atomic_cuda_cu.h"
-#include "atom.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "comm.h"
-
-using namespace LAMMPS_NS;
-
-#define BUFFACTOR 1.5
-#define BUFEXTRA 1000
-#define NCUDAEXCHANGE 11 //nextra x y z vx vy vz tag type mask image
-
-
-#define BUF_CFLOAT double
-/* ---------------------------------------------------------------------- */
-
-AtomVecAtomicCuda::AtomVecAtomicCuda(LAMMPS *lmp) : AtomVecAtomic(lmp)
-{
-   cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-   maxsend=0;
-   cudable=true;
-   cuda_init_done=false;
-   max_nsend=0;
-   cu_copylist=NULL;
-   copylist=NULL;
-   copylist2=NULL;
-}
-
-void AtomVecAtomicCuda::grow_copylist(int new_max_nsend)
-{
-  max_nsend=new_max_nsend;
-  delete cu_copylist;
-  delete [] copylist2;
-  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
-  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
-  copylist2 = new int[max_nsend];
-  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
-}
-
-void AtomVecAtomicCuda::grow_send(int n,double** buf_send,int flag)
-{
-  int old_maxsend=*maxsend+BUFEXTRA;
-  *maxsend = static_cast<int> (BUFFACTOR * n);
-  if (flag)
-  {
-    if(cuda->pinned)
-    {
-      double* tmp = new double[old_maxsend];
-      memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double));
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-      memcpy(*buf_send,tmp,old_maxsend*sizeof(double));
-      delete [] tmp;
-    }
-    else
-    {
-     *buf_send = (double *)
-      memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double),
-                       "comm:buf_send");
-    }
-  }
-  else {
-   if(cuda->pinned)
-    {
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-    }
-    else
-    {
-      memory->sfree(*buf_send);
-      *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double),
-                                          "comm:buf_send");
-    }
-  }
-}
-
-void AtomVecAtomicCuda::grow_both(int n)
-{
-  if(cuda->finished_setup)
-  cuda->downloadAll();
-  AtomVecAtomic::grow(n);
-  if(cuda->finished_setup)
-  {
-    cuda->checkResize();
-    cuda->uploadAll();
-  }
-}
-
-int AtomVecAtomicCuda::pack_comm(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAtomic::pack_comm(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-int AtomVecAtomicCuda::pack_comm_vel(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAtomic::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicCuda::unpack_comm(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAtomic::unpack_comm(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
-}
-
-void AtomVecAtomicCuda::unpack_comm_vel(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAtomic::unpack_comm_vel(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
-}
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicCuda::pack_reverse(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAtomic::pack_reverse(n,first,buf);
-
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    buf[m++] = f[i][0];
-    buf[m++] = f[i][1];
-    buf[m++] = f[i][2];
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicCuda::unpack_reverse(int n, int *list, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          {AtomVecAtomic::unpack_reverse(n,list,buf); return;}
-
-  int i,j,m;
-
-  m = 0;
-  for (i = 0; i < n; i++) {
-    j = list[i];
-    f[j][0] += buf[m++];
-    f[j][1] += buf[m++];
-    f[j][2] += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicCuda::pack_border(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAtomic::pack_border(n,iswap,buf,pbc_flag,pbc);
-
-  int m = Cuda_AtomVecAtomicCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-
-int AtomVecAtomicCuda::pack_border_vel(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecAtomic::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
-
-  int m = Cuda_AtomVecAtomicCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicCuda::unpack_border(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAtomic::unpack_border(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecAtomicCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-
-}
-
-void AtomVecAtomicCuda::unpack_border_vel(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecAtomic::unpack_border_vel(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecAtomicCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-/* ----------------------------------------------------------------------
-   pack data for atom I for sending to another proc
-   xyz must be 1st 3 values, so comm::exchange() can test on them
-------------------------------------------------------------------------- */
-
-
-int AtomVecAtomicCuda::pack_exchange(int dim, double *buf)
-{
-  if(cuda->oncpu)
-          return AtomVecAtomic::pack_exchange(dim,buf);
-
-  if(not cuda_init_done||domain->box_change)
-  {
-          Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
-          cuda_init_done=true;
-  }
-  double** buf_pointer=(double**) buf;
-  if(*maxsend<atom->nghost || *buf_pointer==NULL)
-  {
-          grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
-          *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
-  }
-
-  if(max_nsend==0) grow_copylist(200);
-
-  int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-
-  if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);}
-  if(nsend_atoms*NCUDAEXCHANGE>*maxsend)
-  {
-          grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
-           Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-  }
-
-  int nlocal=atom->nlocal-nsend_atoms;
-
-  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i>=nlocal) copylist2[i-nlocal]=-1;
-  }
-
-  int actpos=0;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i<nlocal)
-          {
-            while(copylist2[actpos]==-1) actpos++;
-              copylist[j-1]=nlocal+actpos;
-            actpos++;
-          }
-  }
-  cu_copylist->upload();
-
-  cuda->shared_data.atom.nlocal=nlocal;
-
-  int m = Cuda_AtomVecAtomicCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
-  if (atom->nextra_grow)
-  for(int j=0;j<nsend_atoms;j++)
-  {
-      int i=static_cast <int> ((*buf_pointer)[j+1]);
-      int nextra=0;
-      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
-
-        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m]));
-        m+=dm;
-                  nextra+=dm;
-                  if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i,1);
-        if(m>*maxsend)  grow_send(m,buf_pointer,1);
-      }
-      (*buf_pointer)[j+1] = nextra;
-
-  }
-
-  (*buf_pointer)[0] = nsend_atoms;
-  atom->nlocal-=nsend_atoms;
-  cuda->shared_data.atom.update_nlocal=2;
-
-  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicCuda::unpack_exchange(double *buf)
-{
-  //printf("Unpack Begin\n");
-  if(cuda->oncpu)
-          return AtomVecAtomic::unpack_exchange(buf);
-
-  int dim=cuda->shared_data.exchange_dim;
-  if(domain->box_change)
-  Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
-
-  int mfirst=0;
-  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
-  {
-  int nlocal = atom->nlocal;
-
-  int nsend_atoms=static_cast<int> (buf[0]);
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-
-  if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost);
-  int naccept = Cuda_AtomVecAtomicCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
-  cu_copylist->download();
-  int m = nsend_atoms*NCUDAEXCHANGE + 1;
-  nlocal+=naccept;
-  if (atom->nextra_grow)
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    if(copylist[j]>-1)
-    {
-                    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-                                      m += modify->fix[atom->extra_grow[iextra]]->
-                                        unpack_exchange(copylist[j],&buf[m]);
-    }
-    else
-    {
-      m+=static_cast <int> (buf[j+1]);
-    }
-  }
-  cuda->shared_data.atom.nlocal=nlocal;
-  if(atom->nlocal!=nlocal)
-  cuda->shared_data.atom.update_nlocal=2;
-  atom->nlocal=nlocal;
-  mfirst+=m;
-  buf=&buf[m];
-  }
-  return mfirst;
-}
diff --git a/src/USER-CUDA/atom_vec_atomic_cuda.h b/src/USER-CUDA/atom_vec_atomic_cuda.h
deleted file mode 100644
index dabbd9a215..0000000000
--- a/src/USER-CUDA/atom_vec_atomic_cuda.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-#ifdef ATOM_CLASS
-
-AtomStyle(atomic/cuda,AtomVecAtomicCuda)
-
-#else
-
-#ifndef LMP_ATOM_VEC_ATOMIC_CUDA_H
-#define LMP_ATOM_VEC_ATOMIC_CUDA_H
-
-#include "atom_vec_atomic.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class AtomVecAtomicCuda : public AtomVecAtomic {
- public:
-  AtomVecAtomicCuda(class LAMMPS *);
-  virtual ~AtomVecAtomicCuda() {}
-  void grow_copylist(int n);
-  void grow_send(int n,double** buf_send,int flag);
-  void grow_both(int n);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
- private:
-  class Cuda *cuda;
-  bool cuda_init_done;
-  int* copylist;
-  int* copylist2;
-  cCudaData<int, int, xx >* cu_copylist;
-  int max_nsend;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/atom_vec_charge_cuda.cpp b/src/USER-CUDA/atom_vec_charge_cuda.cpp
deleted file mode 100644
index 07140f3e84..0000000000
--- a/src/USER-CUDA/atom_vec_charge_cuda.cpp
+++ /dev/null
@@ -1,394 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "atom_vec_charge_cuda.h"
-#include "comm_cuda_cu.h"
-#include "atom_vec_charge_cuda_cu.h"
-#include "atom.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "comm.h"
-
-using namespace LAMMPS_NS;
-
-#define BUFFACTOR 1.5
-#define BUFEXTRA 1000
-#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image q
-
-#define BUF_CFLOAT double
-/* ---------------------------------------------------------------------- */
-
-AtomVecChargeCuda::AtomVecChargeCuda(LAMMPS *lmp) : AtomVecCharge(lmp)
-{
-   cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-   maxsend=0;
-   cudable=true;
-   cuda_init_done=false;
-   max_nsend=0;
-   cu_copylist=NULL;
-   copylist=NULL;
-   copylist2=NULL;
-}
-
-void AtomVecChargeCuda::grow_copylist(int new_max_nsend)
-{
-  max_nsend=new_max_nsend;
-  delete cu_copylist;
-  delete [] copylist2;
-  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
-  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
-  copylist2 = new int[max_nsend];
-  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
-}
-
-void AtomVecChargeCuda::grow_send(int n,double** buf_send,int flag)  //need to be able to grow the comm send_buffer since the array sahll be copied from the gpu in whole
-{
-  int old_maxsend=*maxsend+BUFEXTRA;
-  *maxsend = static_cast<int> (BUFFACTOR * n);
-  if (flag)
-  {
-    if(cuda->pinned)
-    {
-      double* tmp = new double[old_maxsend];
-      memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double));
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-      memcpy(*buf_send,tmp,old_maxsend*sizeof(double));
-      delete [] tmp;
-    }
-    else
-    {
-     *buf_send = (double *)
-      memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double),
-                       "comm:buf_send");
-    }
-  }
-  else {
-   if(cuda->pinned)
-    {
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-    }
-    else
-    {
-      memory->sfree(*buf_send);
-      *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double),
-                                          "comm:buf_send");
-    }
-  }
-}
-
-void AtomVecChargeCuda::grow_both(int n)
-{
-  if(cuda->finished_setup)
-  cuda->downloadAll();
-  AtomVecCharge::grow(n);
-  if(cuda->finished_setup)
-  {
-    cuda->checkResize();
-    cuda->uploadAll();
-  }
-}
-
-int AtomVecChargeCuda::pack_comm(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecCharge::pack_comm(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-int AtomVecChargeCuda::pack_comm_vel(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecCharge::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecCharge::unpack_comm(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
-}
-
-void AtomVecChargeCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecCharge::unpack_comm_vel(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecCharge::pack_reverse(n,first,buf);
-
-  int i,m,last;
-  cuda->cu_f->download();
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    buf[m++] = f[i][0];
-    buf[m++] = f[i][1];
-    buf[m++] = f[i][2];
-  }
-  cuda->cu_f->upload();
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          {AtomVecCharge::unpack_reverse(n,list,buf); return;}
-
-  int i,j,m;
-
-  m = 0;
-  cuda->cu_f->download();
-  for (i = 0; i < n; i++) {
-    j = list[i];
-    f[j][0] += buf[m++];
-    f[j][1] += buf[m++];
-    f[j][2] += buf[m++];
-  }
-  cuda->cu_f->upload();
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeCuda::pack_border(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecCharge::pack_border(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecChargeCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-
-int AtomVecChargeCuda::pack_border_vel(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecCharge::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecChargeCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeCuda::unpack_border(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecCharge::unpack_border(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecChargeCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-void AtomVecChargeCuda::unpack_border_vel(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecCharge::unpack_border_vel(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecChargeCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-/* ----------------------------------------------------------------------
-   pack data for atom I for sending to another proc
-   xyz must be 1st 3 values, so comm::exchange() can test on them
-------------------------------------------------------------------------- */
-
-
-int AtomVecChargeCuda::pack_exchange(int dim, double *buf)
-{
-  if(cuda->oncpu)
-          return AtomVecCharge::pack_exchange(dim,buf);
-
-  if(not cuda_init_done||domain->box_change)
-  {
-          Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
-          cuda_init_done=true;
-  }
-  double** buf_pointer=(double**) buf;
-  if(*maxsend<atom->nghost || *buf_pointer==NULL)
-  {
-          grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
-          *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
-  }
-
-  if(max_nsend==0) grow_copylist(200);
-
-  int nsend_atoms = Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-  if(nsend_atoms*NCUDAEXCHANGE>*maxsend)
-  {
-          grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
-          Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-  }
-
-  int nlocal=atom->nlocal-nsend_atoms;
-
-  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i>=nlocal) copylist2[i-nlocal]=-1;
-  }
-
-  int actpos=0;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i<nlocal)
-          {
-            while(copylist2[actpos]==-1) actpos++;
-              copylist[j-1]=nlocal+actpos;
-            actpos++;
-          }
-  }
-  cu_copylist->upload();
-
-  cuda->shared_data.atom.nlocal=nlocal;
-
-  int m = Cuda_AtomVecChargeCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
-
-  if (atom->nextra_grow)
-  for(int j=0;j<nsend_atoms;j++)
-  {
-      int i=static_cast <int> ((*buf_pointer)[j+1]);
-      int nextra=0;
-      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
-
-        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m]));
-        m+=dm;
-                  nextra+=dm;
-                  if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i,1);
-        if(m>*maxsend)  grow_send(m,buf_pointer,1);
-      }
-      (*buf_pointer)[j+1] = nextra;
-  }
-
-  (*buf_pointer)[0] = nsend_atoms;
-  atom->nlocal-=nsend_atoms;
-  cuda->shared_data.atom.update_nlocal=2;
-
-  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeCuda::unpack_exchange(double *buf)
-{
-  if(cuda->oncpu)
-          return AtomVecCharge::unpack_exchange(buf);
-
-  int dim=cuda->shared_data.exchange_dim;
-  if(domain->box_change)
-  Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
-
-  int mfirst=0;
-  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
-  {
-  int nlocal = atom->nlocal;
-  int nsend_atoms=static_cast<int> (buf[0]);
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-
-  if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost);
-  int naccept = Cuda_AtomVecChargeCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
-  cu_copylist->download();
-  int m = nsend_atoms*NCUDAEXCHANGE + 1;
-  nlocal+=naccept;
-  if (atom->nextra_grow)
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    if(copylist[j]>-1)
-    {
-                    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-                                      m += modify->fix[atom->extra_grow[iextra]]->
-                                        unpack_exchange(copylist[j],&buf[m]);
-    }
-    else
-    m+=static_cast <int> (buf[j+1]);
-  }
-  cuda->shared_data.atom.nlocal=nlocal;
-  cuda->shared_data.atom.update_nlocal=2;
-  atom->nlocal=nlocal;
-  mfirst+=m;
-  buf=&buf[m];
-  }
-  return mfirst;
-}
diff --git a/src/USER-CUDA/atom_vec_charge_cuda.h b/src/USER-CUDA/atom_vec_charge_cuda.h
deleted file mode 100644
index 25d431c917..0000000000
--- a/src/USER-CUDA/atom_vec_charge_cuda.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef ATOM_CLASS
-
-AtomStyle(charge/cuda,AtomVecChargeCuda)
-
-#else
-
-#ifndef LMP_ATOM_VEC_CHARGE_CUDA_H
-#define LMP_ATOM_VEC_CHARGE_CUDA_H
-
-#include "atom_vec_charge.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class AtomVecChargeCuda : public AtomVecCharge {
- public:
-  AtomVecChargeCuda(class LAMMPS *);
-  virtual ~AtomVecChargeCuda() {}
-  void grow_copylist(int n);
-  void grow_send(int n,double** buf_send,int flag);
-  void grow_both(int n);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
- private:
-  class Cuda *cuda;
-  bool cuda_init_done;
-  int* copylist;
-  int* copylist2;
-  cCudaData<int, int, xx >* cu_copylist;
-  int max_nsend;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/atom_vec_full_cuda.cpp b/src/USER-CUDA/atom_vec_full_cuda.cpp
deleted file mode 100644
index dda12603db..0000000000
--- a/src/USER-CUDA/atom_vec_full_cuda.cpp
+++ /dev/null
@@ -1,508 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "atom_vec_full_cuda.h"
-#include "comm_cuda_cu.h"
-#include "atom_vec_full_cuda_cu.h"
-#include "atom.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "universe.h"
-#include "comm.h"
-
-using namespace LAMMPS_NS;
-
-#define BUFFACTOR 1.5
-#define BUFEXTRA 1000
-#define NCUDAEXCHANGE 13 //nextra x y z vx vy vz tag type mask image q molecule
-
-#define BUF_CFLOAT double
-/* ---------------------------------------------------------------------- */
-
-AtomVecFullCuda::AtomVecFullCuda(LAMMPS *lmp) :
-  AtomVecFull(lmp)
-{
-   cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-   maxsend=0;
-   cudable=true;
-   cuda_init_done=false;
-   max_nsend=0;
-   cu_copylist=NULL;
-   copylist=NULL;
-   copylist2=NULL;
-}
-
-void AtomVecFullCuda::grow_copylist(int new_max_nsend)
-{
-  max_nsend=new_max_nsend;
-  delete cu_copylist;
-  delete [] copylist2;
-  if(copylist) CudaWrapper_FreePinnedHostData((void*) copylist);
-  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
-  copylist2 = new int[max_nsend];
-  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
-}
-
-void AtomVecFullCuda::grow_send(int n,double** buf_send,int flag)  //need to be able to grow the comm send_buffer since the array sahll be copied from the gpu in whole
-{
-  int old_maxsend=*maxsend+BUFEXTRA;
-  *maxsend = static_cast<int> (BUFFACTOR * n);
-  if (flag)
-  {
-    if(cuda->pinned)
-    {
-      double* tmp = new double[old_maxsend];
-      memcpy((void*) tmp,(void*) *buf_send,old_maxsend*sizeof(double));
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-      memcpy(*buf_send,tmp,old_maxsend*sizeof(double));
-      delete [] tmp;
-    }
-    else
-    {
-     *buf_send = (double *)
-      memory->srealloc(*buf_send,(*maxsend+BUFEXTRA)*sizeof(double),
-                       "comm:buf_send");
-    }
-  }
-  else {
-    if(cuda->pinned)
-    {
-      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
-      *buf_send = (double*) CudaWrapper_AllocPinnedHostData((*maxsend+BUFEXTRA)*sizeof(double),false);
-    }
-    else
-    {
-      memory->sfree(*buf_send);
-      *buf_send = (double *) memory->smalloc((*maxsend+BUFEXTRA)*sizeof(double),
-                                          "comm:buf_send");
-    }
-  }
-}
-
-void AtomVecFullCuda::grow_both(int n)
-{
-  if(cuda->finished_setup)
-  {
-    cuda->cu_special->upload();
-    cuda->cu_nspecial->upload();
-    cuda->downloadAll();
-  }
-  AtomVecFull::grow(n);
-  if(cuda->finished_setup)
-  {
-    cuda->checkResize();
-    cuda->uploadAll();
-  }
-}
-
-int AtomVecFullCuda::pack_comm(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecFull::pack_comm(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-int AtomVecFullCuda::pack_comm_vel(int n, int* iswap, double *buf,
-                             int pbc_flag, int *pbc) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecFull::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && m)
-          m=(m+1)*sizeof(X_CFLOAT)/sizeof(double);
-        return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullCuda::unpack_comm(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecFull::unpack_comm(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
-}
-
-void AtomVecFullCuda::unpack_comm_vel(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only positions are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecFull::unpack_comm_vel(n,first,buf); return;}
-
-  Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullCuda::pack_reverse(int n, int first, double *buf) //usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecFull::pack_reverse(n,first,buf);
-
-  int i,m,last;
-  cuda->cu_f->download();
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    buf[m++] = f[i][0];
-    buf[m++] = f[i][1];
-    buf[m++] = f[i][2];
-  }
-  cuda->cu_f->upload();
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullCuda::unpack_reverse(int n, int *list, double *buf)//usually this should not be called since comm->communicate handles the communication if only forces are exchanged
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-          {AtomVecFull::unpack_reverse(n,list,buf); return;}
-
-  int i,j,m;
-
-  m = 0;
-  cuda->cu_f->download();
-  for (i = 0; i < n; i++) {
-    j = list[i];
-    f[j][0] += buf[m++];
-    f[j][1] += buf[m++];
-    f[j][2] += buf[m++];
-  }
-  cuda->cu_f->upload();
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullCuda::pack_border(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecFull::pack_border(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecFullCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-  return m;
-}
-
-int AtomVecFullCuda::pack_border_vel(int n, int *iswap, double *buf,
-                               int pbc_flag, int *pbc)
-{
- if(not cuda->finished_setup || cuda->oncpu)
-          return AtomVecFull::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
-
-        int m = Cuda_AtomVecFullCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullCuda::unpack_border(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecFull::unpack_border(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecFullCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-void AtomVecFullCuda::unpack_border_vel(int n, int first, double *buf)
-{
-  if(not cuda->finished_setup || cuda->oncpu)
-           {AtomVecFull::unpack_border_vel(n,first,buf); return;}
-  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) //ensure there is enough space on device to unpack data
-  {
-          grow_both(0);
-  }
-  int flag=Cuda_AtomVecFullCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
-  if(flag) {printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");}
-}
-
-/* ----------------------------------------------------------------------
-   pack data for atom I for sending to another proc
-   xyz must be 1st 3 values, so comm::exchange() can test on them
-------------------------------------------------------------------------- */
-
-
-int AtomVecFullCuda::pack_exchange(int dim, double *buf)
-{
-  if(cuda->oncpu)
-          return AtomVecFull::pack_exchange(dim,buf);
-
-  if(not cuda_init_done||domain->box_change)
-  {
-          Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
-          cuda_init_done=true;
-  }
-  double** buf_pointer=(double**) buf;
-  if(*maxsend<atom->nghost || *buf_pointer==NULL)
-  {
-          grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
-          *maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
-  }
-
-  if(max_nsend==0) grow_copylist(200);
-
-  int nsend_atoms = Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-  if(nsend_atoms*NCUDAEXCHANGE>*maxsend)
-  {
-          grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
-          Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
-  }
-
-  int nlocal=atom->nlocal-nsend_atoms;
-
-  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i>=nlocal) copylist2[i-nlocal]=-1;
-  }
-
-  int actpos=0;
-  for(int j=1;j<nsend_atoms+1;j++)
-  {
-          int i = static_cast <int> ((*buf_pointer)[j]);
-          if(i<nlocal)
-          {
-            while(copylist2[actpos]==-1) actpos++;
-              copylist[j-1]=nlocal+actpos;
-            actpos++;
-          }
-  }
-  cu_copylist->upload();
-
-  cuda->shared_data.atom.nlocal=nlocal;
-
-  int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
-
-  my_times time1,time2;
-  my_gettime(CLOCK_REALTIME,&time1);
-
-  double* buf_p=*buf_pointer;
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    int i=static_cast <int> (buf_p[j+1]);
-    int nextra=0;
-    int k;
-    buf_p[m++] = num_bond[i];
-    for (k = 0; k < num_bond[i]; k++) {
-      buf_p[m++] = bond_type[i][k];
-      buf_p[m++] = bond_atom[i][k];
-    }
-    nextra+=2*num_bond[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = num_angle[i];
-    for (k = 0; k < num_angle[i]; k++) {
-      buf_p[m++] = angle_type[i][k];
-      buf_p[m++] = angle_atom1[i][k];
-      buf_p[m++] = angle_atom2[i][k];
-      buf_p[m++] = angle_atom3[i][k];
-    }
-    nextra+=4*num_angle[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = num_dihedral[i];
-    for (k = 0; k < num_dihedral[i]; k++) {
-      buf_p[m++] = dihedral_type[i][k];
-      buf_p[m++] = dihedral_atom1[i][k];
-      buf_p[m++] = dihedral_atom2[i][k];
-      buf_p[m++] = dihedral_atom3[i][k];
-      buf_p[m++] = dihedral_atom4[i][k];
-    }
-    nextra+=5*num_dihedral[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = num_improper[i];
-    for (k = 0; k < num_improper[i]; k++) {
-      buf_p[m++] = improper_type[i][k];
-      buf_p[m++] = improper_atom1[i][k];
-      buf_p[m++] = improper_atom2[i][k];
-      buf_p[m++] = improper_atom3[i][k];
-      buf_p[m++] = improper_atom4[i][k];
-    }
-    nextra+=5*num_improper[i]+1;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    buf_p[m++] = nspecial[i][0];
-    buf_p[m++] = nspecial[i][1];
-    buf_p[m++] = nspecial[i][2];
-    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
-    nextra+=nspecial[i][2]+3;
-    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-
-    if (atom->nextra_grow)
-      for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      {
-        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
-        m+=dm;
-                  nextra+=dm;
-                  if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i,1);
-        if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
-      }
-
-    if(i<nlocal)AtomVecFull::copy(copylist[j],i,1);
-    (*buf_pointer)[j+1] = nextra;
-  }
-
-          my_gettime(CLOCK_REALTIME,&time2);
-          cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-  (*buf_pointer)[0] = nsend_atoms;
-  atom->nlocal-=nsend_atoms;
-  cuda->shared_data.atom.update_nlocal=2;
- //printf("End Pack Exchange\n");
-  if(m==1) return 0;
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullCuda::unpack_exchange(double *buf)
-{
-// printf("Begin UnPack Exchange\n");
-  if(cuda->oncpu)
-          return AtomVecFull::unpack_exchange(buf);
-
-  int dim=cuda->shared_data.exchange_dim;
-  if(domain->box_change)
-  Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
-
-  int mfirst=0;
-  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
-  {
-  int nlocal = atom->nlocal;
-  int nsend_atoms=static_cast<int> (buf[0]);
-  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
-
-  if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost); //ensure there is enough space on device to unpack data
-  int naccept = Cuda_AtomVecFullCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
-  cu_copylist->download();
-  int m = nsend_atoms*NCUDAEXCHANGE + 1;
-  nlocal+=naccept;
-
-  my_times time1,time2;
-  my_gettime(CLOCK_REALTIME,&time1);
-
-  for(int j=0;j<nsend_atoms;j++)
-  {
-    if(copylist[j]>-1)
-    {
-           int k;
-          int i=copylist[j];
-      num_bond[i] = static_cast<int> (buf[m++]);
-      for (k = 0; k < num_bond[i]; k++) {
-            bond_type[i][k] = static_cast<int> (buf[m++]);
-            bond_atom[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            num_angle[i] = static_cast<int> (buf[m++]);
-            for (k = 0; k < num_angle[i]; k++) {
-            angle_type[i][k] = static_cast<int> (buf[m++]);
-            angle_atom1[i][k] = static_cast<int> (buf[m++]);
-            angle_atom2[i][k] = static_cast<int> (buf[m++]);
-            angle_atom3[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            num_dihedral[i] = static_cast<int> (buf[m++]);
-            for (k = 0; k < num_dihedral[i]; k++) {
-            dihedral_type[i][k] = static_cast<int> (buf[m++]);
-            dihedral_atom1[i][k] = static_cast<int> (buf[m++]);
-            dihedral_atom2[i][k] = static_cast<int> (buf[m++]);
-            dihedral_atom3[i][k] = static_cast<int> (buf[m++]);
-            dihedral_atom4[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            num_improper[i] = static_cast<int> (buf[m++]);
-            for (k = 0; k < num_improper[i]; k++) {
-            improper_type[i][k] = static_cast<int> (buf[m++]);
-            improper_atom1[i][k] = static_cast<int> (buf[m++]);
-            improper_atom2[i][k] = static_cast<int> (buf[m++]);
-            improper_atom3[i][k] = static_cast<int> (buf[m++]);
-            improper_atom4[i][k] = static_cast<int> (buf[m++]);
-            }
-
-            nspecial[i][0] = static_cast<int> (buf[m++]);
-            nspecial[i][1] = static_cast<int> (buf[m++]);
-            nspecial[i][2] = static_cast<int> (buf[m++]);
-            for (k = 0; k < nspecial[i][2]; k++)
-            special[i][k] = static_cast<int> (buf[m++]);
-
-            if (atom->nextra_grow)
-        for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-                                      m += modify->fix[atom->extra_grow[iextra]]->
-                                        unpack_exchange(i,&buf[m]);
-
-    }
-    else
-    m+=static_cast <int> (buf[j+1]);
-  }
-
-          my_gettime(CLOCK_REALTIME,&time2);
-          cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-  cuda->shared_data.atom.nlocal=nlocal;
-  cuda->shared_data.atom.update_nlocal=2;
-  atom->nlocal=nlocal;
-  mfirst+=m;
-  buf=&buf[m];
-  }
-  return mfirst;
-}
diff --git a/src/USER-CUDA/atom_vec_full_cuda.h b/src/USER-CUDA/atom_vec_full_cuda.h
deleted file mode 100644
index 4ce1b24a51..0000000000
--- a/src/USER-CUDA/atom_vec_full_cuda.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef ATOM_CLASS
-
-AtomStyle(full/cuda,AtomVecFullCuda)
-
-#else
-
-#ifndef LMP_ATOM_VEC_FULL_CUDA_H
-#define LMP_ATOM_VEC_FULL_CUDA_H
-
-#include "atom_vec_full.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-// Variant of the "full" atom style for the USER-CUDA package.
-// Derives from AtomVecFull and redeclares the communication, border and
-// exchange pack/unpack routines listed below.
-class AtomVecFullCuda : public AtomVecFull {
- public:
-  AtomVecFullCuda(class LAMMPS *);
-  // empty destructor body: nothing is released here
-  // NOTE(review): cleanup of copylist, copylist2 and cu_copylist is not visible in this header - confirm
-  virtual ~AtomVecFullCuda() {}
-  // buffer growth helpers (n is the requested size)
-  void grow_copylist(int n);
-  void grow_send(int n,double** buf_send,int flag);
-  void grow_both(int n);
-  // forward communication, with and without velocities
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  // reverse communication
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
-  // border (ghost atom) communication, with and without velocities
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  // atom migration between processors
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
- private:
-  class Cuda *cuda;          // handle to the package-wide Cuda object
-  bool cuda_init_done;
-  int* copylist;
-  int* copylist2;
-  cCudaData<int, int, xx >* cu_copylist;  // presumably the device-side mirror of copylist - TODO confirm
-  int max_nsend;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/comm_cuda.cpp b/src/USER-CUDA/comm_cuda.cpp
deleted file mode 100644
index a03f873ce2..0000000000
--- a/src/USER-CUDA/comm_cuda.cpp
+++ /dev/null
@@ -1,1375 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author (triclinic) : Pieter in 't Veld (SNL)
-------------------------------------------------------------------------- */
-
-#ifdef LAMMPS_BIGBIG
-#error LAMMPS_BIGBIG not supported by this file
-#endif
-
-#include <mpi.h>
-#include <cmath>
-#include <cstring>
-#include <cstdio>
-#include <cstdlib>
-#include "comm_cuda.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "force.h"
-#include "pair.h"
-#include "domain.h"
-#include "neighbor.h"
-#include "modify.h"
-#include "fix.h"
-#include "group.h"
-#include "compute.h"
-#include "user_cuda.h"
-#include "error.h"
-#include "memory.h"
-#include "comm_cuda_cu.h"
-
-using namespace LAMMPS_NS;
-
-#define BUFFACTOR 1.5
-#define BUFMIN 1000
-#define BUFEXTRA 1000
-
-
-
-#define BIG 1.0e20
-
-enum{SINGLE,MULTI};
-
-/* ----------------------------------------------------------------------
-   setup MPI and allocate buffer space
-------------------------------------------------------------------------- */
-
-CommCuda::CommCuda(LAMMPS *lmp) : CommBrick(lmp)
-{
-  // the /cuda communication class needs the package-wide Cuda object
-  cuda = lmp->cuda;
-  if (!cuda) {
-    error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-  }
-
-  // no device mirrors exist yet; init() creates them on demand
-  cu_sendlist = NULL;
-  cu_pbc = NULL;
-  cu_slablo = NULL;
-  cu_slabhi = NULL;
-  cu_multilo = NULL;
-  cu_multihi = NULL;
-
-  // drop the buffers set up by the CommBrick constructor;
-  // init() re-creates them through grow_send()/grow_recv()
-  memory->sfree(buf_send);
-  buf_send = NULL;
-  memory->sfree(buf_recv);
-  buf_recv = NULL;
-
-  // rebuild the per-swap arrays through this class' allocate_swap()
-  CommBrick::free_swap();
-  allocate_swap(maxswap);
-}
-
-/* ---------------------------------------------------------------------- */
-
-CommCuda::~CommCuda()
-{
-  // Release the cCudaData wrappers this class creates with new in init().
-  // cu_pbc, cu_slablo and cu_slabhi were previously never deleted here and
-  // therefore leaked; deleting a NULL pointer is a no-op, so this is safe
-  // even when init() was never called.
-  delete cu_sendlist;
-  delete cu_pbc;
-  delete cu_slablo;
-  delete cu_slabhi;
-  cu_sendlist=NULL;
-  cu_pbc=NULL;
-  cu_slablo=NULL;
-  cu_slabhi=NULL;
-
-  // The communication buffers are released with the routine matching the
-  // cuda->pinned setting.
-  if(cuda->pinned)
-  {
-    CudaWrapper_FreePinnedHostData((void*)buf_send);
-    CudaWrapper_FreePinnedHostData((void*)buf_recv);
-  }
-  else
-  {
-    memory->sfree(buf_send);
-    memory->sfree(buf_recv);
-  }
-
-  // Clear the pointers so the base class destructor, which runs afterwards,
-  // does not see the already released buffers.
-  buf_send=NULL;
-  buf_recv=NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void CommCuda::init()
-{
-  // (re)create the communication buffers released by the constructor
-  if (!buf_send) {
-    grow_send(maxsend,0);
-  }
-  if (!buf_recv) {
-    grow_recv(maxrecv);
-  }
-
-  // the send list mirror is created only once and its layout is
-  // published through the shared data structure
-  if (!cu_sendlist) {
-    cu_sendlist = new cCudaData<int, int, xy> ((int*)sendlist,maxswap,BUFMIN);
-    cuda->shared_data.comm.sendlist.dev_data = cu_sendlist->dev_data();
-    cuda->shared_data.comm.maxswap = maxswap;
-    cuda->shared_data.comm.maxlistlength = BUFMIN;
-    cu_sendlist->upload();
-  }
-
-  // the pbc and slab mirrors are rebuilt on every call
-  delete cu_pbc;
-  cu_pbc = new cCudaData<int, int, xy> ((int*)pbc,cuda->shared_data.comm.maxswap,6);
-  cu_pbc->upload();
-
-  delete cu_slablo;
-  cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,cuda->shared_data.comm.maxswap);
-  cu_slablo->upload();
-
-  delete cu_slabhi;
-  cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,cuda->shared_data.comm.maxswap);
-  cu_slabhi->upload();
-
-  // publish the device pointers of the freshly built mirrors
-  cuda->shared_data.comm.pbc.dev_data = cu_pbc->dev_data();
-  cuda->shared_data.comm.slablo.dev_data = cu_slablo->dev_data();
-  cuda->shared_data.comm.slabhi.dev_data = cu_slabhi->dev_data();
-
-  CommBrick::init();
-}
-
-/* ----------------------------------------------------------------------
-   setup spatial-decomposition communication patterns
-   function of neighbor cutoff(s) & cutghostuser & current box size
-   single style sets slab boundaries (slablo,slabhi) based on max cutoff
-   multi style sets type-dependent slab boundaries (multilo,multihi)
-------------------------------------------------------------------------- */
-
-void CommCuda::setup()
-{
-  // with neighall set, the ghost cutoff is raised to at least twice the
-  // largest neighbor cutoff before the base class computes the slabs
-  if (cuda->shared_data.pair.neighall) {
-    cutghostuser = MAX(2.0*neighbor->cutneighmax,cutghostuser);
-  }
-
-  CommBrick::setup();
-
-  // push the slab boundaries of the active style to the device,
-  // skipping mirrors that have not been created
-  if (style != SINGLE) {
-    if (cu_multilo) cu_multilo->upload();
-    if (cu_multihi) cu_multihi->upload();
-  } else {
-    if (cu_slablo) cu_slablo->upload();
-    if (cu_slabhi) cu_slabhi->upload();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   forward communication of atom coords every timestep
-   other per-atom attributes may also be sent via pack/unpack routines
-------------------------------------------------------------------------- */
-
-void CommCuda::forward_comm(int mode)
-{
-  // dispatch on the requested phase; any other mode value does nothing
-  switch (mode) {
-    case 0:
-      forward_comm_cuda();            // pack, exchange and unpack in one pass
-      break;
-    case 1:
-      forward_comm_pack_cuda();       // pack phase
-      break;
-    case 2:
-      forward_comm_transfer_cuda();   // transfer phase
-      break;
-    case 3:
-      forward_comm_unpack_cuda();     // unpack phase
-      break;
-    default:
-      break;
-  }
-}
-
-
-// Forward communication done in a single pass: for every swap the data is
-// packed, exchanged via MPI (or handled locally when the partner is this
-// proc) and unpacked. Falls back to CommBrick::forward_comm() when neither
-// comm_x_only is set nor the atom style is cudable.
-void CommCuda::forward_comm_cuda()
-{
-  my_times time1,time2,time3;
-
-  int n;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-
-  // mirror box tilt factors, periodic lengths and the triclinic flag into the shared CUDA data
-  cuda->shared_data.domain.xy=domain->xy;
-  cuda->shared_data.domain.xz=domain->xz;
-  cuda->shared_data.domain.yz=domain->yz;
-  cuda->shared_data.domain.prd[0]=domain->prd[0];
-  cuda->shared_data.domain.prd[1]=domain->prd[1];
-  cuda->shared_data.domain.prd[2]=domain->prd[2];
-  cuda->shared_data.domain.triclinic=domain->triclinic;
-  // host fallback: download everything, run the base class routine, upload again
-  if(not comm_x_only && not avec->cudable)
-  {
-          cuda->downloadAll();
-    CommBrick::forward_comm();
-    cuda->uploadAll();
-    return;
-  }
-
-  // exchange data with another proc
-  // if other proc is self, just copy
-  // if comm_x_only set, exchange or copy directly to x, don't unpack
-
-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me)
-    {
-      if (comm_x_only)
-      {
-
-        int size_forward_recv_now=0;
-
-        // MPI counts are in doubles: rescale the receive count when X_CFLOAT has a different size, to save transfer volume
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to save some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
-        else
-          size_forward_recv_now=size_forward_recv[iswap];
-my_gettime(CLOCK_REALTIME,&time1);
-
-        // post the receive first, then pack into buf_send through the CUDA wrapper
-        MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
-                 recvproc[iswap],0,world,&request);
-        n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) buf_send,pbc[iswap],pbc_flag[iswap]);
-
-my_gettime(CLOCK_REALTIME,&time2);
-
-        // same rescaling for the send count
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && n) //some complicated way to save some transfer size if single precision is used
-          n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
-
-                //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now,n);
-            MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-my_gettime(CLOCK_REALTIME,&time3);
-// "upper" covers Irecv + pack + send/wait (time1..time3), "lower" only send/wait (time2..time3)
-cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
-      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
-cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
-      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
-
-        Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)buf_recv,iswap); //Unpack for cpu exchange happens implicitly since buf==x[firstrecv]
-
-      }
-      else if (ghost_velocity)
-      {
-            MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-
-        // cudable atom styles receive a pointer to the swap index instead of the send list
-        if(avec->cudable)
-          n = avec->pack_comm_vel(sendnum[iswap],&iswap,
-                           buf_send,pbc_flag[iswap],pbc[iswap]);
-        else
-              n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
-                                buf_send,pbc_flag[iswap],pbc[iswap]);
-
-            MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-            avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_recv);
-      }
-      else
-      {
-            MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-
-        // cudable atom styles receive a pointer to the swap index instead of the send list
-        if(avec->cudable)
-          n = avec->pack_comm(sendnum[iswap],&iswap,
-                           buf_send,pbc_flag[iswap],pbc[iswap]);
-        else
-              n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-
-            MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-            avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
-      }
-
-    }
-    else  //sendproc == me
-    {
-      // flag the self-communication case for the duration of the pack calls
-      cuda->self_comm=1;
-      if (comm_x_only)
-      {
-            if (sendnum[iswap])
-                {
-          n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
-          // a negative return value is treated as a CUDA failure
-          if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-          if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
-            n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
-                }
-      }
-      else if (ghost_velocity)
-      {
-                // NOTE(review): the firstrecv array is passed in the buffer slot (cast to double*) and no unpack follows;
-                // presumably the cudable pack routine interprets it while self_comm is set - confirm
-                n = avec->pack_comm_vel(sendnum[iswap],&iswap,
-                                (double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
-            //avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
-      }
-      else
-      {
-                // same convention as the velocity branch above
-                n = avec->pack_comm(sendnum[iswap],&iswap,
-                            (double*) firstrecv,pbc_flag[iswap],pbc[iswap]);
-                //avec->unpack_comm(recvnum[iswap],firstrecv[iswap],(double*) firstrecv);
-      }
-      cuda->self_comm=0;
-    }
-  }
-}
-
-// Pack phase of the split forward communication (mode 1 of forward_comm).
-// For comm_x_only swaps with a remote partner the positions are packed
-// through the CUDA wrapper and the resulting size (in doubles) is stored in
-// cuda->shared_data.comm.send_size[iswap]. All other cases are handled
-// completely here via the atom style pack/unpack routines.
-void CommCuda::forward_comm_pack_cuda()
-{
-  my_times time1,time2;
-  int n = 0;  // number of packed values of the current swap
-
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-
-  // mirror box tilt factors, periodic lengths and the triclinic flag into the shared CUDA data
-  cuda->shared_data.domain.xy=domain->xy;
-  cuda->shared_data.domain.xz=domain->xz;
-  cuda->shared_data.domain.yz=domain->yz;
-  cuda->shared_data.domain.prd[0]=domain->prd[0];
-  cuda->shared_data.domain.prd[1]=domain->prd[1];
-  cuda->shared_data.domain.prd[2]=domain->prd[2];
-  cuda->shared_data.domain.triclinic=domain->triclinic;
-  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used
-
-  // exchange data with another proc
-  // if other proc is self, just copy
-  // if comm_x_only set, exchange or copy directly to x, don't unpack
-
-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me)
-    {
-      if (comm_x_only)
-      {
-        my_gettime(CLOCK_REALTIME,&time1);
-
-        //  n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*) cuda->shared_data.comm.buf_send[iswap],pbc[iswap],pbc_flag[iswap]);
-        n = Cuda_CommCuda_PackComm(&cuda->shared_data,sendnum[iswap],iswap,(void*)buf_send,pbc[iswap],pbc_flag[iswap]);
-
-        my_gettime(CLOCK_REALTIME,&time2);
-
-        // MPI counts are in doubles: rescale when X_CFLOAT has a different size, to save transfer volume
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
-          n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
-        cuda->shared_data.comm.send_size[iswap]=n;
-      }
-      else if (ghost_velocity)
-      {
-        my_gettime(CLOCK_REALTIME,&time1);
-
-        // The device-side velocity pack is disabled (call kept below for reference),
-        // so nothing is packed in this branch. n used to be read here without being
-        // assigned (uninitialized, or stale from an earlier swap); record an explicit
-        // empty message instead.
-        // n = Cuda_CommCuda_PackComm_Vel(&cuda->shared_data,sendnum[iswap],iswap,(void*) &buf_send[iswap*maxsend],pbc[iswap],pbc_flag[iswap]);
-        n = 0;
-
-        my_gettime(CLOCK_REALTIME,&time2);
-
-        cuda->shared_data.comm.send_size[iswap]=n;
-      }
-      else
-      {
-        MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-
-        // NOTE(review): data is packed into cuda->shared_data.comm.buf_send[iswap]
-        // but buf_send is what gets sent below - confirm both refer to the same memory
-        if(avec->cudable)
-          n = avec->pack_comm(sendnum[iswap],&iswap,
-                              cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
-        else
-          n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                              cuda->shared_data.comm.buf_send[iswap],pbc_flag[iswap],pbc[iswap]);
-
-        MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
-      }
-
-    }
-    else  //sendproc == me
-    {
-      if (comm_x_only)
-      {
-        if (sendnum[iswap])
-        {
-          n = Cuda_CommCuda_PackComm_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap],pbc[iswap],pbc_flag[iswap]);
-          // a negative return value is treated as a CUDA failure
-          if(n<0) error->all(FLERR," # CUDA ERRROR on PackComm_Self");
-          if((sizeof(X_CFLOAT)!=sizeof(double)) && n)
-            n=(n+1)*sizeof(X_CFLOAT)/sizeof(double);
-        }
-      }
-      else if (ghost_velocity)
-      {
-        // self swap: pack into buf_send and unpack straight from it
-        n = avec->pack_comm_vel(sendnum[iswap],sendlist[iswap],
-                                buf_send,pbc_flag[iswap],pbc[iswap]);
-        avec->unpack_comm_vel(recvnum[iswap],firstrecv[iswap],buf_send);
-      }
-      else
-      {
-        // self swap: pack into buf_send and unpack straight from it
-        n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-        avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
-      }
-    }
-  }
-  if(not comm_x_only && not avec->cudable) cuda->uploadAll();
-}
-
-// Transfer phase of the split forward communication (mode 2 of forward_comm).
-// For comm_x_only swaps with a remote partner: download the packed send data
-// from the device buffer, exchange it via MPI and upload the received data
-// into the device receive buffer. The ghost_velocity branch is disabled
-// (commented out); the generic branch performs a full pack/send/unpack.
-void CommCuda::forward_comm_transfer_cuda()
-{
-  my_times time1,time2,time3;
-  int n;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-  // mirror box tilt factors, periodic lengths and the triclinic flag into the shared CUDA data
-  cuda->shared_data.domain.xy=domain->xy;
-  cuda->shared_data.domain.xz=domain->xz;
-  cuda->shared_data.domain.yz=domain->yz;
-  cuda->shared_data.domain.prd[0]=domain->prd[0];
-  cuda->shared_data.domain.prd[1]=domain->prd[1];
-  cuda->shared_data.domain.prd[2]=domain->prd[2];
-  cuda->shared_data.domain.triclinic=domain->triclinic;
-  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used
-//printf("A\n");
-  // exchange data with another proc
-  // if other proc is self, just copy
-  // if comm_x_only set, exchange or copy directly to x, don't unpack
-
-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me)
-    {
-      if (comm_x_only)
-      {
-
-        int size_forward_recv_now=0;
-
-        // MPI counts are in doubles: rescale the receive count when X_CFLOAT has a different size, to save transfer volume
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to save some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
-        else
-          size_forward_recv_now=size_forward_recv[iswap];
-
-        //printf("A: %i \n",size_forward_recv_now/1024*4);
-        //MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
-        //         recvproc[iswap],0,world,&request);
-        MPI_Irecv(buf_recv,size_forward_recv_now,MPI_DOUBLE,
-                 recvproc[iswap],0,world,&request);
-                //printf("%p %p %i\n",buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double));
-        //memcpy(buf_send,cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap]*sizeof(double));
-        //        CudaWrapper_SyncStream(1);
-        //printf("B: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
-                // start the device-to-host copy of the packed data (size recorded in send_size[iswap]) on stream 2
-                CudaWrapper_DownloadCudaDataAsync((void*) buf_send, cuda->shared_data.comm.buf_send_dev[iswap], cuda->shared_data.comm.send_size[iswap]*sizeof(double),2);
-            //MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-my_gettime(CLOCK_REALTIME,&time1);
-        // wait for the download before handing buf_send to MPI
-        CudaWrapper_SyncStream(2);
-        //printf("C: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
-my_gettime(CLOCK_REALTIME,&time2);
-cuda->shared_data.cuda_timings.comm_forward_download+=
-      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-            MPI_Send(buf_send,cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-        //printf("D: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
-                // start the host-to-device copy of the received data on stream 2
-                CudaWrapper_UploadCudaDataAsync((void*) buf_recv,cuda->shared_data.comm.buf_recv_dev[iswap], size_forward_recv_now*sizeof(double),2);
-my_gettime(CLOCK_REALTIME,&time1);
-        CudaWrapper_SyncStream(2);
-        //printf("E: %i \n",cuda->shared_data.comm.send_size[iswap]/1024*4);
-        //memcpy(cuda->shared_data.comm.buf_recv[iswap],buf_recv,size_forward_recv_now*sizeof(double));
-                 //printf("RecvSize: %i SendSize: %i\n",size_forward_recv_now*sizeof(double),cuda->shared_data.comm.send_size[iswap]*sizeof(double));
-my_gettime(CLOCK_REALTIME,&time3);
-// NOTE(review): time1 was re-taken just before the upload sync, so the "mpi_upper" value below
-// spans only the upload sync, while "mpi_lower" (time2..time3) also includes the upload - confirm intent
-cuda->shared_data.cuda_timings.comm_forward_upload+=
-      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
-cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
-      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;
-my_gettime(CLOCK_REALTIME,&time3);
-cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
-      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
-      }
-      else if (ghost_velocity)
-      {
-        // velocity transfer is disabled: the whole branch body is commented out
- /*       int size_forward_recv_now=0;
-
-        if((sizeof(X_CFLOAT)!=sizeof(double)) && size_forward_recv[iswap]) //some complicated way to safe some transfer size if single precision is used
-          size_forward_recv_now=(size_forward_recv[iswap]+1)*sizeof(X_CFLOAT)/sizeof(double);
-        else
-          size_forward_recv_now=size_forward_recv[iswap];
-
-my_gettime(CLOCK_REALTIME,&time1);
-
-        MPI_Irecv(cuda->shared_data.comm.buf_recv[iswap],size_forward_recv_now,MPI_DOUBLE,
-                 recvproc[iswap],0,world,&request);
-
-my_gettime(CLOCK_REALTIME,&time2);
-
-            MPI_Send(cuda->shared_data.comm.buf_send[iswap],cuda->shared_data.comm.send_size[iswap],MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-my_gettime(CLOCK_REALTIME,&time3);
-cuda->shared_data.cuda_timings.comm_forward_mpi_upper+=
-      time3.tv_sec-time1.tv_sec+1.0*(time3.tv_nsec-time1.tv_nsec)/1000000000;
-cuda->shared_data.cuda_timings.comm_forward_mpi_lower+=
-      time3.tv_sec-time2.tv_sec+1.0*(time3.tv_nsec-time2.tv_nsec)/1000000000;*/
-
-       }
-      else
-      {
-            // generic path: complete pack/send/receive/unpack via the atom style
-            MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-
-        if(avec->cudable)
-          n = avec->pack_comm(sendnum[iswap],&iswap,
-                           buf_send,pbc_flag[iswap],pbc[iswap]);
-        else
-              n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-
-            MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-            avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
-      }
-
-    }
-    else  //sendproc == me
-    {
-      // self swaps: nothing to transfer for comm_x_only and ghost_velocity
-      if (comm_x_only)
-      {
-            if (sendnum[iswap])
-                {
-                }
-      }
-      else if (ghost_velocity)
-      {
-      }
-      else
-      {
-                n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-                avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
-      }
-    }
-  }
-  if(not comm_x_only && not avec->cudable) cuda->uploadAll();
-}
-
-// Unpack phase of the split forward communication (mode 3 of forward_comm).
-// For comm_x_only swaps with a remote partner the received buffer is
-// unpacked through the CUDA wrapper. The ghost_velocity branch is disabled;
-// the generic branch performs a full pack/send/receive/unpack.
-void CommCuda::forward_comm_unpack_cuda()
-{
-  int n;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-
-  // mirror box tilt factors, periodic lengths and the triclinic flag into the shared CUDA data
-  cuda->shared_data.domain.xy=domain->xy;
-  cuda->shared_data.domain.xz=domain->xz;
-  cuda->shared_data.domain.yz=domain->yz;
-  cuda->shared_data.domain.prd[0]=domain->prd[0];
-  cuda->shared_data.domain.prd[1]=domain->prd[1];
-  cuda->shared_data.domain.prd[2]=domain->prd[2];
-  cuda->shared_data.domain.triclinic=domain->triclinic;
-  if(not comm_x_only && not avec->cudable) cuda->downloadAll();  //if not comm_x_only the communication routine of the atom_vec style class is used
-
-  // exchange data with another proc
-  // if other proc is self, just copy
-  // if comm_x_only set, exchange or copy directly to x, don't unpack
-
-  for (int iswap = 0; iswap < nswap; iswap++) {
-    if (sendproc[iswap] != me)
-    {
-      if (comm_x_only)
-      {
-
-        //Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],cuda->shared_data.comm.buf_recv[iswap],iswap); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
-        Cuda_CommCuda_UnpackComm(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],buf_recv,iswap); //Unpack for cpu exchange happens implicitly since buf==x[firstrecv]
-
-      }
-      else if (ghost_velocity)
-      {
-        // velocity unpack is disabled: only the commented-out call remains
-        //Cuda_CommCuda_UnpackComm_Vel(&cuda->shared_data,recvnum[iswap],firstrecv[iswap],(void*)&buf_recv[iswap*maxrecv]); //Unpack for cpu exchange happens implicitely since buf==x[firstrecv]
-      }
-      else
-      {
-            // generic path: complete pack/send/receive/unpack via the atom style
-            MPI_Irecv(buf_recv,size_forward_recv[iswap],MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-
-        if(avec->cudable)
-          n = avec->pack_comm(sendnum[iswap],&iswap,
-                           buf_send,pbc_flag[iswap],pbc[iswap]);
-        else
-              n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-
-            MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-            MPI_Wait(&request,MPI_STATUS_IGNORE);
-            avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_recv);
-      }
-
-    }
-    else  //sendproc == me
-    {
-      // self swaps: nothing to unpack for comm_x_only and ghost_velocity
-      if (comm_x_only)
-      {
-            if (sendnum[iswap])
-                {
-                }
-      }
-      else if (ghost_velocity)
-      {
-      }
-      else
-      {
-                n = avec->pack_comm(sendnum[iswap],sendlist[iswap],
-                            buf_send,pbc_flag[iswap],pbc[iswap]);
-                avec->unpack_comm(recvnum[iswap],firstrecv[iswap],buf_send);
-      }
-    }
-  }
-  if(not comm_x_only && not avec->cudable) cuda->uploadAll();
-}
-
-void CommCuda::forward_comm_pair(Pair *pair)
-{
-  // pair styles without a CUDA force routine use the plain CommBrick path
-  if (!cuda->shared_data.pair.cudable_force) {
-    CommBrick::forward_comm_pair(pair);
-    return;
-  }
-
-  MPI_Request req;
-  int per_atom = pair->comm_forward;
-  int iswap;
-
-  for (iswap = 0; iswap < nswap; iswap++) {
-
-    // pack: the pair style gets a pointer to the swap index, not a send list
-    int packed = pair->pack_forward_comm(sendnum[iswap],&iswap,
-                                         buf_send,pbc_flag[iswap],pbc[iswap]);
-
-    // negative counts are converted with -(count+1)/2
-    int recv_count = recvnum[iswap]*per_atom;
-    if (recv_count < 0) recv_count = -(recv_count+1)/2;
-    int send_count = packed;
-    if (send_count < 0) send_count = -(send_count+1)/2;
-
-    // exchange with the partner proc; for a self swap the send buffer
-    // doubles as the receive buffer
-    double *incoming;
-    if (sendproc[iswap] == me) {
-      incoming = buf_send;
-    } else {
-      MPI_Irecv(buf_recv,recv_count,MPI_DOUBLE,recvproc[iswap],0,
-                world,&req);
-      MPI_Send(buf_send,send_count,MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&req,MPI_STATUS_IGNORE);
-      incoming = buf_recv;
-    }
-
-    // unpack
-    pair->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],incoming);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   reverse communication of forces on atoms every timestep
-   other per-atom attributes may also be sent via pack/unpack routines
-------------------------------------------------------------------------- */
-
-// Reverse communication: swaps are processed in reverse order and the roles
-// of sendproc/recvproc are exchanged (receive from sendproc, send to
-// recvproc). With comm_f_only set the data is packed/unpacked through the
-// CUDA wrappers, otherwise the atom style routines are used on the host.
-void CommCuda::reverse_comm()
-{
-  int n;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-  double *buf;
-
-  if(not comm_f_only && not avec->cudable) cuda->downloadAll();  //not yet implemented in CUDA but only needed for non standard atom styles
-
-  // exchange data with another proc
-  // if other proc is self, just copy
-  // if comm_f_only set, exchange or copy directly from f, don't pack
-
-  for (int iswap = nswap-1; iswap >= 0; iswap--) {
-    if (sendproc[iswap] != me) {
-      if (comm_f_only) {
-
-    // MPI counts are in doubles: rescale the receive count when F_CFLOAT has a different size
-    int size_recv_now=size_reverse_recv[iswap];
-        if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_recv[iswap])
-          size_recv_now=(size_recv_now+1)*sizeof(F_CFLOAT)/sizeof(double);
-        MPI_Irecv(buf_recv,size_recv_now,MPI_DOUBLE,
-                  sendproc[iswap],0,world,&request);
-
-    // pack only when there is something to send; otherwise a NULL buffer is sent with count 0
-    buf=buf_send;
-    if (size_reverse_send[iswap])
-    {
-      // the atom count is the send size divided by 3 (presumably three force components per atom - confirm)
-      Cuda_CommCuda_PackReverse(&cuda->shared_data,size_reverse_send[iswap]/3,firstrecv[iswap],buf);
-    }
-    else buf=NULL;
-    // same rescaling for the send count
-    int size_reverse_send_now=size_reverse_send[iswap];
-        if((sizeof(F_CFLOAT)!=sizeof(double))&& size_reverse_send[iswap])
-          size_reverse_send_now=(size_reverse_send_now+1)*sizeof(F_CFLOAT)/sizeof(double);
-        MPI_Send(buf,size_reverse_send_now,MPI_DOUBLE,
-                 recvproc[iswap],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        Cuda_CommCuda_UnpackReverse(&cuda->shared_data,sendnum[iswap],iswap,buf_recv);
-
-      } else {
-        // host path through the atom style routines
-        MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
-                  sendproc[iswap],0,world,&request);
-        n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
-        MPI_Send(buf_send,n,MPI_DOUBLE,recvproc[iswap],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-      avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_recv);
-      }
-
-    } else {
-      // partner is this proc: no MPI traffic
-      if (comm_f_only) {
-        if (sendnum[iswap])
-              Cuda_CommCuda_UnpackReverse_Self(&cuda->shared_data,sendnum[iswap],iswap,firstrecv[iswap]);
-      } else {
-        n = avec->pack_reverse(recvnum[iswap],firstrecv[iswap],buf_send);
-        avec->unpack_reverse(sendnum[iswap],sendlist[iswap],buf_send);
-      }
-    }
-  }
-  if(not comm_f_only && not avec->cudable) cuda->uploadAll();  //not yet implemented in CUDA but only needed for non standard atom styles
-}
-
-/* ----------------------------------------------------------------------
-   exchange: move atoms to correct processors
-   atoms exchanged with all 6 stencil neighbors
-   send out atoms that have left my box, receive ones entering my box
-   atoms will be lost if not inside some proc's box
-     can happen if atom moves outside of non-periodic boundary
-     or if atom moves more than one proc away
-   this routine called before every reneighboring
-   for triclinic, atoms must be in lamda coords (0-1) before exchange is called
-------------------------------------------------------------------------- */
-
-void CommCuda::exchange()
-{
-  // data already on the host: plain CommBrick exchange
-  if (cuda->oncpu) {
-    CommBrick::exchange();
-    return;
-  }
-
-  // cudable atom styles use the dedicated CUDA exchange
-  if (atom->avec->cudable) {
-    exchange_cuda();
-    return;
-  }
-
-  // otherwise fetch everything from the device before the host exchange
-  cuda->downloadAll();
-  CommBrick::exchange();
-}
-
-
-// Exchange variant used for cudable atom styles: for each of the three
-// dimensions the atom style packs leaving atoms, the buffers are exchanged
-// with the neighbor procs in that dimension, and the atom style unpacks the
-// received data.
-void CommCuda::exchange_cuda()
-{
-  int nsend,nrecv,nrecv1,nrecv2,nlocal;
-  double *buf;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-  my_times time1,time2;
-
-  // clear global->local map for owned and ghost atoms
-  // b/c atoms migrate to new procs in exchange() and
-  // new ghosts are created in borders()
-  // map_set() is done at end of borders()
-
-
-  // the tag array is downloaded from the device before the map is cleared
-  if(map_style) cuda->cu_tag->download();
-
-  if (map_style) atom->map_clear();
-
-  // loop over dimensions
-
-  for (int dim = 0; dim < 3; dim++) {
-    // fill buffer with atoms leaving my box, using < and >=
-    // when atom is deleted, fill it in with last atom
-
-    cuda->shared_data.exchange_dim=dim;
-
-    nlocal = atom->nlocal;
-    // the atom style gets access to maxsend and to the address of the buf_send pointer
-    // NOTE(review): &buf_send is cast to double*; presumably so the callee can re-allocate the buffer - confirm
-    avec->maxsend=&maxsend;
-    nsend=avec->pack_exchange(dim,(double*) &buf_send);
-    nlocal = atom->nlocal;
-
-
-    atom->nlocal = nlocal;
-
-    // send/recv atoms in both directions
-    // if 1 proc in dimension, no send/recv, set recv buf to send buf
-    // if 2 procs in dimension, single send/recv
-    // if more than 2 procs in dimension, send/recv to both neighbors
-
- my_gettime(CLOCK_REALTIME,&time1);
-
-    if (procgrid[dim] == 1) {
-      nrecv = nsend;
-      buf = buf_send;
-
-    } else {
-      // exchange the message sizes first
-      MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
-                   &nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
-      nrecv = nrecv1;
-      if (procgrid[dim] > 2) {
-        MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
-                     &nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
-        nrecv += nrecv2;
-      }
-      // one extra element is reserved beyond nrecv (written below)
-      if (nrecv+1 > maxrecv) grow_recv(nrecv+1);
-
-      MPI_Irecv(buf_recv,nrecv1,MPI_DOUBLE,procneigh[dim][1],0,
-                world,&request);
-      MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][0],0,world);
-      MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-      if (procgrid[dim] > 2) {
-        // second message is appended directly behind the first one
-        MPI_Irecv(&buf_recv[nrecv1],nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
-                  world,&request);
-        MPI_Send(buf_send,nsend,MPI_DOUBLE,procneigh[dim][1],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-
-            // a zero is written behind the data when either message was empty
-            // (presumably a terminator for the unpack routine - confirm)
-            if((nrecv1==0)||(nrecv2==0)) buf_recv[nrecv]=0;
-      }
-
-      buf = buf_recv;
-    }
-        //printf("nsend: %i nrecv: %i\n",nsend,nrecv);
-    // check incoming atoms to see if they are in my box
-    // if so, add to my list
-my_gettime(CLOCK_REALTIME,&time2);
-cuda->shared_data.cuda_timings.comm_exchange_mpi+=
-      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-    if(nrecv)
-    {
-      avec->maxsend=&maxsend;
-      avec->unpack_exchange(buf);
-    }
-  }
-
-  // reordering for a "first" group is done on the host: download, reorder, upload
-  if(atom->firstgroupname) cuda->downloadAll();
-
-  if(atom->firstgroupname) atom->first_reorder();
-
-  if(atom->firstgroupname) cuda->uploadAll();
-}
-
-/* ----------------------------------------------------------------------
-   borders: list nearby atoms to send to neighboring procs at every timestep
-   one list is created for every swap that will be made
-   as list is made, actually do swaps
-   this does equivalent of a communicate (so don't need to explicitly
-     call communicate routine on reneighboring timestep)
-   this routine is called before every reneighboring
-   for triclinic, atoms must be in lamda coords (0-1) before borders is called
-------------------------------------------------------------------------- */
-
-
-void CommCuda::borders()
-{
-  AtomVec *avec = atom->avec;
-  if(not cuda->oncpu && avec->cudable)
-  {
-          if(cuda->shared_data.overlap_comm&&cuda->finished_setup)
-             borders_cuda_overlap_forward_comm();
-           else
-             borders_cuda();
-
-           return;
-  }
-
-  CommBrick::borders();
-
-  cuda->setSystemParams();
-  if(cuda->finished_setup) {cuda->checkResize(); cuda->uploadAll();}
-  cuda->shared_data.atom.nghost=atom->nghost;
-  cu_sendlist->upload();
-}
-
-void CommCuda::borders_cuda()
-{
-  int n,iswap,dim,ineed,twoneed,smax,rmax;
-  int nsend,nrecv,nfirst,nlast;
-  double *buf;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-  my_times time1,time2;
-
-  // clear old ghosts
-
-  atom->nghost = 0;
-
-  // do swaps over all 3 dimensions
-
-  iswap = 0;
-  smax = rmax = 0;
-
-  cuda->shared_data.comm.nsend=0;
-  for (dim = 0; dim < 3; dim++) {
-    nlast = 0;
-    twoneed = 2*maxneed[dim];
-    for (ineed = 0; ineed < twoneed; ineed++) {
-
-      // find atoms within slab boundaries lo/hi using <= and >=
-      // check atoms between nfirst and nlast
-      //   for first swaps in a dim, check owned and ghost
-      //   for later swaps in a dim, only check newly arrived ghosts
-      // store sent atom indices in list for use in future timesteps
-
-      if (ineed % 2 == 0) {
-        nfirst = nlast;
-        nlast = atom->nlocal + atom->nghost;
-      }
-
-      nsend = 0;
-
-      // find send atoms according to SINGLE vs MULTI
-      // all atoms eligible versus atoms in bordergroup
-      // only need to limit loop to bordergroup for first sends (ineed < 2)
-      // on these sends, break loop in two: owned (in group) and ghost
-     do
-     {
-       if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
-               nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
-     }while(nsend>=maxsendlist[iswap]);
-      // pack up list of border atoms
-
-      if (nsend*size_border > maxsend)
-        grow_send(nsend*size_border,0);
-
-      if (ghost_velocity)
-        n = avec->pack_border_vel(nsend,&iswap,buf_send,
-                           pbc_flag[iswap],pbc[iswap]);
-      else
-        n = avec->pack_border(nsend,&iswap,buf_send,
-                           pbc_flag[iswap],pbc[iswap]);
-
-      // swap atoms with other proc
-      // put incoming ghosts at end of my atom arrays
-      // if swapping with self, simply copy, no messages
-
-my_gettime(CLOCK_REALTIME,&time1);
-      if (sendproc[iswap] != me) {
-        MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
-        if (nrecv*size_border > maxrecv)
-          grow_recv(nrecv*size_border);
-        MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-        MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        buf = buf_recv;
-      } else {
-        nrecv = nsend;
-        buf = buf_send;
-      }
-
-my_gettime(CLOCK_REALTIME,&time2);
-cuda->shared_data.cuda_timings.comm_border_mpi+=
-      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-      // unpack buffer
-
-      if (ghost_velocity)
-        avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
-      else
-        avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);
-
-      // set all pointers & counters
-
-      smax = MAX(smax,nsend);
-      rmax = MAX(rmax,nrecv);
-      sendnum[iswap] = nsend;
-      recvnum[iswap] = nrecv;
-      size_forward_recv[iswap] = nrecv*size_forward;
-      size_reverse_send[iswap] = nrecv*size_reverse;
-      size_reverse_recv[iswap] = nsend*size_reverse;
-      firstrecv[iswap] = atom->nlocal + atom->nghost;
-      atom->nghost += nrecv;
-      iswap++;
-    }
-  }
-
-  // insure send/recv buffers are long enough for all forward & reverse comm
-
-  int max = MAX(maxforward*smax,maxreverse*rmax);
-  if (max > maxsend) grow_send(max,0);
-  max = MAX(maxforward*rmax,maxreverse*smax);
-  if (max > maxrecv) grow_recv(max);
-
-  // reset global->local map
-  if(map_style)
-  {
-          cuda->cu_tag->download();
-         atom->map_set();
-  }
-
-  cuda->setSystemParams();
-  cuda->shared_data.atom.nghost+=n;
-}
-
-void CommCuda::borders_cuda_overlap_forward_comm()
-{
-  int n,iswap,dim,ineed,twoneed,smax,rmax;
-  int nsend,nrecv,nfirst,nlast;
-  double *buf;
-  MPI_Request request;
-  AtomVec *avec = atom->avec;
-  my_times time1,time2;
-
-  // clear old ghosts
-
-  atom->nghost = 0;
-
-  // do swaps over all 3 dimensions
-
-  iswap = 0;
-  smax = rmax = 0;
-
-  cuda->shared_data.comm.nsend=0;
-  for (dim = 0; dim < 3; dim++) {
-    nlast = 0;
-    twoneed = 2*maxneed[dim];
-    for (ineed = 0; ineed < twoneed; ineed++) {
-
-      // find atoms within slab boundaries lo/hi using <= and >=
-      // check atoms between nfirst and nlast
-      //   for first swaps in a dim, check owned and ghost
-      //   for later swaps in a dim, only check newly arrived ghosts
-      // store sent atom indices in list for use in future timesteps
-
-      if (ineed % 2 == 0) {
-        nfirst = nlast;
-        nlast = atom->nlocal + atom->nghost;
-      }
-
-      nsend = 0;
-
-      // find send atoms according to SINGLE vs MULTI
-      // all atoms eligible versus atoms in bordergroup
-      // only need to limit loop to bordergroup for first sends (ineed < 2)
-      // on these sends, break loop in two: owned (in group) and ghost
-     do
-     {
-       if(nsend>=maxsendlist[iswap]) grow_list(iswap,static_cast <int> (nsend*1.05));
-               nsend=Cuda_CommCuda_BuildSendlist(&cuda->shared_data,bordergroup,ineed,style==SINGLE?1:0,atom->nfirst,nfirst,nlast,dim,iswap);
-     }while(nsend>=maxsendlist[iswap]);
-         cuda->shared_data.comm.nsend_swap[iswap]=nsend;
-          // pack up list of border atoms
-
-      if (nsend*size_border > maxsend)
-        grow_send(nsend*size_border,0);
-
-      if (ghost_velocity)
-        n = avec->pack_border_vel(nsend,&iswap,buf_send,
-                           pbc_flag[iswap],pbc[iswap]);
-      else
-        n = avec->pack_border(nsend,&iswap,buf_send,
-                           pbc_flag[iswap],pbc[iswap]);
-
-      // swap atoms with other proc
-      // put incoming ghosts at end of my atom arrays
-      // if swapping with self, simply copy, no messages
-
-my_gettime(CLOCK_REALTIME,&time1);
-      if (sendproc[iswap] != me) {
-        MPI_Sendrecv(&nsend,1,MPI_INT,sendproc[iswap],0,
-                     &nrecv,1,MPI_INT,recvproc[iswap],0,world,MPI_STATUS_IGNORE);
-        if (nrecv*size_border > maxrecv)
-          grow_recv(nrecv*size_border);
-        MPI_Irecv(buf_recv,nrecv*size_border,MPI_DOUBLE,
-                  recvproc[iswap],0,world,&request);
-        MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-        MPI_Wait(&request,MPI_STATUS_IGNORE);
-        buf = buf_recv;
-      } else {
-        nrecv = nsend;
-        buf = buf_send;
-      }
-
-my_gettime(CLOCK_REALTIME,&time2);
-cuda->shared_data.cuda_timings.comm_border_mpi+=
-      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
-
-      // unpack buffer
-
-      if (ghost_velocity)
-        avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf);
-      else
-        avec->unpack_border(nrecv,atom->nlocal+atom->nghost,buf);
-
-      // set all pointers & counters
-
-      smax = MAX(smax,nsend);
-      rmax = MAX(rmax,nrecv);
-      sendnum[iswap] = nsend;
-      recvnum[iswap] = nrecv;
-      size_forward_recv[iswap] = nrecv*size_forward;
-      size_reverse_send[iswap] = nrecv*size_reverse;
-      size_reverse_recv[iswap] = nsend*size_reverse;
-      firstrecv[iswap] = atom->nlocal + atom->nghost;
-      atom->nghost += nrecv;
-      iswap++;
-    }
-  }
-
-  // insure send/recv buffers are long enough for all forward & reverse comm
-
-  int max = MAX(maxforward*smax,maxreverse*rmax);
-  if (max > maxsend) grow_send(max,0);
-  max = MAX(maxforward*rmax,maxreverse*smax);
-  if (max > maxrecv) grow_recv(max);
-
-  // reset global->local map
-  if(map_style)
-  {
-          cuda->cu_tag->download();
-         atom->map_set();
-  }
-
-  cuda->setSystemParams();
-  cuda->shared_data.atom.nghost+=n;
-}
-
-
-
-
-void CommCuda::forward_comm_fix(Fix *fix, int size)
-{
-  int iswap,n;
-  double *buf;
-  MPI_Request request;
-
-  int nsize = fix->comm_forward;
-
-  for (iswap = 0; iswap < nswap; iswap++) {
-    // pack buffer
-    if(fix->cudable_comm&&cuda->finished_setup)
-    {
-            int swap=iswap;
-        if(sendproc[iswap] == me) {swap=-iswap-1; buf=(double*)&(firstrecv[iswap]);}
-        else buf=buf_send;
-
-        n = fix->pack_forward_comm(sendnum[iswap],&swap,
-                                   buf,pbc_flag[iswap],pbc[iswap]);
-        if(sendproc[iswap] == me)
-        {
-                continue;
-        }
-    }
-    else
-      n = fix->pack_forward_comm(sendnum[iswap],sendlist[iswap],
-                                 buf_send,pbc_flag[iswap],pbc[iswap]);
-
-     // exchange with another proc
-    // if self, set recv buffer to send buffer
-
-    if (sendproc[iswap] != me) {
-      MPI_Irecv(buf_recv,nsize*recvnum[iswap],MPI_DOUBLE,recvproc[iswap],0,
-                world,&request);
-      MPI_Send(buf_send,n,MPI_DOUBLE,sendproc[iswap],0,world);
-      MPI_Wait(&request,MPI_STATUS_IGNORE);
-      buf = buf_recv;
-    } else buf = buf_send;
-
-    // unpack buffer
-
-    fix->unpack_forward_comm(recvnum[iswap],firstrecv[iswap],buf);
-  }
-}
-
-
-void CommCuda::grow_send(int n, int flag)
-{
-  int oldmaxsend = (maxsend+BUFEXTRA)*sizeof(double);
-  maxsend = static_cast<int> (BUFFACTOR * n);
-  if (flag){
-    if(cuda->pinned)
-    {
-      double* tmp = new double[oldmaxsend];
-      memcpy((void*) tmp,(void*) buf_send,oldmaxsend*sizeof(double));
-      if(buf_send) CudaWrapper_FreePinnedHostData((void*) (buf_send));
-      buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false);
-      memcpy(buf_send,tmp,oldmaxsend*sizeof(double));
-      delete [] tmp;
-    }
-    else
-    {
-    buf_send = (double *)
-      memory->srealloc(buf_send,(maxsend+BUFEXTRA)*sizeof(double),
-                       "comm:buf_send");printf("srealloc\n");
-    }
-  }
-  else {
-    if(cuda->pinned)
-    {
-      if(buf_send) CudaWrapper_FreePinnedHostData((void*) buf_send);
-      buf_send = (double*) CudaWrapper_AllocPinnedHostData((maxsend+BUFEXTRA)*sizeof(double),false);
-    }
-    else
-    {
-      memory->sfree(buf_send);
-      buf_send = (double *) memory->smalloc((maxsend+BUFEXTRA)*sizeof(double),
-                                          "comm:buf_send");
-    }
-    for(int i=0;i<maxswap;i++)
-    {
-      if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],oldmaxsend);
-      cuda->shared_data.comm.buf_send_dev[i]=CudaWrapper_AllocCudaData((maxsend+BUFEXTRA)*sizeof(double));
-    }
-  }
-}
-/* ----------------------------------------------------------------------
-   free/malloc the size of the recv buffer as needed with BUFFACTOR
-------------------------------------------------------------------------- */
-
-
-void CommCuda::grow_recv(int n)
-{
-  int oldmaxrecv = maxrecv*sizeof(double);
-  maxrecv = static_cast<int> (BUFFACTOR * n);
-  if(cuda->pinned)
-  {
-    if(buf_recv) CudaWrapper_FreePinnedHostData((void*)buf_recv);
-    buf_recv = (double*) CudaWrapper_AllocPinnedHostData(maxrecv*sizeof(double), false,true);
-  }
-  else
-  {
-    memory->sfree(buf_recv);
-    buf_recv = (double *) memory->smalloc(maxrecv*sizeof(double),
-                                        "comm:buf_recv");
-  }
-  for(int i=0;i<maxswap;i++)
-  {
-    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],oldmaxrecv);
-    cuda->shared_data.comm.buf_recv_dev[i]=CudaWrapper_AllocCudaData((maxrecv)*sizeof(double));
-  }
-}
-
-/* ----------------------------------------------------------------------
-   realloc the size of the iswap sendlist as needed with BUFFACTOR
-------------------------------------------------------------------------- */
-
-void CommCuda::grow_list(int iswap, int n)
-{
-
-  MYDBG(printf(" # CUDA CommCuda::grow_list\n");)
-  if(cuda->finished_setup&&cu_sendlist) cu_sendlist->download();
-  if(!cu_sendlist||n*BUFFACTOR>cu_sendlist->get_dim()[1]||n*BUFFACTOR>maxsendlist[iswap])
-  {
-          for(int i=0;i<maxswap;i++)
-          {
-            maxsendlist[i] = static_cast<int> (BUFFACTOR * n);
-            sendlist[i] = (int *)
-                    memory->srealloc(sendlist[i],maxsendlist[i]*sizeof(int),
-                                     "comm:sendlist[iswap]");
-          }
-          delete cu_sendlist;
-          cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,maxswap,maxsendlist[iswap]);
-          cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
-    cuda->shared_data.comm.maxlistlength=maxsendlist[iswap];
-    cu_sendlist->upload();
-  }
- }
-
-/* ----------------------------------------------------------------------
-   realloc the buffers needed for swaps
-------------------------------------------------------------------------- */
-
-void CommCuda::grow_swap(int n)
-{
-  int oldmaxswap=maxswap;
-  CommBrick::grow_swap(n);
-  if(n>cu_sendlist->get_dim()[0])
-  {
-   MYDBG(printf(" # CUDA CommCuda::grow_swap\n");)
-
-          delete cu_sendlist;
-          cu_sendlist=new cCudaData<int, int, xy> ((int*)sendlist,n,BUFMIN);
-          cuda->shared_data.comm.sendlist.dev_data=cu_sendlist->dev_data();
-    cuda->shared_data.comm.maxlistlength=BUFMIN;
-    cuda->shared_data.comm.maxswap=n;
-    cuda->shared_data.comm.nsend_swap=new int[n];
-    cuda->shared_data.comm.send_size=new int[n];
-    cuda->shared_data.comm.recv_size=new int[n];
-  }
-  for(int i=0;i<oldmaxswap;i++)
-  {
-    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
-    if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
-    cuda->shared_data.comm.buf_recv_dev[i]=NULL;
-    cuda->shared_data.comm.buf_send_dev[i]=NULL;
-  }
-  cuda->shared_data.comm.buf_send= new double*[n];
-  cuda->shared_data.comm.buf_recv= new double*[n];
-  cuda->shared_data.comm.buf_send_dev= new void*[n];
-  cuda->shared_data.comm.buf_recv_dev= new void*[n];
-  for(int i=0;i<n;i++)
-  {
-    cuda->shared_data.comm.buf_recv[i]=NULL;
-    cuda->shared_data.comm.buf_send[i]=NULL;
-    cuda->shared_data.comm.buf_recv_dev[i]=NULL;
-    cuda->shared_data.comm.buf_send_dev[i]=NULL;
-  }
-  grow_send(maxsend,0);
-  grow_recv(maxrecv);
-
-  maxswap=n;
-}
-
-/* ----------------------------------------------------------------------
-   allocation of swap info
-------------------------------------------------------------------------- */
-
-void CommCuda::allocate_swap(int n)
-{
-   CommBrick::allocate_swap(n);
-
-          delete cu_pbc;
-          delete cu_slablo;
-          delete cu_slabhi;
-
-    cuda->shared_data.comm.maxswap=n;
-          if(cu_sendlist)
-          {
-            cu_pbc=new cCudaData<int, int, xy> ((int*)pbc,n,6);
-            cu_slablo = new cCudaData<double, X_CFLOAT,x>(slablo,n);
-            cu_slabhi = new cCudaData<double, X_CFLOAT,x>(slabhi,n);
-
-            cuda->shared_data.comm.pbc.dev_data=cu_pbc->dev_data();
-            cuda->shared_data.comm.slablo.dev_data=cu_slablo->dev_data();
-            cuda->shared_data.comm.slabhi.dev_data=cu_slabhi->dev_data();
-          }
-    cuda->shared_data.comm.nsend_swap=new int[n];
-    cuda->shared_data.comm.send_size=new int[n];
-    cuda->shared_data.comm.recv_size=new int[n];
-    cuda->shared_data.comm.buf_send= new double*[n];
-    cuda->shared_data.comm.buf_recv= new double*[n];
-    cuda->shared_data.comm.buf_send_dev= new void*[n];
-    cuda->shared_data.comm.buf_recv_dev= new void*[n];
-    for(int i=0;i<n;i++) cuda->shared_data.comm.buf_send_dev[i]=NULL;
-    for(int i=0;i<n;i++) cuda->shared_data.comm.buf_recv_dev[i]=NULL;
-}
-
-
-/* ----------------------------------------------------------------------
-   allocation of multi-type swap info
-------------------------------------------------------------------------- */
-
-void CommCuda::allocate_multi(int n)
-{
-  CommBrick::allocate_multi(n);
-
-          delete cu_multilo;
-          delete cu_multihi;
-          cu_multilo = new cCudaData<double, X_CFLOAT,xy>(slablo,n,atom->ntypes+1);
-          cu_multihi = new cCudaData<double, X_CFLOAT,xy>(slabhi,n,atom->ntypes+1);
-
-          cuda->shared_data.comm.multilo.dev_data=cu_multilo->dev_data();
-          cuda->shared_data.comm.multihi.dev_data=cu_multihi->dev_data();
-}
-
-/* ----------------------------------------------------------------------
-   free memory for swaps
-------------------------------------------------------------------------- */
-
-void CommCuda::free_swap()
-{
-
-  CommBrick::free_swap();
-
-  delete cuda->shared_data.comm.nsend_swap; cuda->shared_data.comm.nsend_swap=NULL;
-  delete cu_pbc; cu_pbc = NULL;
-  delete cu_slablo; cu_slablo = NULL;
-  delete cu_slabhi; cu_slabhi = NULL;
-  for(int i=0;i<maxswap;i++)
-  {
-    if(cuda->shared_data.comm.buf_recv_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_recv_dev[i],maxrecv*sizeof(double));
-    if(cuda->shared_data.comm.buf_send_dev[i]) CudaWrapper_FreeCudaData(cuda->shared_data.comm.buf_send_dev[i],maxsend*sizeof(double));
-  }
-
-
-}
-
-/* ----------------------------------------------------------------------
-   free memory for multi-type swaps
-------------------------------------------------------------------------- */
-
-void CommCuda::free_multi()
-{
-  CommBrick::free_multi();
-  delete cu_multilo; cu_multilo = NULL;
-  delete cu_multihi; cu_multihi = NULL;
-}
diff --git a/src/USER-CUDA/comm_cuda.h b/src/USER-CUDA/comm_cuda.h
deleted file mode 100644
index 5105018f32..0000000000
--- a/src/USER-CUDA/comm_cuda.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_COMM_CUDA_H
-#define LMP_COMM_CUDA_H
-
-#include "pointers.h"
-
-#include "cuda_data.h"
-#include "comm_brick.h"
-
-namespace LAMMPS_NS {
-
-class CommCuda : public CommBrick {
-public:
-  CommCuda(class LAMMPS *);
-  ~CommCuda();
-
-  virtual void init();
-  virtual void setup();                     // setup 3d communication pattern
-  virtual void forward_comm(int mode=0);              // forward communication of atom coords
-  virtual void forward_comm_cuda();
-  virtual void forward_comm_pack_cuda();
-  virtual void forward_comm_transfer_cuda();
-  virtual void forward_comm_unpack_cuda();
-  virtual void forward_comm_pair(Pair *pair);
-  virtual void reverse_comm();              // reverse communication of forces
-  virtual void exchange();                  // move atoms to new procs
-  virtual void exchange_cuda();                  // move atoms to new procs
-  virtual void borders();                   // setup list of atoms to communicate
-  virtual void borders_cuda();                   // setup list of atoms to communicate
-  virtual void borders_cuda_overlap_forward_comm();
-  virtual void forward_comm_fix(class Fix *, int size=0);          // forward comm from a Fix
-
-
-
-
- protected:
-  class Cuda *cuda;
-  cCudaData<int, int, xy>* cu_pbc;
-  cCudaData<double, X_CFLOAT, x>* cu_slablo;
-  cCudaData<double, X_CFLOAT, x>* cu_slabhi;
-  cCudaData<double, X_CFLOAT, xy>* cu_multilo;
-  cCudaData<double, X_CFLOAT, xy>* cu_multihi;
-
-  cCudaData<int, int, xy>* cu_sendlist;
-  virtual void grow_send(int,int);          // reallocate send buffer
-  virtual void grow_recv(int);              // free/allocate recv buffer
-  virtual void grow_list(int, int);         // reallocate one sendlist
-  virtual void grow_swap(int);              // grow swap and multi arrays
-  virtual void allocate_swap(int);          // allocate swap arrays
-  virtual void allocate_multi(int);         // allocate multi arrays
-  virtual void free_swap();                 // free swap arrays
-  virtual void free_multi();                // free multi arrays
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/compute_pe_cuda.cpp b/src/USER-CUDA/compute_pe_cuda.cpp
deleted file mode 100644
index b8661c9702..0000000000
--- a/src/USER-CUDA/compute_pe_cuda.cpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstring>
-#include "compute_pe_cuda.h"
-#include "atom.h"
-#include "update.h"
-#include "force.h"
-#include "pair.h"
-#include "bond.h"
-#include "angle.h"
-#include "dihedral.h"
-#include "improper.h"
-#include "kspace.h"
-#include "modify.h"
-#include "domain.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-ComputePECuda::ComputePECuda(LAMMPS *lmp, int narg, char **arg) :
-  ComputePE(lmp, narg, arg)
-{
-  cudable = 1;
-}
diff --git a/src/USER-CUDA/compute_pe_cuda.h b/src/USER-CUDA/compute_pe_cuda.h
deleted file mode 100644
index bc8b057762..0000000000
--- a/src/USER-CUDA/compute_pe_cuda.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef COMPUTE_CLASS
-
-ComputeStyle(pe/cuda,ComputePECuda)
-
-#else
-
-#ifndef LMP_COMPUTE_PE_CUDA_H
-#define LMP_COMPUTE_PE_CUDA_H
-
-#include "compute_pe.h"
-
-namespace LAMMPS_NS {
-
-class ComputePECuda : public ComputePE {
- public:
-  ComputePECuda(class LAMMPS *, int, char **);
-  ~ComputePECuda() {}
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/compute_pressure_cuda.cpp b/src/USER-CUDA/compute_pressure_cuda.cpp
deleted file mode 100644
index c92e918ad0..0000000000
--- a/src/USER-CUDA/compute_pressure_cuda.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstring>
-#include <cstdlib>
-#include "compute_pressure_cuda.h"
-#include "atom.h"
-#include "update.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "force.h"
-#include "pair.h"
-#include "bond.h"
-#include "angle.h"
-#include "dihedral.h"
-#include "improper.h"
-#include "kspace.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-enum{DUMMY0,INVOKED_SCALAR,INVOKED_VECTOR,DUMMMY3,INVOKED_PERATOM};
-
-/* ---------------------------------------------------------------------- */
-
-ComputePressureCuda::ComputePressureCuda(LAMMPS *lmp, int narg, char **arg) :
-  ComputePressure(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-  cudable = 1;
-
-  // store temperature ID used by pressure computation
-  // insure it is valid for temperature computation
-
-  int n = strlen(arg[3]) + 1;
-  char* id_temp = new char[n];
-  strcpy(id_temp,arg[3]);
-
-  int icompute = modify->find_compute(id_temp);
-  delete [] id_temp;
-  if (modify->compute[icompute]->cudable == 0)
-  {
-    error->warning(FLERR,"Compute pressure/cuda temperature ID is not cudable! Try a temp/cuda style.");
-    cudable = 0;
-  }
-
-}
-
-double ComputePressureCuda::compute_scalar()
-{
-  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();
-  return ComputePressure::compute_scalar();
-}
-
-void ComputePressureCuda::compute_vector()
-{
-  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();
-  ComputePressure::compute_vector();
-}
diff --git a/src/USER-CUDA/compute_pressure_cuda.h b/src/USER-CUDA/compute_pressure_cuda.h
deleted file mode 100644
index af48091708..0000000000
--- a/src/USER-CUDA/compute_pressure_cuda.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-#ifdef COMPUTE_CLASS
-
-ComputeStyle(pressure/cuda,ComputePressureCuda)
-
-#else
-
-#ifndef LMP_COMPUTE_PRESSURE_CUDA_H
-#define LMP_COMPUTE_PRESSURE_CUDA_H
-
-#include "compute_pressure.h"
-
-namespace LAMMPS_NS {
-
-class ComputePressureCuda : public ComputePressure {
- public:
-  ComputePressureCuda(class LAMMPS *, int, char **);
-  ~ComputePressureCuda() {}
-  double compute_scalar();
-  void compute_vector();
-
-  private:
-  class Cuda *cuda;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/compute_temp_cuda.cpp b/src/USER-CUDA/compute_temp_cuda.cpp
deleted file mode 100644
index 85afa07258..0000000000
--- a/src/USER-CUDA/compute_temp_cuda.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "compute_temp_cuda.h"
-#include "compute_temp_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "force.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "group.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-ComputeTempCuda::ComputeTempCuda(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 3) error->all(FLERR,"Illegal compute temp/cuda command");
-
-  scalar_flag = vector_flag = 1;
-  size_vector = 6;
-  extscalar = 0;
-  extvector = 1;
-  tempflag = 1;
-
-  vector = new double[6];
-  cu_t_vector = 0;
-  cu_t_scalar = 0;
-  cudable=true;
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-ComputeTempCuda::~ComputeTempCuda()
-{
-  delete [] vector;
-  delete cu_t_vector;
-  delete cu_t_scalar;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempCuda::setup()
-{
-  dynamic = 0;
-  if (dynamic_user || group->dynamic[igroup]) dynamic = 1;
-
-  fix_dof = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
-  dof_compute();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempCuda::dof_compute()
-{
-  double natoms = group->count(igroup);
-  dof = domain->dimension * natoms;
-  dof -= extra_dof + fix_dof;
-  if (dof > 0.0) tfactor = force->mvv2e / (dof * force->boltz);
-  else tfactor = 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double ComputeTempCuda::compute_scalar()
-{
-  if(cuda->begin_setup)
-  {
-          if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_CFLOAT, x> (t_vector,6);
-          if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_CFLOAT, x> (&t_scalar,1);
-    invoked_scalar = update->ntimestep;
-    Cuda_ComputeTempCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_CFLOAT*) cu_t_scalar->dev_data());
-    cu_t_scalar->download();
-  }
-  else
-  {
-  invoked_scalar = update->ntimestep;
-
-  double **v = atom->v;
-  double *mass = atom->mass;
-  double *rmass = atom->rmass;
-  int *type = atom->type;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  double t = 0.0;
-
-  if (rmass) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i];
-  } else {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) *
-          mass[type[i]];
-  }
-  t_scalar=t;
-  }
-
-  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
-  if (dynamic) dof_compute();
-  scalar *= tfactor;
-  if(scalar>1e15)
-  {
-          cuda->cu_v->download();
-          cuda->cu_x->download();
-          cuda->cu_type->download();
-    double **v = atom->v;
-    double **x = atom->x;
-    printf("Out of v-range atoms:  \n");
-          for(int i=0;i<atom->nlocal;i++)
-          if((v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2])>1e5)
-          printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
-          error->all(FLERR,"Temperature out of range. Simulations will be abortet.\n");
-  }
-  return scalar;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempCuda::compute_vector()
-{
-  int i;
-  if(cuda->begin_setup)
-  {
-  if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_CFLOAT, x> (t_vector,6);
-  if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_CFLOAT, x> (&t_scalar,1);
-
-  invoked_vector = update->ntimestep;
-
-  Cuda_ComputeTempCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_CFLOAT*) cu_t_vector->dev_data());
-  cu_t_vector->download();
-  }
-  else
-  {
-
-  invoked_vector = update->ntimestep;
-
-  double **v = atom->v;
-  double *mass = atom->mass;
-  double *rmass = atom->rmass;
-  int *type = atom->type;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  double massone,t[6];
-  for (i = 0; i < 6; i++) t[i] = 0.0;
-
-  for (i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit) {
-      if (rmass) massone = rmass[i];
-      else massone = mass[type[i]];
-      t[0] += massone * v[i][0]*v[i][0];
-      t[1] += massone * v[i][1]*v[i][1];
-      t[2] += massone * v[i][2]*v[i][2];
-      t[3] += massone * v[i][0]*v[i][1];
-      t[4] += massone * v[i][0]*v[i][2];
-      t[5] += massone * v[i][1]*v[i][2];
-    }
-
-  for (i = 0; i < 6; i++) t_vector[i]=t[i];
-  }
-  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
-  for (i = 0; i < 6; i++) vector[i] *= force->mvv2e;
-}
diff --git a/src/USER-CUDA/compute_temp_cuda.h b/src/USER-CUDA/compute_temp_cuda.h
deleted file mode 100644
index 54b3338c08..0000000000
--- a/src/USER-CUDA/compute_temp_cuda.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef COMPUTE_CLASS
-
-ComputeStyle(temp/cuda,ComputeTempCuda)
-
-#else
-
-#ifndef LMP_COMPUTE_TEMP_CUDA_H
-#define LMP_COMPUTE_TEMP_CUDA_H
-
-#include "compute.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class ComputeTempCuda : public Compute {
- public:
-  ComputeTempCuda(class LAMMPS *, int, char **);
-  ~ComputeTempCuda();
-  void init() {}
-  void setup();
-  double compute_scalar();
-  void compute_vector();
-
- private:
-  class Cuda *cuda;
-  int fix_dof;
-  double tfactor;
-
-  void dof_compute();
-  double t_vector[6];
-  double t_scalar;
-  cCudaData<double     , ENERGY_CFLOAT                   , x>* cu_t_scalar;
-  cCudaData<double     , ENERGY_CFLOAT                   , x>* cu_t_vector;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.cpp b/src/USER-CUDA/compute_temp_partial_cuda.cpp
deleted file mode 100644
index b366c546f6..0000000000
--- a/src/USER-CUDA/compute_temp_partial_cuda.cpp
+++ /dev/null
@@ -1,360 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "compute_temp_partial_cuda.h"
-#include "compute_temp_partial_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "force.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "group.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-ComputeTempPartialCuda::ComputeTempPartialCuda(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 6) error->all(FLERR,"Illegal compute temp/partial command");
-
-  scalar_flag = vector_flag = 1;
-  size_vector = 6;
-  extscalar = 0;
-  extvector = 1;
-  tempflag = 1;
-  tempbias = 1;
-
-  xflag = force->inumeric(FLERR,arg[3]);
-  yflag = force->inumeric(FLERR,arg[4]);
-  zflag = force->inumeric(FLERR,arg[5]);
-  if (zflag && domain->dimension == 2)
-    error->all(FLERR,"Compute temp/partial cannot use vz for 2d systemx");
-
-  maxbias = 0;
-  vbiasall = NULL;
-
-  vector = new double[6];
-  cu_t_vector = 0;
-  cu_t_scalar = 0;
-  cu_vbiasall=NULL;
-  cudable=true;
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-ComputeTempPartialCuda::~ComputeTempPartialCuda()
-{
-  memory->destroy(vbiasall);
-  delete [] vector;
-  delete cu_t_vector;
-  delete cu_t_scalar;
-  delete cu_vbiasall;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::setup()
-{
-  dynamic = 0;
-  if (dynamic_user || group->dynamic[igroup]) dynamic = 1;
-
-  fix_dof = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    fix_dof += modify->fix[i]->dof(igroup);
-  dof_compute();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::dof_compute()
-{
-  double natoms = group->count(igroup);
-  int nper = xflag+yflag+zflag;
-  dof = nper * natoms;
-  dof -= (1.0*nper/domain->dimension)*fix_dof + extra_dof;
-  if (dof > 0) tfactor = force->mvv2e / (dof * force->boltz);
-  else tfactor = 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int ComputeTempPartialCuda::dof_remove(int i)
-{
-  int nper = xflag+yflag+zflag;
-  return (domain->dimension - nper);
-}
-
-/* ---------------------------------------------------------------------- */
-
-double ComputeTempPartialCuda::compute_scalar()
-{
-  if(cuda->begin_setup)
-  {
-          if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_CFLOAT, x> (t_vector,6);
-          if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_CFLOAT, x> (&t_scalar,1);
-    invoked_scalar = update->ntimestep;
-    Cuda_ComputeTempPartialCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_CFLOAT*) cu_t_scalar->dev_data(),xflag,yflag,zflag);
-    cu_t_scalar->download();
-  }
-  else
-  {
-  invoked_scalar = update->ntimestep;
-
-  double **v = atom->v;
-  double *mass = atom->mass;
-  double *rmass = atom->rmass;
-  int *type = atom->type;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  double t = 0.0;
-
-  if (rmass) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        t += (xflag*v[i][0]*v[i][0] + yflag*v[i][1]*v[i][1] + zflag*v[i][2]*v[i][2]) * rmass[i];
-  } else {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        t += (xflag*v[i][0]*v[i][0] + yflag*v[i][1]*v[i][1] + zflag*v[i][2]*v[i][2]) *
-          mass[type[i]];
-  }
-  t_scalar=t;
-  }
-
-  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
-  if (dynamic) dof_compute();
-  scalar *= tfactor;
-  if(scalar>1e15)
-  {
-          cuda->cu_v->download();
-          cuda->cu_x->download();
-          cuda->cu_type->download();
-    double **v = atom->v;
-    double **x = atom->x;
-    printf("Out of v-range atoms:  \n");
-          for(int i=0;i<atom->nlocal;i++)
-          if((v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2])>1e5)
-          printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],x[i][0], x[i][1], x[i][2],v[i][0], v[i][1], v[i][2]);
-          error->all(FLERR,"Temperature out of range. Simulations will be abortet.\n");
-  }
-  return scalar;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::compute_vector()
-{
-  int i;
-  if(cuda->begin_setup)
-  {
-  if(not cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_CFLOAT, x> (t_vector,6);
-  if(not cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_CFLOAT, x> (&t_scalar,1);
-
-  invoked_vector = update->ntimestep;
-
-  Cuda_ComputeTempPartialCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_CFLOAT*) cu_t_vector->dev_data(),xflag,yflag,zflag);
-  cu_t_vector->download();
-  }
-  else
-  {
-
-  invoked_vector = update->ntimestep;
-
-  double **v = atom->v;
-  double *mass = atom->mass;
-  double *rmass = atom->rmass;
-  int *type = atom->type;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  double massone,t[6];
-  for (i = 0; i < 6; i++) t[i] = 0.0;
-
-  for (i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit) {
-      if (rmass) massone = rmass[i];
-      else massone = mass[type[i]];
-      t[0] += massone * xflag*v[i][0]*v[i][0];
-      t[1] += massone * yflag*v[i][1]*v[i][1];
-      t[2] += massone * zflag*v[i][2]*v[i][2];
-      t[3] += massone * xflag*yflag*v[i][0]*v[i][1];
-      t[4] += massone * xflag*zflag*v[i][0]*v[i][2];
-      t[5] += massone * yflag*zflag*v[i][1]*v[i][2];
-    }
-
-  for (i = 0; i < 6; i++) t_vector[i]=t[i];
-  }
-  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
-  for (i = 0; i < 6; i++) vector[i] *= force->mvv2e;
-}
-
-/* ----------------------------------------------------------------------
-   remove velocity bias from atom I to leave thermal velocity
-------------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::remove_bias(int i, double *v)
-{
-  if (!xflag) {
-    vbias[0] = v[0];
-    v[0] = 0.0;
-  }
-  if (!yflag) {
-    vbias[1] = v[1];
-    v[1] = 0.0;
-  }
-  if (!zflag) {
-    vbias[2] = v[2];
-    v[2] = 0.0;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   remove velocity bias from all atoms to leave thermal velocity
-------------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::remove_bias_all()
-{
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  if (nlocal > maxbias) {
-    memory->destroy(vbiasall);
-    maxbias = atom->nmax;
-    memory->create(vbiasall,maxbias,3,"temp/partial:vbiasall");
-        delete cu_vbiasall;
-        cu_vbiasall = new cCudaData<double, V_CFLOAT, yx> ((double*)vbiasall, atom->nmax, 3);
-  }
-  if(cuda->begin_setup)
-  {
-                  Cuda_ComputeTempPartialCuda_RemoveBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
-  }
-  else
-  {
-  if (!xflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        vbiasall[i][0] = v[i][0];
-        v[i][0] = 0.0;
-      }
-  }
-  if (!yflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        vbiasall[i][1] = v[i][1];
-        v[i][1] = 0.0;
-      }
-  }
-  if (!zflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        vbiasall[i][2] = v[i][2];
-        v[i][2] = 0.0;
-      }
-  }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   add back in velocity bias to atom I removed by remove_bias()
-   assume remove_bias() was previously called
-------------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::restore_bias(int i, double *v)
-{
-  if (!xflag) v[0] += vbias[0];
-  if (!yflag) v[1] += vbias[1];
-  if (!zflag) v[2] += vbias[2];
-}
-
-/* ----------------------------------------------------------------------
-   add back in velocity bias to all atoms removed by remove_bias_all()
-   assume remove_bias_all() was previously called
-------------------------------------------------------------------------- */
-
-void ComputeTempPartialCuda::restore_bias_all()
-{
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if(cuda->begin_setup)
-  {
-                  Cuda_ComputeTempPartialCuda_RestoreBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
-  }
-  else
-  {
-
-  if (!xflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        v[i][0] += vbiasall[i][0];
-  }
-  if (!yflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        v[i][1] += vbiasall[i][1];
-  }
-  if (!zflag) {
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        v[i][2] += vbiasall[i][2];
-  }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-double ComputeTempPartialCuda::memory_usage()
-{
-  double bytes = maxbias * sizeof(double);
-  return bytes;
-}
diff --git a/src/USER-CUDA/compute_temp_partial_cuda.h b/src/USER-CUDA/compute_temp_partial_cuda.h
deleted file mode 100644
index 320bf17858..0000000000
--- a/src/USER-CUDA/compute_temp_partial_cuda.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef COMPUTE_CLASS
-
-ComputeStyle(temp/partial/cuda,ComputeTempPartialCuda)
-
-#else
-
-#ifndef LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
-#define LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
-
-#include "compute.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class ComputeTempPartialCuda : public Compute {
- public:
-  ComputeTempPartialCuda(class LAMMPS *, int, char **);
-  ~ComputeTempPartialCuda();
-  void init() {}
-  void setup();
-  double compute_scalar();
-  void compute_vector();
-
-  int dof_remove(int);
-  void remove_bias(int, double *);
-  void remove_bias_all();
-  void restore_bias(int, double *);
-  void restore_bias_all();
-  double memory_usage();
-
- private:
-  class Cuda *cuda;
-  int xflag,yflag,zflag;
-  int fix_dof;
-  double tfactor;
-
-  void dof_compute();
-  double t_vector[6];
-  double t_scalar;
-  cCudaData<double     , ENERGY_CFLOAT                   , x>* cu_t_scalar;
-  cCudaData<double     , ENERGY_CFLOAT                   , x>* cu_t_vector;
-  cCudaData<double, V_CFLOAT, yx>* cu_vbiasall;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/cuda.cpp b/src/USER-CUDA/cuda.cpp
deleted file mode 100644
index 995289a792..0000000000
--- a/src/USER-CUDA/cuda.cpp
+++ /dev/null
@@ -1,1067 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "user_cuda.h"
-#include "atom.h"
-#include "domain.h"
-#include "force.h"
-#include "pair.h"
-#include "update.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "universe.h"
-#include "input.h"
-#include "atom_masks.h"
-#include "error.h"
-
-#include "cuda_neigh_list.h"
-//#include "pre_binning_cu.h"
-//#include "reverse_binning_cu.h"
-#include <ctime>
-#include <cmath>
-#include "cuda_pair_cu.h"
-#include "cuda_cu.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-Cuda::Cuda(LAMMPS* lmp) : Pointers(lmp)
-{
-  cuda_exists = true;
-  lmp->cuda = this;
-
-  if (universe->me == 0) printf("# Using LAMMPS_CUDA \n");
-
-  shared_data.me = universe->me;
-
-  device_set = false;
-  devicelist = NULL;
-
-  Cuda_Cuda_GetCompileSettings(&shared_data);
-
-  if (universe->me == 0) {
-
-    if(shared_data.compile_settings.prec_glob != sizeof(CUDA_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: Global Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_glob, (int) sizeof(CUDA_CFLOAT) / 4);
-
-    if(shared_data.compile_settings.prec_x != sizeof(X_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: X Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_x, (int) sizeof(X_CFLOAT) / 4);
-
-    if(shared_data.compile_settings.prec_v != sizeof(V_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: V Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_v, (int) sizeof(V_CFLOAT) / 4);
-
-    if(shared_data.compile_settings.prec_f != sizeof(F_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: F Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_f, (int) sizeof(F_CFLOAT) / 4);
-
-    if(shared_data.compile_settings.prec_pppm != sizeof(PPPM_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: PPPM Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_pppm, (int) sizeof(PPPM_CFLOAT) / 4);
-
-    if(shared_data.compile_settings.prec_fft != sizeof(FFT_CFLOAT) / 4)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: FFT Precision: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.prec_fft, (int) sizeof(FFT_CFLOAT) / 4);
-
-#ifdef FFT_CUFFT
-    if(shared_data.compile_settings.cufft != 1)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.cufft, 1);
-#else
-    if(shared_data.compile_settings.cufft != 0)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: cufft: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.cufft, 0);
-#endif
-
-    if(shared_data.compile_settings.arch != CUDA_ARCH)
-      printf("\n\n # CUDA WARNING: Compile Settings of cuda and cpp code differ! \n"
-                 " # CUDA WARNING: arch: cuda %i cpp %i\n\n",
-             shared_data.compile_settings.cufft, CUDA_ARCH);
-  }
-
-  cu_x          = 0;
-  cu_v          = 0;
-  cu_f          = 0;
-  cu_tag        = 0;
-  cu_type       = 0;
-  cu_mask       = 0;
-  cu_image      = 0;
-  cu_xhold      = 0;
-  cu_q          = 0;
-  cu_rmass      = 0;
-  cu_mass       = 0;
-  cu_virial     = 0;
-  cu_eatom      = 0;
-  cu_vatom      = 0;
-  cu_radius          = 0;
-  cu_density          = 0;
-  cu_omega          = 0;
-  cu_torque          = 0;
-
-  cu_special           = 0;
-  cu_nspecial   = 0;
-
-  cu_molecule   = 0;
-
-  cu_x_type           = 0;
-  x_type                  = 0;
-  cu_v_radius          = 0;
-  v_radius          = 0;
-  cu_omega_rmass          = 0;
-  omega_rmass          = 0;
-
-  binned_id = 0;
-  cu_binned_id  = 0;
-  binned_idnew = 0;
-  cu_binned_idnew = 0;
-
-  cu_map_array = 0;
-
-  copy_buffer = 0;
-  copy_buffersize = 0;
-
-  neighbor_decide_by_integrator = 0;
-  pinned = true;
-
-  debugdata = 0;
-
-  finished_setup = false;
-  begin_setup = false;
-  finished_run = false;
-
-  setSharedDataZero();
-
-  uploadtime = 0;
-  downloadtime = 0;
-  dotiming = false;
-
-  dotestatom = false;
-  testatom = 0;
-  oncpu = true;
-
-  self_comm = 0;
-  MYDBG(printf("# CUDA: Cuda::Cuda Done...\n");)
-  //cCudaData<double, float, yx >
-}
-
-/* ---------------------------------------------------------------------- */
-
-Cuda::~Cuda()
-{
-  print_timings();
-
-  if (universe->me == 0) printf("# CUDA: Free memory...\n");
-
-  delete [] devicelist;
-
-  delete cu_q;
-  delete cu_x;
-  delete cu_v;
-  delete cu_f;
-  delete cu_tag;
-  delete cu_type;
-  delete cu_mask;
-  delete cu_image;
-  delete cu_xhold;
-  delete cu_mass;
-  delete cu_rmass;
-  delete cu_virial;
-  delete cu_eng_vdwl;
-  delete cu_eng_coul;
-  delete cu_extent;
-  delete cu_eatom;
-  delete cu_vatom;
-  delete cu_radius;
-  delete cu_density;
-  delete cu_omega;
-  delete cu_torque;
-  delete cu_molecule;
-
-  delete cu_x_type;
-  delete [] x_type;
-  delete cu_v_radius;
-  delete [] v_radius;
-  delete cu_omega_rmass;
-  delete [] omega_rmass;
-
-  delete cu_debugdata;
-  delete[] debugdata;
-
-  delete cu_map_array;
-
-  std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.begin();
-
-  while(p != neigh_lists.end()) {
-    delete p->second;
-    ++p;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   package cuda command
-   can be invoked multiple times: -c on, -pk, package command
-   can only init GPUs once in activate(), so just store params here
-   narg = number of entries in arg
-   arg[0] = # of GPUs/node, remaining entries are optional keywords
-   malformed input is reported through error->all()
-------------------------------------------------------------------------- */
-
-void Cuda::accelerator(int narg, char **arg)
-{
-  // this error should not happen
-
-  if (device_set) error->all(FLERR,"USER-CUDA device is already activated");
-
-  // pppn = # of GPUs/node
-  // required argument, so never read arg[0] from an empty argument list
-
-  if (narg < 1) error->all(FLERR,"Illegal package cuda command");
-  pppn = force->inumeric(FLERR,arg[0]);
-  if (pppn <= 0) error->all(FLERR,"Illegal package cuda command");
-
-  // optional args
-  // every branch must advance iarg past its keyword and values
-
-  delete [] devicelist;
-  devicelist = NULL;
-  int newtonflag = 0;
-
-  int iarg = 1;
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"newton") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
-      if (strcmp(arg[iarg+1],"off") == 0) newtonflag = 0;
-      else if (strcmp(arg[iarg+1],"on") == 0) newtonflag = 1;
-      else error->all(FLERR,"Illegal package cuda command");
-      // skip keyword + value, otherwise the loop re-parses "newton" forever
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"gpuID") == 0) {
-      if (iarg+pppn+1 > narg) error->all(FLERR,"Illegal package cuda command");
-      // release a list stored by an earlier gpuID keyword in the same command
-      delete [] devicelist;
-      devicelist = new int[pppn];
-      for (int k = 0; k < pppn; k++)
-        devicelist[k] = force->inumeric(FLERR,arg[iarg+k+1]);
-      iarg += pppn + 1;
-    } else if (strcmp(arg[iarg],"timing") == 0) {
-      dotiming = true;
-      iarg++;
-    } else if (strcmp(arg[iarg],"test") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
-      testatom = force->numeric(FLERR,arg[iarg+1]);
-      dotestatom = true;
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"thread") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
-      if (strcmp(arg[iarg+1],"auto") == 0)
-        shared_data.pair.override_block_per_atom = -1;
-      else if (strcmp(arg[iarg+1],"tpa") == 0)
-        shared_data.pair.override_block_per_atom = 0;
-      else if (strcmp(arg[iarg+1],"bpa") == 0)
-        shared_data.pair.override_block_per_atom = 1;
-      else error->all(FLERR,"Illegal package cuda command");
-      iarg += 2;
-    }
-
-    // undocumented options
-
-    else if (strcmp(arg[iarg],"suffix") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
-      // NOTE(review): assumes lmp->suffix is an allocated buffer large
-      // enough for the given string - confirm against its allocation
-      strcpy(lmp->suffix,arg[iarg+1]);
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"overlap_comm") == 0) {
-      shared_data.overlap_comm = 1;
-      iarg++;
-    } else if (strcmp(arg[iarg],"pinned") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal package cuda command");
-      pinned = force->inumeric(FLERR,arg[iarg+1]) == 0 ? false : true;
-      if ((pinned == false) && (universe->me == 0))
-        printf(" #CUDA: Pinned memory is not used for communication\n");
-      iarg += 2;
-    } else error->all(FLERR,"Illegal package cuda command");
-  }
-
-  // set newton flags
-  // all three flags follow the single newton setting (off unless given)
-
-  force->newton = force->newton_pair = force->newton_bond = newtonflag;
-}
-
-/* ----------------------------------------------------------------------
-   bring up the GPUs
-   the body runs at most once, with the settings the last package
-   command left behind
-------------------------------------------------------------------------- */
-
-void Cuda::activate()
-{
-  // later calls are no-ops once device_set has been raised
-
-  if (!device_set) {
-    device_set = true;
-
-    if (universe->me == 0) printf("# CUDA: Activate GPU \n");
-
-    // streams are added unconditionally
-    // (a check on shared_data.overlap_comm used to be considered here)
-
-    CudaWrapper_Init(0, (char**)0, universe->me, pppn, devicelist);
-    CudaWrapper_AddStreams(3);
-
-    // reset the basic per-atom handles
-
-    cu_x = 0;
-    cu_v = 0;
-    cu_f = 0;
-    cu_tag = 0;
-    cu_type = 0;
-    cu_mask = 0;
-    cu_image = 0;
-    cu_xhold = 0;
-    cu_q = 0;
-    cu_rmass = 0;
-    cu_mass = 0;
-    cu_virial = 0;
-    cu_eatom = 0;
-    cu_vatom = 0;
-    cu_radius = 0;
-    cu_density = 0;
-    cu_omega = 0;
-    cu_torque = 0;
-
-    // reset the molecular handles
-
-    cu_special = 0;
-    cu_nspecial = 0;
-    cu_molecule = 0;
-
-    // reset the packed arrays
-
-    cu_x_type = 0;
-    cu_v_radius = 0;
-    cu_omega_rmass = 0;
-
-    // reset the binning handles
-
-    cu_binned_id = 0;
-    cu_binned_idnew = 0;
-
-    allocate();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   put shared_data into its initial state
-   the update_* flags and buffer_new start at 1,
-   override_block_per_atom starts at -1, everything else at 0/NULL/false
-------------------------------------------------------------------------- */
-
-void Cuda::setSharedDataZero()
-{
-  MYDBG(printf("# CUDA: Cuda::setSharedDataZero ...\n");)
-
-  // per-atom bookkeeping
-  cuda_shared_atom* cu_atom = &shared_data.atom;
-  cu_atom->nlocal = 0;
-  cu_atom->nghost = 0;
-  cu_atom->nall = 0;
-  cu_atom->nmax = 0;
-  cu_atom->ntypes = 0;
-  cu_atom->q_flag = 0;
-  cu_atom->need_eatom = 0;
-  cu_atom->need_vatom = 0;
-  cu_atom->update_nmax = 1;
-  cu_atom->update_nlocal = 1;
-  cu_atom->update_neigh = 1;
-
-  // pair settings
-  shared_data.pair.cudable_force = 0;
-  shared_data.pair.collect_forces_later = 0;
-  shared_data.pair.use_block_per_atom = 0;
-  shared_data.pair.override_block_per_atom = -1;
-  shared_data.pair.cut = 0;
-  shared_data.pair.cutsq = 0;
-  shared_data.pair.cut_inner = 0;
-  shared_data.pair.cut_coul = 0;
-  shared_data.pair.special_lj = 0;
-  shared_data.pair.special_coul = 0;
-  shared_data.pair.neighall = false;
-
-  // pppm settings
-  shared_data.pppm.cudable_force = 0;
-
-  // general-purpose buffer
-  shared_data.buffersize = 0;
-  shared_data.buffer_new = 1;
-  shared_data.buffer = NULL;
-
-  // communication settings; timings are not reset here
-  shared_data.overlap_comm = 0;
-  shared_data.comm.comm_phase = 0;
-  shared_data.comm.buffer = NULL;
-  shared_data.comm.buffer_size = 0;
-  shared_data.comm.overlap_split_ratio = 0;
-}
-
-void Cuda::allocate()
-{
-  MYDBG(printf("# CUDA: Cuda::allocate ...\n");)
-
-  // first call only: cu_virial doubles as the "already set up" marker
-  if(cu_virial == NULL) {
-    const int ndebug = 2 * CUDA_MAX_DEBUG_SIZE;
-
-    // pair accumulators start without a host pointer (NULL)
-    cu_virial = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.virial, 6);
-    cu_eng_vdwl = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_vdwl, 1);
-    cu_eng_coul = new cCudaData<double, ENERGY_CFLOAT, x > (NULL, & shared_data.pair.eng_coul, 1);
-    cu_extent = new cCudaData<double, double, x> (extent, 6);
-
-    shared_data.flag = CudaWrapper_AllocCudaData(sizeof(int));
-
-    // host + device storage for debug output
-    debugdata = new int[ndebug];
-    cu_debugdata = new cCudaData<int, int, x > (debugdata, ndebug);
-    shared_data.debugdata = cu_debugdata->dev_data();
-  }
-
-  // every call: size the per-atom arrays, then refresh the system parameters
-  checkResize();
-  setSystemParams();
-
-  MYDBG(printf("# CUDA: Cuda::allocate done...\n");)
-}
-
-void Cuda::setSystemParams()
-{
-  MYDBG(printf("# CUDA: Cuda::setSystemParams ...\n");)
-
-  // mirror the atom counts and flags of the host Atom object into shared_data
-  cuda_shared_atom* cu_atom = &shared_data.atom;
-
-  cu_atom->nlocal     = atom->nlocal;
-  cu_atom->nghost     = atom->nghost;
-  cu_atom->nall       = atom->nlocal + atom->nghost;
-  cu_atom->ntypes     = atom->ntypes;
-  cu_atom->q_flag     = atom->q_flag;
-  cu_atom->rmass_flag = atom->rmass_flag;
-
-  MYDBG(printf("# CUDA: Cuda::setSystemParams done ...\n");)
-}
-
-void Cuda::setDomainParams()
-{
-  MYDBG(printf("# CUDA: Cuda::setDomainParams ...\n");)
-  cuda_shared_domain* cu_domain = &shared_data.domain;
-
-  cu_domain->triclinic = domain->triclinic;
-
-  // per-dimension box description
-  for(int d = 0; d < 3; ++d) {
-    cu_domain->periodicity[d] = domain->periodicity[d];
-    cu_domain->boxlo[d] = domain->boxlo[d];
-    cu_domain->boxhi[d] = domain->boxhi[d];
-    cu_domain->prd[d] = domain->prd[d];
-
-    if(domain->triclinic) {
-      // triclinic boxes also carry the lamda values, and the sub-domain
-      // bounds end up holding the lamda bounds
-      cu_domain->boxlo_lamda[d] = domain->boxlo_lamda[d];
-      cu_domain->boxhi_lamda[d] = domain->boxhi_lamda[d];
-      cu_domain->prd_lamda[d] = domain->prd_lamda[d];
-      cu_domain->sublo[d] = domain->sublo_lamda[d];
-      cu_domain->subhi[d] = domain->subhi_lamda[d];
-    } else {
-      cu_domain->sublo[d] = domain->sublo[d];
-      cu_domain->subhi[d] = domain->subhi[d];
-    }
-  }
-
-  // tilt factors are only copied for triclinic boxes
-  if(domain->triclinic) {
-    cu_domain->xy = domain->xy;
-    cu_domain->xz = domain->xz;
-    cu_domain->yz = domain->yz;
-  }
-
-  // the six-entry h, h_inv and h_rate arrays are always copied
-  for(int k = 0; k < 6; k++) {
-    cu_domain->h[k] = domain->h[k];
-    cu_domain->h_inv[k] = domain->h_inv[k];
-    cu_domain->h_rate[k] = domain->h_rate[k];
-  }
-
-  cu_domain->update = 2;
-  MYDBG(printf("# CUDA: Cuda::setDomainParams done ...\n");)
-}
-
-/* ----------------------------------------------------------------------
-   keep the cCudaData wrappers in step with the host arrays
-   (re)creates the per-atom wrappers when atom->nmax exceeds the recorded
-   size or nothing has been created yet, then re-points every wrapper
-   whose host array has moved
-   sets update_nmax = 2 after a re-creation and always update_nlocal = 2
-------------------------------------------------------------------------- */
-
-void Cuda::checkResize()
-{
-  MYDBG(printf("# CUDA: Cuda::checkResize ...\n");)
-  cuda_shared_atom* cu_atom = & shared_data.atom;
-  cu_atom->q_flag      = atom->q_flag;
-  cu_atom->rmass_flag  = atom->rmass ? 1 : 0;
-  cu_atom->nall = atom->nlocal + atom->nghost;
-  cu_atom->nlocal      = atom->nlocal;
-  cu_atom->nghost      = atom->nghost;
-
-  // do we have more atoms to upload than currently allocated memory on device? (also true if nothing yet allocated)
-  if(atom->nmax > cu_atom->nmax || cu_tag == NULL) {
-    delete cu_x;
-    cu_x         = new cCudaData<double, X_CFLOAT, yx> ((double*)atom->x , & cu_atom->x        , atom->nmax, 3, 0, true); //cu_x->set_buffer(&(shared_data.buffer),&(shared_data.buffersize),true);
-    delete cu_v;
-    cu_v         = new cCudaData<double, V_CFLOAT, yx> ((double*)atom->v, & cu_atom->v         , atom->nmax, 3);
-    delete cu_f;
-    cu_f         = new cCudaData<double, F_CFLOAT, yx> ((double*)atom->f, & cu_atom->f         , atom->nmax, 3, 0, true);
-    delete cu_tag;
-    cu_tag       = new cCudaData<int   , int    , x > (atom->tag       , & cu_atom->tag       , atom->nmax, 0, true);
-    delete cu_type;
-    cu_type      = new cCudaData<int   , int    , x > (atom->type      , & cu_atom->type      , atom->nmax, 0, true);
-    delete cu_mask;
-    cu_mask      = new cCudaData<int   , int    , x > (atom->mask      , & cu_atom->mask      , atom->nmax, 0, true);
-    delete cu_image;
-    cu_image     = new cCudaData<int   , int    , x > (atom->image     , & cu_atom->image     , atom->nmax, 0, true);
-
-    // optional per-atom arrays are only wrapped when the host array exists
-    if(atom->rmass) {
-      delete cu_rmass;
-      cu_rmass     = new cCudaData<double, V_CFLOAT, x > (atom->rmass     , & cu_atom->rmass     , atom->nmax);
-    }
-
-    if(cu_atom->q_flag) {
-      delete cu_q;
-      cu_q         = new cCudaData<double, F_CFLOAT, x > ((double*)atom->q, & cu_atom->q         , atom->nmax, 0 , true);
-    }// cu_q->set_buffer(&(copy_buffer),&(copy_buffersize),true);}
-
-    if(atom->radius) {
-      delete cu_radius;
-      cu_radius    = new cCudaData<double, X_CFLOAT, x > (atom->radius    , & cu_atom->radius     , atom->nmax);
-      delete cu_v_radius;
-      cu_v_radius  = new cCudaData<V_CFLOAT, V_CFLOAT, x> (v_radius , & cu_atom->v_radius      , atom->nmax * 4);
-      delete cu_omega_rmass;
-      cu_omega_rmass  = new cCudaData<V_CFLOAT, V_CFLOAT, x> (omega_rmass , & cu_atom->omega_rmass      , atom->nmax * 4);
-    }
-
-    if(atom->omega) {
-      delete cu_omega;
-      cu_omega     = new cCudaData<double, V_CFLOAT, yx > (((double*) atom->omega)    , & cu_atom->omega     , atom->nmax, 3);
-    }
-
-    if(atom->torque) {
-      delete cu_torque;
-      cu_torque    = new cCudaData<double, F_CFLOAT, yx > (((double*) atom->torque)   , & cu_atom->torque     , atom->nmax, 3);
-    }
-
-    if(atom->special) {
-      delete cu_special;
-      cu_special    = new cCudaData<int, int, yx > (((int*) & (atom->special[0][0]))   , & cu_atom->special     , atom->nmax, atom->maxspecial, 0 , true);
-      shared_data.atom.maxspecial = atom->maxspecial;
-    }
-
-    if(atom->nspecial) {
-      delete cu_nspecial;
-      cu_nspecial    = new cCudaData<int, int, yx > (((int*) atom->nspecial)  , & cu_atom->nspecial     , atom->nmax, 3, 0, true);
-    }
-
-    if(atom->molecule) {
-      delete cu_molecule;
-      cu_molecule    = new cCudaData<int, int, x > (((int*) atom->molecule)  , & cu_atom->molecule     , atom->nmax, 0 , true);
-    }
-
-    shared_data.atom.special_flag = neighbor->special_flag;
-    shared_data.atom.molecular = atom->molecular;
-
-    cu_atom->update_nmax = 2;
-    cu_atom->nmax        = atom->nmax;
-
-    delete cu_x_type;
-    cu_x_type   = new cCudaData<X_CFLOAT, X_CFLOAT, x> (x_type , & cu_atom->x_type      , atom->nmax * 4);
-  }
-
-  // xhold follows neighbor->maxhold rather than atom->nmax
-  if(((cu_xhold == NULL) || (cu_xhold->get_dim()[0] < neighbor->maxhold)) && neighbor->xhold) {
-    delete cu_xhold;
-    cu_xhold     = new cCudaData<double, X_CFLOAT, yx> ((double*)neighbor->xhold, & cu_atom->xhold         , neighbor->maxhold, 3);
-    shared_data.atom.maxhold = neighbor->maxhold;
-  }
-
-  // per-type masses are wrapped once (ntypes + 1 entries)
-  if(atom->mass && !cu_mass) {
-    cu_mass      = new cCudaData<double, V_CFLOAT, x > (atom->mass      , & cu_atom->mass      , atom->ntypes + 1);
-  }
-
-  cu_atom->mass_host   = atom->mass;
-
-  // the map array is only mirrored for map_style 1 and only ever grows
-  if(atom->map_style == 1) {
-    if(cu_map_array == NULL) {
-      cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size());
-    } else if(cu_map_array->dev_size() / sizeof(int) < atom->get_map_size()) {
-      delete cu_map_array;
-      cu_map_array   = new cCudaData<int, int, x > (atom->get_map_array()   , & cu_atom->map_array     , atom->get_map_size());
-    }
-  }
-
-
-  // if any of the host pointers have changed (e.g. re-allocated somewhere else), set to correct pointer
-  if(cu_x   ->get_host_data() != atom->x)    cu_x   ->set_host_data((double*)(atom->x));
-
-  if(cu_v   ->get_host_data() != atom->v)    cu_v   ->set_host_data((double*)(atom->v));
-
-  if(cu_f   ->get_host_data() != atom->f)    cu_f   ->set_host_data((double*)(atom->f));
-
-  if(cu_tag ->get_host_data() != atom->tag)  cu_tag ->set_host_data(atom->tag);
-
-  if(cu_type->get_host_data() != atom->type) cu_type->set_host_data(atom->type);
-
-  if(cu_mask->get_host_data() != atom->mask) cu_mask->set_host_data(atom->mask);
-
-  // the image wrapper itself must be re-pointed here; updating cu_mask
-  // instead would leave cu_image stale and make cu_mask transfer image data
-  if(cu_image->get_host_data() != atom->image) cu_image->set_host_data(atom->image);
-
-  if(cu_xhold)
-    if(cu_xhold->get_host_data() != neighbor->xhold) cu_xhold->set_host_data((double*)(neighbor->xhold));
-
-  if(atom->rmass)
-    if(cu_rmass->get_host_data() != atom->rmass) cu_rmass->set_host_data((double*)(atom->rmass));
-
-  if(cu_atom->q_flag)
-    if(cu_q->get_host_data() != atom->q) cu_q->set_host_data((double*)(atom->q));
-
-  if(atom->radius)
-    if(cu_radius->get_host_data() != atom->radius) cu_radius->set_host_data((double*)(atom->radius));
-
-  if(atom->omega)
-    if(cu_omega->get_host_data() != atom->omega) cu_omega->set_host_data((double*)(atom->omega));
-
-  if(atom->torque)
-    if(cu_torque->get_host_data() != atom->torque) cu_torque->set_host_data((double*)(atom->torque));
-
-  // special is re-created rather than re-pointed
-  // NOTE(review): the host pointer and trailing constructor arguments here
-  // differ from the creation in the resize branch above - confirm intended
-  if(atom->special)
-    if(cu_special->get_host_data() != atom->special) {
-      delete cu_special;
-      cu_special    = new cCudaData<int, int, yx > (((int*) atom->special)   , & cu_atom->special     , atom->nmax, atom->maxspecial);
-      shared_data.atom.maxspecial = atom->maxspecial;
-    }
-
-  if(atom->nspecial)
-    if(cu_nspecial->get_host_data() != atom->nspecial) cu_nspecial->set_host_data((int*)(atom->nspecial));
-
-  if(atom->molecule)
-    if(cu_molecule->get_host_data() != atom->molecule) cu_molecule->set_host_data((int*)(atom->molecule));
-
-  // the accumulators live in force->pair; without a pair object there is
-  // nothing to point at, so leave the wrappers untouched in that case
-  if(force && force->pair) {
-    if(cu_virial   ->get_host_data() != force->pair->virial)    cu_virial   ->set_host_data(force->pair->virial);
-
-    if(cu_eng_vdwl ->get_host_data() != &force->pair->eng_vdwl)    cu_eng_vdwl  ->set_host_data(&force->pair->eng_vdwl);
-
-    if(cu_eng_coul ->get_host_data() != &force->pair->eng_coul)    cu_eng_coul   ->set_host_data(&force->pair->eng_coul);
-  }
-
-  cu_atom->update_nlocal = 2;
-  MYDBG(printf("# CUDA: Cuda::checkResize done...\n");)
-}
-
-void Cuda::evsetup_eatom_vatom(int eflag_atom, int vflag_atom)
-{
-  // per-atom energy requested
-  if(eflag_atom) {
-    // created on first use, sized to atom->nmax
-    if(cu_eatom == NULL) {
-      cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > (force->pair->eatom, & (shared_data.atom.eatom), atom->nmax);
-    }
-
-    // a size mismatch only raises the update flag, the wrapper is not rebuilt
-    if(cu_eatom->get_dim()[0] != atom->nmax)
-      shared_data.atom.update_nmax = 2;
-
-    // re-point at the current host array and clear the device copy
-    cu_eatom->set_host_data(force->pair->eatom);
-    cu_eatom->memset_device(0);
-  }
-
-  // per-atom virial requested: same steps with 6 values per atom
-  if(vflag_atom) {
-    if(cu_vatom == NULL) {
-      cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)force->pair->vatom, & (shared_data.atom.vatom), atom->nmax, 6);
-    }
-
-    if(cu_vatom->get_dim()[0] != atom->nmax)
-      shared_data.atom.update_nmax = 2;
-
-    cu_vatom->set_host_data((double*)force->pair->vatom);
-    cu_vatom->memset_device(0);
-  }
-}
-
-void Cuda::uploadAll()
-{
-  MYDBG(printf("# CUDA: Cuda::uploadAll() ... start\n");)
-  my_times t_begin;
-  my_times t_end;
-
-  // re-sync the wrappers first if the host allocation size has changed
-  if(shared_data.atom.nmax != atom->nmax) checkResize();
-
-  my_gettime(CLOCK_REALTIME, &t_begin);
-
-  // arrays that always exist
-  cu_x->upload();
-  cu_v->upload();
-  cu_f->upload();
-  cu_tag->upload();
-  cu_type->upload();
-  cu_mask->upload();
-  cu_image->upload();
-
-  // arrays that depend on a flag or on the host array being present
-  if(shared_data.atom.q_flag) cu_q->upload();
-  if(atom->rmass)             cu_rmass->upload();
-  if(atom->radius)            cu_radius->upload();
-  if(atom->omega)             cu_omega->upload();
-  if(atom->torque)            cu_torque->upload();
-  if(atom->special)           cu_special->upload();
-  if(atom->nspecial)          cu_nspecial->upload();
-  if(atom->molecule)          cu_molecule->upload();
-
-  // per-atom energy/virial, once their wrappers have been created
-  if(cu_eatom) cu_eatom->upload();
-  if(cu_vatom) cu_vatom->upload();
-
-  // add the elapsed wall time (seconds) to the running upload total
-  my_gettime(CLOCK_REALTIME, &t_end);
-  uploadtime += (t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000);
-
-  CUDA_IF_BINNING(Cuda_PreBinning(& shared_data);)
-  CUDA_IF_BINNING(Cuda_Binning(& shared_data);)
-
-  shared_data.atom.triggerneighsq = neighbor->triggersq;
-  MYDBG(printf("# CUDA: Cuda::uploadAll() ... end\n");)
-}
-
-void Cuda::downloadAll()
-{
-  MYDBG(printf("# CUDA: Cuda::downloadAll() ... start\n");)
-  my_times t_begin;
-  my_times t_end;
-
-  // re-sync the wrappers first if the host allocation size has changed
-  if(shared_data.atom.nmax != atom->nmax) checkResize();
-
-  CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
-  my_gettime(CLOCK_REALTIME, &t_begin);
-
-  // arrays that always exist
-  cu_x->download();
-  cu_v->download();
-  cu_f->download();
-  cu_type->download();
-  cu_tag->download();
-  cu_mask->download();
-  cu_image->download();
-
-  // arrays that depend on a flag or on the host array being present
-  if(shared_data.atom.q_flag) cu_q->download();
-  if(atom->rmass)             cu_rmass->download();
-  if(atom->radius)            cu_radius->download();
-  if(atom->omega)             cu_omega->download();
-  if(atom->torque)            cu_torque->download();
-  if(atom->special)           cu_special->download();
-  if(atom->nspecial)          cu_nspecial->download();
-  if(atom->molecule)          cu_molecule->download();
-
-  // per-atom energy/virial follow wrapper existence, not the need_* flags
-  if(cu_eatom) cu_eatom->download();
-  if(cu_vatom) cu_vatom->download();
-
-  // add the elapsed wall time (seconds) to the running download total
-  my_gettime(CLOCK_REALTIME, &t_end);
-  downloadtime += (t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000);
-  MYDBG(printf("# CUDA: Cuda::downloadAll() ... end\n");)
-}
-
-void Cuda::upload(int datamask)
-{
-  MYDBG(printf("# CUDA: Cuda::upload() ... start\n");)
-  my_times t_begin;
-  my_times t_end;
-
-  // re-sync the wrappers first if the host allocation size has changed
-  if(shared_data.atom.nmax != atom->nmax) checkResize();
-
-  my_gettime(CLOCK_REALTIME, &t_begin);
-
-  // arrays that always exist: selected purely by their mask bit
-  if(X_MASK & datamask)     cu_x->upload();
-  if(V_MASK & datamask)     cu_v->upload();
-  if(F_MASK & datamask)     cu_f->upload();
-  if(TYPE_MASK & datamask)  cu_type->upload();
-  if(TAG_MASK & datamask)   cu_tag->upload();
-  if(MASK_MASK & datamask)  cu_mask->upload();
-  if(IMAGE_MASK & datamask) cu_image->upload();
-
-  // optional arrays: the array (or flag) must be present and the bit set
-  if(shared_data.atom.q_flag && (Q_MASK & datamask)) cu_q->upload();
-  if(atom->rmass    && (RMASS_MASK & datamask))    cu_rmass->upload();
-  if(atom->radius   && (RADIUS_MASK & datamask))   cu_radius->upload();
-  if(atom->omega    && (OMEGA_MASK & datamask))    cu_omega->upload();
-  if(atom->torque   && (TORQUE_MASK & datamask))   cu_torque->upload();
-
-  // special and nspecial share SPECIAL_MASK
-  if(atom->special  && (SPECIAL_MASK & datamask))  cu_special->upload();
-  if(atom->nspecial && (SPECIAL_MASK & datamask))  cu_nspecial->upload();
-  if(atom->molecule && (MOLECULE_MASK & datamask)) cu_molecule->upload();
-
-  // per-atom energy/virial ignore the mask and go up whenever they exist
-  if(cu_eatom) cu_eatom->upload();
-  if(cu_vatom) cu_vatom->upload();
-
-  // add the elapsed wall time (seconds) to the running upload total
-  my_gettime(CLOCK_REALTIME, &t_end);
-  uploadtime += (t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000);
-  MYDBG(printf("# CUDA: Cuda::upload() ... end\n");)
-}
-
-void Cuda::download(int datamask)
-{
-  MYDBG(printf("# CUDA: Cuda::download() ... start\n");)
-  my_times t_begin;
-  my_times t_end;
-
-  // re-sync the wrappers first if the host allocation size has changed
-  if(shared_data.atom.nmax != atom->nmax) checkResize();
-
-  CUDA_IF_BINNING(Cuda_ReverseBinning(& shared_data);)
-  my_gettime(CLOCK_REALTIME, &t_begin);
-
-  // arrays that always exist: selected purely by their mask bit
-  if(X_MASK & datamask)     cu_x->download();
-  if(V_MASK & datamask)     cu_v->download();
-  if(F_MASK & datamask)     cu_f->download();
-  if(TYPE_MASK & datamask)  cu_type->download();
-  if(TAG_MASK & datamask)   cu_tag->download();
-  if(MASK_MASK & datamask)  cu_mask->download();
-  if(IMAGE_MASK & datamask) cu_image->download();
-
-  // optional arrays: the array (or flag) must be present and the bit set
-  if(shared_data.atom.q_flag && (Q_MASK & datamask)) cu_q->download();
-  if(atom->rmass    && (RMASS_MASK & datamask))    cu_rmass->download();
-  if(atom->radius   && (RADIUS_MASK & datamask))   cu_radius->download();
-  if(atom->omega    && (OMEGA_MASK & datamask))    cu_omega->download();
-  if(atom->torque   && (TORQUE_MASK & datamask))   cu_torque->download();
-
-  // special and nspecial share SPECIAL_MASK
-  if(atom->special  && (SPECIAL_MASK & datamask))  cu_special->download();
-  if(atom->nspecial && (SPECIAL_MASK & datamask))  cu_nspecial->download();
-  if(atom->molecule && (MOLECULE_MASK & datamask)) cu_molecule->download();
-
-  // per-atom energy/virial ignore the mask and come down whenever they exist
-  if(cu_eatom) cu_eatom->download();
-  if(cu_vatom) cu_vatom->download();
-
-  // add the elapsed wall time (seconds) to the running download total
-  my_gettime(CLOCK_REALTIME, &t_end);
-  downloadtime += (t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000);
-  MYDBG(printf("# CUDA: Cuda::download() ... end\n");)
-}
-
-void Cuda::downloadX()
-{
-  // run the x_type revert step on the shared data, then fetch the positions
-  Cuda_Pair_RevertXType(&shared_data);
-
-  cu_x->download();
-}
-
-/* ----------------------------------------------------------------------
-   return the CudaNeighList that belongs to neigh_list
-   a list seen before yields the instance already stored in neigh_lists,
-   otherwise a new CudaNeighList is created, stored and returned
-------------------------------------------------------------------------- */
-
-CudaNeighList* Cuda::registerNeighborList(class NeighList* neigh_list)
-{
-  MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... start a\n");)
-  CudaNeighList* neigh_list_cuda = NULL;
-  std::map<NeighList*, CudaNeighList*>::iterator p = neigh_lists.find(neigh_list);
-
-  if(p != neigh_lists.end()) {
-    // already registered: reuse the stored wrapper
-    neigh_list_cuda = p->second;
-  } else {
-    neigh_list_cuda = new CudaNeighList(lmp, neigh_list);
-    neigh_lists.insert(std::pair<NeighList*, CudaNeighList*>(neigh_list, neigh_list_cuda));
-  }
-
-  // single exit, so the closing debug trace is actually reached
-  MYDBG(printf("# CUDA: Cuda::registerNeighborList() ... end b\n");)
-  return neigh_list_cuda;
-}
-
-void Cuda::uploadAllNeighborLists()
-{
-  MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... start\n");)
-
-  typedef std::map<NeighList*, CudaNeighList*>::iterator list_iter;
-
-  for(list_iter it = neigh_lists.begin(); it != neigh_lists.end(); ++it) {
-    CudaNeighList* cnl = it->second;
-    cnl->nl_upload();
-
-    // lists flagged build_cuda skip the maxneighbors scan
-    if(cnl->neigh_list->cuda_list->build_cuda) continue;
-
-    // raise maxneighbors to the largest neighbor count among the local atoms
-    for(int i = 0; i < atom->nlocal; i++)
-      cnl->sneighlist.maxneighbors = MAX(cnl->neigh_list->numneigh[i], cnl->sneighlist.maxneighbors) ;
-  }
-
-  MYDBG(printf("# CUDA: Cuda::uploadAllNeighborList() ... done\n");)
-}
-
-void Cuda::downloadAllNeighborLists()
-{
-  MYDBG(printf("# CUDA: Cuda::downloadAllNeighborList() ... start\n");)
-
-  typedef std::map<NeighList*, CudaNeighList*>::iterator list_iter;
-
-  // call nl_download() on every registered neighbor list
-  for(list_iter it = neigh_lists.begin(); it != neigh_lists.end(); ++it)
-    it->second->nl_download();
-}
-
-void Cuda::update_xhold(int &maxhold, double* xhold)
-{
-  // grow the wrapper (and the caller's maxhold) when the recorded size is
-  // below atom->nmax
-  const bool too_small = shared_data.atom.maxhold < atom->nmax;
-
-  if(too_small) {
-    maxhold = atom->nmax;
-    delete cu_xhold;
-    cu_xhold = new cCudaData<double, X_CFLOAT, yx> ((double*)xhold, & shared_data.atom.xhold, maxhold, 3);
-  }
-
-  shared_data.atom.maxhold = maxhold;
-
-  // copy 3 values per atom for atom->nmax atoms from the x device buffer
-  // into the xhold device buffer
-  CudaWrapper_CopyData(cu_xhold->dev_data(), cu_x->dev_data(), 3 * atom->nmax * sizeof(X_CFLOAT));
-}
-
-/* ----------------------------------------------------------------------
-   reset all timing counters in shared_data.cuda_timings to zero
-   also calls the CudaWrapper_Check*Time() functions with a true argument
-   covers every counter that print_timings() reports
-------------------------------------------------------------------------- */
-
-void Cuda::setTimingsZero()
-{
-  shared_data.cuda_timings.test1 = 0;
-  shared_data.cuda_timings.test2 = 0;
-
-  //communication
-  shared_data.cuda_timings.comm_forward_total = 0;
-  shared_data.cuda_timings.comm_forward_mpi_upper = 0;
-  shared_data.cuda_timings.comm_forward_mpi_lower = 0;
-  shared_data.cuda_timings.comm_forward_kernel_pack = 0;
-  shared_data.cuda_timings.comm_forward_kernel_unpack = 0;
-  // reported by print_timings(), so it has to be reset with the others
-  shared_data.cuda_timings.comm_forward_kernel_self = 0;
-  shared_data.cuda_timings.comm_forward_upload = 0;
-  shared_data.cuda_timings.comm_forward_download = 0;
-
-  shared_data.cuda_timings.comm_exchange_total = 0;
-  shared_data.cuda_timings.comm_exchange_mpi = 0;
-  shared_data.cuda_timings.comm_exchange_kernel_pack = 0;
-  shared_data.cuda_timings.comm_exchange_kernel_unpack = 0;
-  shared_data.cuda_timings.comm_exchange_kernel_fill = 0;
-  shared_data.cuda_timings.comm_exchange_cpu_pack = 0;
-  shared_data.cuda_timings.comm_exchange_upload = 0;
-  shared_data.cuda_timings.comm_exchange_download = 0;
-
-  shared_data.cuda_timings.comm_border_total = 0;
-  shared_data.cuda_timings.comm_border_mpi = 0;
-  shared_data.cuda_timings.comm_border_kernel_pack = 0;
-  shared_data.cuda_timings.comm_border_kernel_unpack = 0;
-  shared_data.cuda_timings.comm_border_kernel_buildlist = 0;
-  shared_data.cuda_timings.comm_border_kernel_self = 0;
-  shared_data.cuda_timings.comm_border_upload = 0;
-  shared_data.cuda_timings.comm_border_download = 0;
-
-  //pair forces
-  shared_data.cuda_timings.pair_xtype_conversion = 0;
-  shared_data.cuda_timings.pair_kernel = 0;
-  shared_data.cuda_timings.pair_virial = 0;
-  shared_data.cuda_timings.pair_force_collection = 0;
-
-  //neighbor
-  shared_data.cuda_timings.neigh_bin = 0;
-  shared_data.cuda_timings.neigh_build = 0;
-  shared_data.cuda_timings.neigh_special = 0;
-
-  //PPPM
-  shared_data.cuda_timings.pppm_particle_map = 0;
-  shared_data.cuda_timings.pppm_make_rho = 0;
-  shared_data.cuda_timings.pppm_brick2fft = 0;
-  shared_data.cuda_timings.pppm_poisson = 0;
-  shared_data.cuda_timings.pppm_fillbrick = 0;
-  shared_data.cuda_timings.pppm_fieldforce = 0;
-  shared_data.cuda_timings.pppm_compute = 0;
-
-  // NOTE(review): the true argument presumably resets the wrapper-side
-  // transfer timers - confirm against the CudaWrapper_Check*Time() definitions
-  CudaWrapper_CheckUploadTime(true);
-  CudaWrapper_CheckDownloadTime(true);
-  CudaWrapper_CheckCPUBufUploadTime(true);
-  CudaWrapper_CheckCPUBufDownloadTime(true);
-}
-
-void Cuda::print_timings()  // print the accumulated CUDA timing counters with printf
-{
-  if(universe->me != 0) return;
-  // only the process with universe->me == 0 gets here; dotiming must be set as well
-  if(not dotiming) return;
-  // transfer times come from the CudaWrapper_Check*Time() calls, the rest from shared_data
-  printf("\n # CUDA: Special timings\n\n");
-  printf("\n Transfer Times\n");
-  printf(" PCIe Upload:  \t %lf s\n", CudaWrapper_CheckUploadTime());
-  printf(" PCIe Download:\t %lf s\n", CudaWrapper_CheckDownloadTime());
-  printf(" CPU Tempbbuf Upload:   \t %lf \n", CudaWrapper_CheckCPUBufUploadTime());
-  printf(" CPU Tempbbuf Download: \t %lf \n", CudaWrapper_CheckCPUBufDownloadTime());
-
-  printf("\n Communication \n");
-
-  printf(" Forward Total           \t %lf \n", shared_data.cuda_timings.comm_forward_total);
-  printf(" Forward MPI Upper Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_upper);
-  printf(" Forward MPI Lower Bound \t %lf \n", shared_data.cuda_timings.comm_forward_mpi_lower);
-  printf(" Forward Kernel Pack     \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_pack);
-  printf(" Forward Kernel Unpack   \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_unpack);
-  printf(" Forward Kernel Self     \t %lf \n", shared_data.cuda_timings.comm_forward_kernel_self);
-  printf(" Forward Upload          \t %lf \n", shared_data.cuda_timings.comm_forward_upload);
-  printf(" Forward Download        \t %lf \n", shared_data.cuda_timings.comm_forward_download);
-  printf(" Forward Overlap Split Ratio\t %lf \n", shared_data.comm.overlap_split_ratio);  // read from comm, not cuda_timings
-  printf("\n");
-
-  printf(" Exchange Total          \t %lf \n", shared_data.cuda_timings.comm_exchange_total);
-  printf(" Exchange MPI            \t %lf \n", shared_data.cuda_timings.comm_exchange_mpi);
-  printf(" Exchange Kernel Pack    \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_pack);
-  printf(" Exchange Kernel Unpack  \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_unpack);
-  printf(" Exchange Kernel Fill    \t %lf \n", shared_data.cuda_timings.comm_exchange_kernel_fill);
-  printf(" Exchange CPU Pack             \t %lf \n", shared_data.cuda_timings.comm_exchange_cpu_pack);
-  printf(" Exchange Upload         \t %lf \n", shared_data.cuda_timings.comm_exchange_upload);
-  printf(" Exchange Download       \t %lf \n", shared_data.cuda_timings.comm_exchange_download);
-  printf("\n");
-
-  printf(" Border Total            \t %lf \n", shared_data.cuda_timings.comm_border_total);
-  printf(" Border MPI              \t %lf \n", shared_data.cuda_timings.comm_border_mpi);
-  printf(" Border Kernel Pack      \t %lf \n", shared_data.cuda_timings.comm_border_kernel_pack);
-  printf(" Border Kernel Unpack    \t %lf \n", shared_data.cuda_timings.comm_border_kernel_unpack);
-  printf(" Border Kernel Self      \t %lf \n", shared_data.cuda_timings.comm_border_kernel_self);
-  printf(" Border Kernel BuildList \t %lf \n", shared_data.cuda_timings.comm_border_kernel_buildlist);
-  printf(" Border Upload           \t %lf \n", shared_data.cuda_timings.comm_border_upload);
-  printf(" Border Download              \t %lf \n", shared_data.cuda_timings.comm_border_download);
-  printf("\n");
-
-  // pair force timings
-  printf(" Pair XType Conversion   \t %lf \n", shared_data.cuda_timings.pair_xtype_conversion);
-  printf(" Pair Kernel             \t %lf \n", shared_data.cuda_timings.pair_kernel);
-  printf(" Pair Virial             \t %lf \n", shared_data.cuda_timings.pair_virial);
-  printf(" Pair Force Collection   \t %lf \n", shared_data.cuda_timings.pair_force_collection);
-  printf("\n");
-
-  // neighbor list timings
-  printf(" Neighbor Binning        \t %lf \n", shared_data.cuda_timings.neigh_bin);
-  printf(" Neighbor Build          \t %lf \n", shared_data.cuda_timings.neigh_build);
-  printf(" Neighbor Special        \t %lf \n", shared_data.cuda_timings.neigh_special);
-  printf("\n");
-
-  // PPPM timings, printed only when force->kspace is set
-  if(force->kspace) {
-    printf(" PPPM Total              \t %lf \n", shared_data.cuda_timings.pppm_compute);
-    printf(" PPPM Particle Map       \t %lf \n", shared_data.cuda_timings.pppm_particle_map);
-    printf(" PPPM Make Rho           \t %lf \n", shared_data.cuda_timings.pppm_make_rho);
-    printf(" PPPM Brick2fft          \t %lf \n", shared_data.cuda_timings.pppm_brick2fft);
-    printf(" PPPM Poisson            \t %lf \n", shared_data.cuda_timings.pppm_poisson);
-    printf(" PPPM Fillbrick          \t %lf \n", shared_data.cuda_timings.pppm_fillbrick);
-    printf(" PPPM Fieldforce         \t %lf \n", shared_data.cuda_timings.pppm_fieldforce);
-    printf("\n");
-  }
-
-  printf(" Debug Test 1            \t %lf \n", shared_data.cuda_timings.test1);
-  printf(" Debug Test 2            \t %lf \n", shared_data.cuda_timings.test2);
-
-  printf("\n");
-}
diff --git a/src/USER-CUDA/cuda_data.h b/src/USER-CUDA/cuda_data.h
deleted file mode 100644
index bb778c12d3..0000000000
--- a/src/USER-CUDA/cuda_data.h
+++ /dev/null
@@ -1,796 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef _CUDA_DATA_H_
-#define _CUDA_DATA_H_
-
-
-enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
-//xx==x in atom_vec x is a member therefore copymode x produces compile errors
-#include "cuda_shared.h"
-#include "cuda_wrapper_cu.h"
-#include "cuda_data_cu.h"
-#include <ctime>
-
-#include <cstdio>
-#include <typeinfo>
-template <typename host_type, typename dev_type, copy_mode mode>
-class cCudaData
-{
-        protected:
-        void** buffer;
-        int* buf_size;
-        host_type* host_data;
-        dev_array* dev_data_array;
-        dev_type* temp_data;
-        unsigned nbytes;
-        bool owns_dev_array;
-        bool current_data_on_device; //this is not yet working as intended and therefore deactivated
-        bool current_data_on_host;
-        bool is_continues;
-        bool pinned;
-
-        public:
-        cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
-        cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y=0, unsigned dim_z=0, bool is_pinned=false);
-        ~cCudaData();
-        void* dev_data() {if(dev_data_array!=NULL) return dev_data_array->dev_data; else return NULL;};
-        void set_dev_data(void* adev_data) {dev_data_array->dev_data=adev_data;};
-        void set_dev_array(dev_array* adev_array) {dev_data_array=adev_array;};
-        void set_host_data(host_type* host_data);
-        void* get_host_data() { return host_data;};
-        void set_buffer(void** buffer,int* buf_size,bool ais_continues);
-        unsigned int* get_dim() {return dev_data_array->dim;};
-        // if you want to upload data to the gpu, which will not change there, then set will_be_changed=false
-        // if you want to upload data to the gpu and update it there, then set will_be_changed=true (default)
-        void upload(bool will_be_changed=true);
-        void uploadAsync(int stream, bool will_be_changed=true );
-        // if you want to download data just to have a look at it, then set will_be_changed=false
-        // if you are going to modify the downloaded data, then set will_be_changed=true (default)
-        void download(bool will_be_changed=true);
-        void downloadAsync(int stream);
-        void memset_device(int value);
-        void device_data_has_changed() {current_data_on_device=false;}
-        void host_data_has_changed() {current_data_on_host=false;}
-        int dev_size() {
-                int size = dev_data_array->dim[0]*sizeof(dev_type);
-                if(dev_data_array->dim[1]) size*=dev_data_array->dim[1];
-                if(dev_data_array->dim[2]) size*=dev_data_array->dim[2];
-                return size;}
-};
-
-
-template <typename host_type, typename dev_type, copy_mode mode>
-cCudaData<host_type, dev_type, mode>
-::cCudaData(host_type* host_data, dev_array* dev_data_array, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
-{
-        pinned=is_pinned;
-        owns_dev_array = false;
-        current_data_on_device = false;
-        current_data_on_host = false;
-        is_continues = false;
-        this->host_data = host_data;
-        this->dev_data_array = dev_data_array;
-        unsigned ndev;
-        if((mode == x)||(mode==xx))
-        {
-                ndev = dim_x;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = 0;
-                dev_data_array->dim[2] = 0;
-        }
-        else if(mode == xy || mode == yx )
-        {
-                ndev = dim_x * dim_y;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = dim_y;
-                dev_data_array->dim[2] = 0;
-        }
-        else
-        {
-                ndev = dim_x * dim_y * dim_z;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = dim_y;
-                dev_data_array->dim[2] = dim_z;
-        }
-        nbytes = ndev * sizeof(dev_type);
-        if(nbytes<=0)
-        {
-                host_data=NULL;
-                temp_data=NULL;
-                dev_data_array->dev_data=NULL;
-                return;
-        }
-
-        dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
-        if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
-        {
-                if(not pinned)
-                temp_data = new dev_type[ndev];
-                else
-                {
-                        temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
-                }
-        }
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-cCudaData<host_type, dev_type, mode>
-::cCudaData(host_type* host_data, unsigned dim_x, unsigned dim_y, unsigned dim_z, bool is_pinned)
-{
-        pinned=is_pinned;
-        this->dev_data_array = new dev_array;
-        this->owns_dev_array = true;
-        current_data_on_device = false;
-        current_data_on_host = false;
-        is_continues = false;
-        this->host_data = host_data;
-        unsigned ndev;
-        if((mode == x)||(mode==xx))
-        {
-                ndev = dim_x;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = 0;
-                dev_data_array->dim[2] = 0;
-        }
-        else if(mode == xy || mode == yx )
-        {
-                ndev = dim_x * dim_y;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = dim_y;
-                dev_data_array->dim[2] = 0;
-        }
-        else
-        {
-                ndev = dim_x * dim_y * dim_z;
-                dev_data_array->dim[0] = dim_x;
-                dev_data_array->dim[1] = dim_y;
-                dev_data_array->dim[2] = dim_z;
-        }
-        nbytes = ndev * sizeof(dev_type);
-        if(nbytes<=0)
-        {
-                host_data=NULL;
-                temp_data=NULL;
-                dev_data_array->dev_data=NULL;
-                return;
-        }
-
-        dev_data_array->dev_data = CudaWrapper_AllocCudaData(nbytes);
-        if(((mode!=x)&&(mode!=xx)) || (typeid(host_type) != typeid(dev_type)))
-        {
-                if(not pinned)
-                temp_data = new dev_type[ndev];
-                else
-                {
-                        temp_data = (dev_type*) CudaWrapper_AllocPinnedHostData(ndev*sizeof(dev_type));
-                }
-        }
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-cCudaData<host_type, dev_type, mode>
-::~cCudaData()
-{
-        if(((mode!=x)&&(mode!=xx)) || typeid(host_type) != typeid(dev_type))
-        {
-                if(not pinned)
-                delete [] temp_data;
-                else
-                {
-                        CudaWrapper_FreePinnedHostData((void*)temp_data);
-                }
-        }
-        if((dev_data_array->dev_data)&&(nbytes>0))
-        CudaWrapper_FreeCudaData(dev_data_array->dev_data,nbytes);
-        if(owns_dev_array) delete dev_data_array;
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::set_host_data(host_type* host_data)
-{
-        this->host_data = host_data;
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::upload(bool will_be_changed)
-{
-        // if current data is already up, do not re-upload it
-//        if(current_data_on_device) return;
-    if(buffer&&is_continues)
-    {
-           printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
-            if(typeid(host_type)==typeid(double))
-            {
-              if(typeid(dev_type)==typeid(double))
-              {
-                      CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-              else if(typeid(dev_type)==typeid(float))
-              {
-                      CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-            else if(typeid(host_type)==typeid(float))
-            {
-              if(typeid(dev_type)==typeid(double))
-              {
-                      CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-              else if(typeid(dev_type)==typeid(float))
-              {
-                      CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-            else if(typeid(host_type)==typeid(int))
-            {
-              if(typeid(dev_type)==typeid(int))
-              {
-                      CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-    }
-        switch(mode)
-        {
-                case x:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
-                        else
-                        {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                          for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                          CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        }
-                        break;
-                }
-
-                case xx:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_UploadCudaData(host_data, dev_data_array->dev_data, nbytes);
-                        else
-                        {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                                CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        }
-                        break;
-                }
-
-                case xy:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        {
-                                dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        break;
-                }
-
-                case yx:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                                {
-                                        temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        break;
-                }
-                case xyz:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
-                                for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                                {
-                                        temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        break;
-                }
-
-                case xzy:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                        {
-                                dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-                        break;
-                }
-        }
-        // we have uploaded the data to the device, i.e.:
-        current_data_on_device = true;
-        // the data is going to change on the device, making the host data out-dated
-        if(will_be_changed) current_data_on_host = false;
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::uploadAsync(int stream,bool will_be_changed)
-{
-        // if current data is already up, do not re-upload it
-//        if(current_data_on_device) return;
-    if(buffer&&is_continues)
-    {
-           printf("Actual Buffer: %p %i\n",*buffer,*buf_size);
-            if(typeid(host_type)==typeid(double))
-            {
-              if(typeid(dev_type)==typeid(double))
-              {
-                      CudaData_Upload_DoubleDouble((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-              else if(typeid(dev_type)==typeid(float))
-              {
-                      CudaData_Upload_DoubleFloat((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-            else if(typeid(host_type)==typeid(float))
-            {
-              if(typeid(dev_type)==typeid(double))
-              {
-                      CudaData_Upload_FloatDouble((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-              else if(typeid(dev_type)==typeid(float))
-              {
-                      CudaData_Upload_FloatFloat((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-            else if(typeid(host_type)==typeid(int))
-            {
-              if(typeid(dev_type)==typeid(int))
-              {
-                      CudaData_Upload_IntInt((void*) host_data,dev_data_array->dev_data,
-                                                                                              dev_data_array->dim,mode,*buffer);
-                        current_data_on_device = true;
-                        if(will_be_changed) current_data_on_host = false;
-                        return;
-              }
-            }
-    }
-        switch(mode)
-        {
-                case x:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
-                        else
-                        {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                          for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                          CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        }
-                        break;
-                }
-
-                case xx:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_UploadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes,stream);
-                        else
-                        {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) temp_data[i] = static_cast<dev_type>(host_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                                CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        }
-                        break;
-                }
-
-                case xy:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        {
-                                dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        temp[j] = static_cast<dev_type>((reinterpret_cast<host_type**>(host_data))[i][j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        break;
-                }
-
-                case yx:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                                {
-                                        temp[i] = static_cast<dev_type>(reinterpret_cast<host_type**>(host_data)[i][j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        break;
-                }
-                case xyz:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[(i*dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
-                                for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                                {
-                                        temp[k] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        break;
-                }
-
-                case xzy:
-                {
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                        {
-                                dev_type* temp = &temp_data[(i*dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        temp[j] = static_cast<dev_type>(reinterpret_cast<host_type***>(host_data)[i][j][k]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufUploadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        CudaWrapper_UploadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes,stream);
-                        break;
-                }
-        }
-        // we have uploaded the data to the device, i.e.:
-        current_data_on_device = true;
-        // the data is going to change on the device, making the host data out-dated
-        if(will_be_changed) current_data_on_host = false;
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::download(bool will_be_changed)
-{
-        // if current data is already down, do not re-download it
-//        if(current_data_on_host) return;
-        switch(mode)
-        {
-                case x:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
-                        else
-                        {
-                                CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        }
-                        break;
-                }
-
-                case xx:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                                CudaWrapper_DownloadCudaData(host_data, dev_data_array->dev_data, nbytes);
-                        else
-                        {
-                                CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        }
-                        break;
-                }
-
-                case xy:
-                {
-                        CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        {
-                                dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        break;
-                }
-
-                case yx:
-                {
-                        CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                                {
-                                        reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        break;
-                }
-
-                case xyz:
-                {
-                        CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[(i * dev_data_array->dim[1]+j)*dev_data_array->dim[2]];
-                                for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                                {
-                                        reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[k]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        break;
-                }
-
-                case xzy:
-                {
-                        CudaWrapper_DownloadCudaData(temp_data, dev_data_array->dev_data, nbytes);
-    timespec time1,time2;
-    my_gettime(CLOCK_REALTIME,&time1);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        for(unsigned k=0; k<dev_data_array->dim[2]; ++k)
-                        {
-                                dev_type* temp = &temp_data[(i * dev_data_array->dim[2]+k)*dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        reinterpret_cast<host_type***>(host_data)[i][j][k] = static_cast<host_type>(temp[j]);
-                                }
-                        }
-        my_gettime(CLOCK_REALTIME,&time2);
-        CudaWrapper_AddCPUBufDownloadTime(
-        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000);
-                        break;
-                }
-        }
-        // we have downloaded the data to the host, i.e.:
-        current_data_on_host = true;
-        // the data is going to change on the host, making the device data out-dated
-        if(will_be_changed) current_data_on_device = false;
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::downloadAsync(int stream)
-{
-        switch(mode)
-        {
-                case x:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                        {
-                                CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
-                                CudaWrapper_SyncStream(stream);
-                        }
-                        else
-                        {
-                                CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
-                                CudaWrapper_SyncStream(stream);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
-                        }
-                        break;
-                }
-
-                case xx:
-                {
-                        if(typeid(host_type) == typeid(dev_type))
-                        {
-                                CudaWrapper_DownloadCudaDataAsync(host_data, dev_data_array->dev_data, nbytes, stream);
-                            CudaWrapper_SyncStream(stream);
-                        }
-                        else
-                        {
-                                CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
-                             CudaWrapper_SyncStream(stream);
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i) host_data[i] = static_cast<host_type>(temp_data[i]);
-                        }
-                        break;
-                }
-
-                case xy:
-                {
-                        CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
-                        CudaWrapper_SyncStream(stream);
-                        for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                        {
-                                dev_type* temp = &temp_data[i * dev_data_array->dim[1]];
-                                for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                                {
-                                        reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[j]);
-                                }
-                        }
-                        break;
-                }
-
-                case yx:
-                {
-                        CudaWrapper_DownloadCudaDataAsync(temp_data, dev_data_array->dev_data, nbytes, stream);
-                        CudaWrapper_SyncStream(stream);
-                        for(unsigned j=0; j<dev_data_array->dim[1]; ++j)
-                        {
-                                dev_type* temp = &temp_data[j*dev_data_array->dim[0]];
-                                for(unsigned i=0; i<dev_data_array->dim[0]; ++i)
-                                {
-                                        reinterpret_cast<host_type**>(host_data)[i][j] = static_cast<host_type>(temp[i]);
-                                }
-                        }
-                        break;
-                }
-        }
-}
-
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::memset_device(int value)
-{
-   CudaWrapper_Memset(dev_data_array->dev_data,value, nbytes);
-}
-
-template <typename host_type, typename dev_type, copy_mode mode>
-void cCudaData<host_type, dev_type, mode>
-::set_buffer(void** abuffer,int* abuf_size,bool ais_continues)
-{
-   buffer = abuffer;
-   buf_size = abuf_size;
-   unsigned nbytes_buf=(nbytes/sizeof(dev_type))*sizeof(host_type);
-   if(buffer!=NULL)
-   if(not((typeid(host_type) == typeid(dev_type))&&(mode == x || mode == xx)))
-   {
-           printf("Allocate Buffer: %p %i\n",*buffer,*buf_size);
-            if(((*buffer)!=NULL)&&(*buf_size<nbytes_buf))
-            CudaWrapper_FreeCudaData(*buffer,*buf_size);
-            if(*buf_size<nbytes_buf)
-            {*buffer=CudaWrapper_AllocCudaData(nbytes_buf);*buf_size=nbytes_buf;}
-           printf("Allocate Buffer2: %p %i\n",*buffer,*buf_size);
-
-   }
-   is_continues=ais_continues;
-}
-#endif // _CUDA_DATA_H_
diff --git a/src/USER-CUDA/cuda_modify_flags.h b/src/USER-CUDA/cuda_modify_flags.h
deleted file mode 100644
index e683456d27..0000000000
--- a/src/USER-CUDA/cuda_modify_flags.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef CUDA_MODIFY_FLAGS_H
-#define CUDA_MODIFY_FLAGS_H
-
-#include "fix.h"
-
-namespace LAMMPS_NS {
-namespace FixConstCuda {
-  static const int INITIAL_INTEGRATE_CUDA = FixConst::FIX_CONST_LAST << 0;
-  static const int POST_INTEGRATE_CUDA    = FixConst::FIX_CONST_LAST << 1;
-  static const int PRE_EXCHANGE_CUDA      = FixConst::FIX_CONST_LAST << 2;
-  static const int PRE_NEIGHBOR_CUDA      = FixConst::FIX_CONST_LAST << 3;
-  static const int PRE_FORCE_CUDA         = FixConst::FIX_CONST_LAST << 4;
-  static const int POST_FORCE_CUDA        = FixConst::FIX_CONST_LAST << 5;
-  static const int FINAL_INTEGRATE_CUDA   = FixConst::FIX_CONST_LAST << 6;
-  static const int END_OF_STEP_CUDA       = FixConst::FIX_CONST_LAST << 7;
-  static const int THERMO_ENERGY_CUDA     = FixConst::FIX_CONST_LAST << 8;
-  static const int MIN_POST_FORCE_CUDA    = FixConst::FIX_CONST_LAST << 9;
-}
-}
-// remember not to shift over 31 bits
-
-#endif // CUDA_MODIFY_FLAGS_H
diff --git a/src/USER-CUDA/cuda_neigh_list.cpp b/src/USER-CUDA/cuda_neigh_list.cpp
deleted file mode 100644
index 6e05cee274..0000000000
--- a/src/USER-CUDA/cuda_neigh_list.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include "cuda_neigh_list.h"
-#include "neigh_list.h"
-#include <cstring>
-#include <vector>
-#include <map>
-#include <algorithm>
-#include "user_cuda.h"
-#include "atom.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
-{
-        cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
-        this->neigh_list = neigh_list;
-        neigh_list->cuda_list=this;
-        sneighlist.maxlocal = neigh_list->get_maxlocal();
-        sneighlist.maxneighbors = 32;
-        sneighlist.maxcut = 0.0;
-        sneighlist.cutneighsq = NULL;
-        cu_neighbors = NULL;
-        cu_neighbors_border = NULL;
-        cu_neighbors_inner = NULL;
-        cu_numneigh_border = NULL;
-        cu_numneigh_inner = NULL;
-        cu_numneigh = NULL;
-        cu_ilist = NULL;
-        cu_ilist_border = NULL;
-        cu_inum_border = NULL;
-        inum_border = 0;
-        neighbors = NULL;
-        neighbors_inner = NULL;
-        neighbors_border = NULL;
-        numneigh_border = NULL;
-        numneigh_inner = NULL;
-        ilist_border = NULL;
-
-        build_cuda = false;
-        sneighlist.binned_id=NULL;
-        sneighlist.bin_dim=new int[3];
-        sneighlist.bin_dim[0]=0;
-        sneighlist.bin_dim[1]=0;
-        sneighlist.bin_dim[2]=0;
-
-        cu_ex_type = NULL;
-        cu_ex1_bit = NULL;
-        cu_ex2_bit = NULL;
-        cu_ex_mol_bit = NULL;
-        sneighlist.nex_type=0;
-        sneighlist.nex_group=0;
-        sneighlist.nex_mol=0;
-
-        sneighlist.bin_nmax=0;
-        sneighlist.bin_extraspace=0.05;
-        MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... end\n");)
-
-}
-
-CudaNeighList::~CudaNeighList()
-{
-        dev_free();
-}
-
-void CudaNeighList::dev_alloc()
-{
-        MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... start\n"); )
-        cu_ilist         = new cCudaData<int , int , x> (neigh_list->ilist   , & sneighlist.ilist     , sneighlist.maxlocal );
-        cu_numneigh      = new cCudaData<int , int , x> (neigh_list->numneigh, & sneighlist.numneigh  , sneighlist.maxlocal );
-        neighbors = new int[atom->nmax*sneighlist.maxneighbors];
-        cu_neighbors= new cCudaData<int, int, x> (neighbors                                          , & sneighlist.neighbors, atom->nmax*sneighlist.maxneighbors );
-
-        if(cuda->shared_data.overlap_comm)
-        {
-        ilist_border  = new int[sneighlist.maxlocal];
-        numneigh_border        = new int[sneighlist.maxlocal];
-        numneigh_inner        = new int[sneighlist.maxlocal];
-        cu_inum_border  = new cCudaData<int , int , x> (&inum_border                 , & sneighlist.inum_border      , 1 );
-        cu_ilist_border  = new cCudaData<int , int , x> (ilist_border                 , & sneighlist.ilist_border     , sneighlist.maxlocal );
-        cu_numneigh_border        = new cCudaData<int , int , x> (numneigh_border  , & sneighlist.numneigh_border  , sneighlist.maxlocal );
-        cu_numneigh_inner         = new cCudaData<int , int , x> (numneigh_inner   , & sneighlist.numneigh_inner   , sneighlist.maxlocal );
-        neighbors_border = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
-        cu_neighbors_border= new cCudaData<int, int, x> (neighbors_border         , & sneighlist.neighbors_border, sneighlist.maxlocal*sneighlist.maxneighbors );
-        neighbors_inner = new int[sneighlist.maxlocal*sneighlist.maxneighbors];
-        cu_neighbors_inner = new cCudaData<int, int, x> (neighbors_inner         , & sneighlist.neighbors_inner , sneighlist.maxlocal*sneighlist.maxneighbors );
-        }
-        cuda->shared_data.atom.update_neigh=2;
-        MYDBG( printf("# CUDA: CudaNeighList::dev_alloc() ... end\n"); )
-}
-
-void CudaNeighList::dev_free()
-{
-        MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... start\n"); )
-        delete cu_numneigh;
-        delete cu_ilist;
-        delete [] neighbors;
-        delete cu_neighbors;
-
-        if(cuda->shared_data.overlap_comm)
-        {
-        delete [] ilist_border;
-        delete [] numneigh_border;
-        delete [] numneigh_inner;
-        delete [] neighbors_border;
-        delete [] neighbors_inner;
-        delete cu_inum_border;
-        delete cu_neighbors_border;
-        delete cu_neighbors_inner;
-        delete cu_numneigh_border;
-        delete cu_numneigh_inner;
-        delete cu_ilist_border;
-        }
-        MYDBG( printf("# CUDA: CudaNeighList::dev_free() ... end\n"); )
-}
-
-void CudaNeighList::grow_device()
-{
-        MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... start\n");)
-        // if host has allocated more memory for atom arrays than device has, then allocate more memory on device
-        int new_maxlocal = neigh_list->get_maxlocal();
-        if(sneighlist.maxlocal < new_maxlocal)
-        {
-                sneighlist.maxlocal = new_maxlocal;
-                dev_free();
-                dev_alloc();
-        }
-
-        if(!cu_ilist || !cu_numneigh) dev_alloc();
-
-        // check, if hosts data has been allocated somewhere else
-        if(cu_ilist   ->get_host_data() != neigh_list->ilist)    cu_ilist   ->set_host_data(neigh_list->ilist);
-        if(cu_numneigh->get_host_data() != neigh_list->numneigh) cu_numneigh->set_host_data(neigh_list->numneigh);
-
-        MYDBG(printf("# CUDA: CudaNeighList::grow_device() ... end\n");)
-}
-
-
-void CudaNeighList::nl_upload(bool will_be_changed)
-{
-        //return;
-        MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... start\n");)
-        if(cu_ilist)
-        cu_ilist->upload();
-        if(cu_numneigh)
-        cu_numneigh->upload();
-        MYDBG(printf("# CUDA: CudaNeighList::nl_upload() ... end\n");)
-}
-
-void CudaNeighList::nl_download(bool will_be_changed)
-{
-        MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... start\n");)
-        if(cu_ilist)
-        cu_ilist->download();
-        if(cu_numneigh)
-        cu_numneigh->download();
-        MYDBG(printf("# CUDA: CudaNeighList::nl_download() ... end\n");)
-}
diff --git a/src/USER-CUDA/cuda_neigh_list.h b/src/USER-CUDA/cuda_neigh_list.h
deleted file mode 100644
index f733cdfd61..0000000000
--- a/src/USER-CUDA/cuda_neigh_list.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_NEIGH_LIST_CUDA_H
-#define LMP_NEIGH_LIST_CUDA_H
-
-#include "pointers.h"
-#include "cuda_data.h"
-#include "neigh_list.h"
-
-namespace LAMMPS_NS
-{
-
-class CudaNeighList : protected Pointers
-{
-        public:
-                cCudaData<int , int , x>*  cu_ilist;
-                cCudaData<int , int , x>*  cu_numneigh;
-                cCudaData<int , int , x>*  cu_inum_border;
-                cCudaData<int , int , x>*  cu_ilist_border;
-                cCudaData<int , int , x>*  cu_numneigh_border;
-                cCudaData<int , int , x>*  cu_numneigh_inner;
-                cCudaData<int , int , x>*  cu_neighbors;
-                cCudaData<int , int , x>*  cu_neighbors_border;
-                cCudaData<int , int , x>*  cu_neighbors_inner;
-                cCudaData<int , int , x>*  cu_ex_type;
-                cCudaData<int , int , x>*  cu_ex1_bit;
-                cCudaData<int , int , x>*  cu_ex2_bit;
-                cCudaData<int , int , x>*  cu_ex_mol_bit;
-
-
-                cuda_shared_neighlist sneighlist;
-
-                int* neighbors;
-                int* neighbors_inner;
-                int* neighbors_border;
-                int inum_border;
-                int* ilist_border;
-                int* numneigh_border;
-                int* numneigh_inner;
-                int nex_type;
-                int nex_group;
-                int nex_mol;
-
-                bool build_cuda;
-
-                CudaNeighList(class LAMMPS *, class NeighList* neigh_list);
-                ~CudaNeighList();
-                void grow_device(); // will grow pages memory on device, keeping old pages. will grow lists memory on device, deleting old lists
-                void nl_upload(bool will_be_changed=true);
-                void nl_download(bool will_be_changed=true);
-                NeighList* neigh_list;
-
-                void dev_alloc();
-                void dev_free();
-
- private:
-  class Cuda *cuda;
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/domain_cuda.cpp b/src/USER-CUDA/domain_cuda.cpp
deleted file mode 100644
index 997a42a681..0000000000
--- a/src/USER-CUDA/domain_cuda.cpp
+++ /dev/null
@@ -1,345 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author (triclinic) : Pieter in 't Veld (SNL)
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <math.h>
-#include "domain_cuda.h"
-#include "style_region.h"
-#include "atom.h"
-#include "force.h"
-#include "update.h"
-#include "modify.h"
-#include "fix.h"
-#include "fix_deform.h"
-#include "region.h"
-#include "lattice.h"
-#include "comm.h"
-#include "memory.h"
-#include "error.h"
-
-#include "user_cuda.h"
-#include "domain_cu.h"
-
-using namespace LAMMPS_NS;
-
-#define BIG   1.0e20
-#define SMALL 1.0e-4
-#define DELTA 1
-
-enum {NO_REMAP, X_REMAP, V_REMAP};                // same as fix_deform.cpp
-
-/* ----------------------------------------------------------------------
-   default is periodic
-------------------------------------------------------------------------- */
-
-DomainCuda::DomainCuda(LAMMPS* lmp) : Domain(lmp)
-{
-  cuda = lmp->cuda;
-
-  if(cuda == NULL)
-    error->all(FLERR, "You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-}
-
-/* ---------------------------------------------------------------------- */
-
-void DomainCuda::init()
-{
-  Domain::init();
-
-  if(not cuda->finished_run) {
-    cuda->setDomainParams();
-    Cuda_Domain_Init(&cuda->shared_data);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set global box params
-   assumes boxlo/hi and triclinic tilts are already set
-------------------------------------------------------------------------- */
-
-void DomainCuda::set_global_box()
-{
-  // one-time activation of CUDA
-  // do it here, b/c is now too late for further package commands
-  // activation must occur before any USER-CUDA class communicates with GPUs
-
-  cuda->activate();
-
-  Domain::set_global_box();
-
-  if(not cuda->finished_run) {
-    cuda->setDomainParams();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set lamda box params, only need be done one time
-   assumes global box is defined and proc assignment has been made by comm
-   for uppermost proc, insure subhi = 1.0 (in case round-off occurs)
-------------------------------------------------------------------------- */
-
-void DomainCuda::set_lamda_box()
-{
-  Domain::set_lamda_box();
-
-  if(not cuda->finished_run) {
-    cuda->setDomainParams();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set local subbox params
-   assumes global box is defined and proc assignment has been made
-   for uppermost proc, insure subhi = boxhi (in case round-off occurs)
-------------------------------------------------------------------------- */
-
-void DomainCuda::set_local_box()
-{
-  Domain::set_local_box();
-
-  if(not cuda->finished_run) {
-    // cuda->setDomainParams();
-    //Cuda_Domain_Init(&cuda->shared_data);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   reset global & local boxes due to global box boundary changes
-   if shrink-wrapped, determine atom extent and reset boxlo/hi
-   if shrink-wrapped and triclinic, perform shrink-wrap in box coords
-------------------------------------------------------------------------- */
-
-void DomainCuda::reset_box()
-{
-  if(nonperiodic == 2) {
-
-    // convert back to box coords for shrink-wrap operation
-
-    if(triclinic) lamda2x(atom->nlocal);
-
-    // compute extent of atoms on this proc
-
-    double extent[3][2], all[3][2];
-
-    extent[2][0] = extent[1][0] = extent[0][0] = BIG;
-    extent[2][1] = extent[1][1] = extent[0][1] = -BIG;
-
-    double** x = atom->x;
-    int nlocal = atom->nlocal;
-
-    if(cuda->finished_setup && (!cuda->oncpu)) {
-      extent[0][0] = cuda->extent[0];
-      extent[0][1] = cuda->extent[1];
-      extent[1][0] = cuda->extent[2];
-      extent[1][1] = cuda->extent[3];
-      extent[2][0] = cuda->extent[4];
-      extent[2][1] = cuda->extent[5];
-    } else
-      for(int i = 0; i < nlocal; i++) {
-        extent[0][0] = MIN(extent[0][0], x[i][0]);
-        extent[0][1] = MAX(extent[0][1], x[i][0]);
-        extent[1][0] = MIN(extent[1][0], x[i][1]);
-        extent[1][1] = MAX(extent[1][1], x[i][1]);
-        extent[2][0] = MIN(extent[2][0], x[i][2]);
-        extent[2][1] = MAX(extent[2][1], x[i][2]);
-      }
-
-    // compute extent across all procs
-    // flip sign of MIN to do it in one Allreduce MAX
-
-    extent[0][0] = -extent[0][0];
-    extent[1][0] = -extent[1][0];
-    extent[2][0] = -extent[2][0];
-
-    MPI_Allreduce(extent, all, 6, MPI_DOUBLE, MPI_MAX, world);
-
-    // in shrink-wrapped dims, set box by atom extent
-    // if minimum set, enforce min box size settings
-
-    if(triclinic == 0) {
-      if(xperiodic == 0) {
-        if(boundary[0][0] == 2) boxlo[0] = -all[0][0] - small[0];
-        else if(boundary[0][0] == 3)
-          boxlo[0] = MIN(-all[0][0] - small[0], minxlo);
-
-        if(boundary[0][1] == 2) boxhi[0] = all[0][1] + small[0];
-        else if(boundary[0][1] == 3) boxhi[0] = MAX(all[0][1] + small[0], minxhi);
-
-        if(boxlo[0] > boxhi[0]) error->all(FLERR, "Illegal simulation box");
-      }
-
-      if(yperiodic == 0) {
-        if(boundary[1][0] == 2) boxlo[1] = -all[1][0] - small[1];
-        else if(boundary[1][0] == 3)
-          boxlo[1] = MIN(-all[1][0] - small[1], minylo);
-
-        if(boundary[1][1] == 2) boxhi[1] = all[1][1] + small[1];
-        else if(boundary[1][1] == 3) boxhi[1] = MAX(all[1][1] + small[1], minyhi);
-
-        if(boxlo[1] > boxhi[1]) error->all(FLERR, "Illegal simulation box");
-      }
-
-      if(zperiodic == 0) {
-        if(boundary[2][0] == 2) boxlo[2] = -all[2][0] - small[2];
-        else if(boundary[2][0] == 3)
-          boxlo[2] = MIN(-all[2][0] - small[2], minzlo);
-
-        if(boundary[2][1] == 2) boxhi[2] = all[2][1] + small[2];
-        else if(boundary[2][1] == 3) boxhi[2] = MAX(all[2][1] + small[2], minzhi);
-
-        if(boxlo[2] > boxhi[2]) error->all(FLERR, "Illegal simulation box");
-      }
-
-    } else {
-      double lo[3], hi[3];
-
-      if(xperiodic == 0) {
-        lo[0] = -all[0][0];
-        lo[1] = 0.0;
-        lo[2] = 0.0;
-        Domain::lamda2x(lo, lo);
-        hi[0] = all[0][1];
-        hi[1] = 0.0;
-        hi[2] = 0.0;
-        Domain::lamda2x(hi, hi);
-
-        if(boundary[0][0] == 2) boxlo[0] = lo[0] - small[0];
-        else if(boundary[0][0] == 3) boxlo[0] = MIN(lo[0] - small[0], minxlo);
-
-        if(boundary[0][1] == 2) boxhi[0] = hi[0] + small[0];
-        else if(boundary[0][1] == 3) boxhi[0] = MAX(hi[0] + small[0], minxhi);
-
-        if(boxlo[0] > boxhi[0]) error->all(FLERR, "Illegal simulation box");
-      }
-
-      if(yperiodic == 0) {
-        lo[0] = 0.0;
-        lo[1] = -all[1][0];
-        lo[2] = 0.0;
-        Domain::lamda2x(lo, lo);
-        hi[0] = 0.0;
-        hi[1] = all[1][1];
-        hi[2] = 0.0;
-        Domain::lamda2x(hi, hi);
-
-        if(boundary[1][0] == 2) boxlo[1] = lo[1] - small[1];
-        else if(boundary[1][0] == 3) boxlo[1] = MIN(lo[1] - small[1], minylo);
-
-        if(boundary[1][1] == 2) boxhi[1] = hi[1] + small[1];
-        else if(boundary[1][1] == 3) boxhi[1] = MAX(hi[1] + small[1], minyhi);
-
-        if(boxlo[1] > boxhi[1]) error->all(FLERR, "Illegal simulation box");
-
-        //xy *= (boxhi[1]-boxlo[1]) / yprd;
-      }
-
-      if(zperiodic == 0) {
-        lo[0] = 0.0;
-        lo[1] = 0.0;
-        lo[2] = -all[2][0];
-        Domain::lamda2x(lo, lo);
-        hi[0] = 0.0;
-        hi[1] = 0.0;
-        hi[2] = all[2][1];
-        Domain::lamda2x(hi, hi);
-
-        if(boundary[2][0] == 2) boxlo[2] = lo[2] - small[2];
-        else if(boundary[2][0] == 3) boxlo[2] = MIN(lo[2] - small[2], minzlo);
-
-        if(boundary[2][1] == 2) boxhi[2] = hi[2] + small[2];
-        else if(boundary[2][1] == 3) boxhi[2] = MAX(hi[2] + small[2], minzhi);
-
-        if(boxlo[2] > boxhi[2]) error->all(FLERR, "Illegal simulation box");
-
-        //xz *= (boxhi[2]-boxlo[2]) / xprd;
-        //yz *= (boxhi[2]-boxlo[2]) / yprd;
-      }
-    }
-  }
-
-  set_global_box();
-  set_local_box();
-
-  if(not cuda->finished_run) {
-    cuda->setDomainParams();
-    Cuda_Domain_Init(&cuda->shared_data);
-  }
-
-  // if shrink-wrapped, convert to lamda coords for new box
-  // must re-invoke pbc() b/c x2lamda result can be outside 0,1 due to roundoff
-
-  if(nonperiodic == 2 && triclinic) {
-    x2lamda(atom->nlocal);
-    pbc();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   enforce PBC and modify box image flags for each atom
-   called every reneighboring and by other commands that change atoms
-   resulting coord must satisfy lo <= coord < hi
-   MAX is important since coord - prd < lo can happen when coord = hi
-   if fix deform, remap velocity of fix group atoms by box edge velocities
-   for triclinic, atoms must be in lamda coords (0-1) before pbc is called
-   image = 10 bits for each dimension
-   increment/decrement in wrap-around fashion
-------------------------------------------------------------------------- */
-
-void DomainCuda::pbc()
-{
-  if(cuda->finished_setup && (!cuda->oncpu)) {
-    cuda->setDomainParams();
-    Cuda_Domain_PBC(&cuda->shared_data, deform_vremap, deform_groupbit, cuda->extent);
-    return;
-  }
-
-  Domain::pbc();
-}
-
-
-/* ----------------------------------------------------------------------
-   convert triclinic 0-1 lamda coords to box coords for all N atoms
-   x = H lamda + x0;
-------------------------------------------------------------------------- */
-
-void DomainCuda::lamda2x(int n)
-{
-  if(cuda->finished_setup && (!cuda->oncpu)) {
-    Cuda_Domain_lamda2x(&cuda->shared_data, n);
-    return;
-  }
-
-  Domain::lamda2x(n);
-}
-
-/* ----------------------------------------------------------------------
-   convert box coords to triclinic 0-1 lamda coords for all N atoms
-   lamda = H^-1 (x - x0)
-------------------------------------------------------------------------- */
-
-void DomainCuda::x2lamda(int n)
-{
-  if(cuda->finished_setup && (!cuda->oncpu)) {
-    Cuda_Domain_x2lamda(&cuda->shared_data, n);
-    return;
-  }
-
-  Domain::x2lamda(n);
-}
diff --git a/src/USER-CUDA/domain_cuda.h b/src/USER-CUDA/domain_cuda.h
deleted file mode 100644
index ede402ffba..0000000000
--- a/src/USER-CUDA/domain_cuda.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_DOMAIN_CUDA_H
-#define LMP_DOMAIN_CUDA_H
-
-#include "pointers.h"
-#include "domain.h"
-
-namespace LAMMPS_NS {
-
-class DomainCuda : public Domain {
- public:
-  DomainCuda(class LAMMPS *);
-  void init();
-  void set_global_box();
-  void set_lamda_box();
-  void set_local_box();
-  void reset_box();
-  void pbc();
-
-  virtual void lamda2x(int);
-  virtual void x2lamda(int);
-
- protected:
-  class Cuda *cuda;
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/fft3d_cuda.cpp b/src/USER-CUDA/fft3d_cuda.cpp
deleted file mode 100644
index 42ec4b16ee..0000000000
--- a/src/USER-CUDA/fft3d_cuda.cpp
+++ /dev/null
@@ -1,609 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include "fft3d_cuda.h"
-#include "fft3d_cuda_cu.h"
-#include "remap.h"
-#include <ctime>
-#include "cuda_wrapper_cu.h"
-
-#ifdef FFT_CUFFT
-#endif
-
-#define MIN(A,B) ((A) < (B)) ? (A) : (B)
-#define MAX(A,B) ((A) > (B)) ? (A) : (B)
-
-/* ----------------------------------------------------------------------
-   Data layout for 3d FFTs:
-
-   data set of Nfast x Nmid x Nslow elements is owned by P procs
-   on input, each proc owns a subsection of the elements
-   on output, each proc will own a (possibly different) subsection
-   my subsection must not overlap with any other proc's subsection,
-     i.e. the union of all proc's input (or output) subsections must
-     exactly tile the global Nfast x Nmid x Nslow data set
-   when called from C, all subsection indices are
-     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
-   when called from F77, all subsection indices are
-     F77-style from 1 to N where N = Nfast or Nmid or Nslow
-   a proc can own 0 elements on input or output
-     by specifying hi index < lo index
-   on both input and output, data is stored contiguously on a processor
-     with a fast-varying, mid-varying, and slow-varying index
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Perform 3d FFT
-
-   Arguments:
-   in           starting address of input data on this proc
-   out          starting address of where output data for this proc
-                  will be placed (can be same as in)
-   flag         1 for forward FFT, -1 for inverse FFT
-   plan         plan returned by previous call to fft_3d_create_plan
-------------------------------------------------------------------------- */
-
-void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
-{
-#ifdef FFT_CUFFT
-  plan->iterate++;
-  my_times starttime,starttime2;
-  my_times endtime,endtime2;
-
-  int i,total,length,offset,num;
-  double norm;
-  FFT_DATA *data,*copy;
-  // system specific constants
-
-
-  // pre-remap to prepare for 1st FFTs if needed
-  // copy = loc for remap result
-  int nprocs=plan->nprocs;
-if(nprocs>1)
-{
-  if(plan->init)
-  my_gettime(CLOCK_REALTIME,&starttime);
-  if (plan->pre_plan) {
-    if (plan->pre_target == 0) copy = out;
-    else copy = plan->copy;
-    if(plan->init) remap_3d((double *) in, (double *) out, (double *) plan->scratch,plan->pre_plan);
-    data = out;
-  }
-  else
-    data = in;
-}
-  cufftResult retvalc;
-  if(plan->init)
-  {
-        if(nprocs>1)
-        {
-      if(sizeof(FFT_CFLOAT)==sizeof(double))cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize/2,cudaMemcpyHostToDevice);
-      if(sizeof(FFT_CFLOAT)==sizeof(float)) cudaMemcpy((void*) (plan->cudata2), (void*) data, plan->cudatasize,cudaMemcpyHostToDevice);
-      initfftdata((double*)plan->cudata2,(FFT_CFLOAT*)plan->cudata,plan->nfast,plan->nmid,plan->nslow);
-    }
-  }
-    if (flag == -1)
-    {
-      retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_FORWARD);
-    }
-    else
-    {
-      retvalc=cufft(plan->plan_3d, plan->cudata, plan->cudata2,CUFFT_INVERSE);
-    }
-    if(retvalc!=CUFFT_SUCCESS) {printf("ErrorCUFFT: %i\n",retvalc);exit(EXIT_FAILURE);}
-
-    FFTsyncthreads();
-#endif
-}
-/* ----------------------------------------------------------------------
-   Create plan for performing a 3d FFT
-
-   Arguments:
-   comm                 MPI communicator for the P procs which own the data
-   nfast,nmid,nslow     size of global 3d matrix
-   in_ilo,in_ihi        input bounds of data I own in fast index
-   in_jlo,in_jhi        input bounds of data I own in mid index
-   in_klo,in_khi        input bounds of data I own in slow index
-   out_ilo,out_ihi      output bounds of data I own in fast index
-   out_jlo,out_jhi      output bounds of data I own in mid index
-   out_klo,out_khi      output bounds of data I own in slow index
-   scaled               0 = no scaling of result, 1 = scaling
-   permute              permutation in storage order of indices on output
-                          0 = no permutation
-                          1 = permute once = mid->fast, slow->mid, fast->slow
-                          2 = permute twice = slow->fast, fast->mid, mid->slow
-   nbuf                 returns size of internal storage buffers used by FFT
-------------------------------------------------------------------------- */
-
-struct fft_plan_3d *fft_3d_create_plan_cuda(
-       MPI_Comm comm, int nfast, int nmid, int nslow,
-       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
-       int in_klo, int in_khi,
-       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
-       int out_klo, int out_khi,
-       int scaled, int permute, int *nbuf,bool ainit)
-{
-#ifdef FFT_CUFFT
-  struct fft_plan_3d *plan;
-  int me,nprocs;
-  int i,num,flag,remapflag,fftflag;
-  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
-  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
-  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
-  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
-  int np1,np2,ip1,ip2;
-  int list[50];
-
-  // system specific variables
-
-  // query MPI info
-
-  MPI_Comm_rank(comm,&me);
-  MPI_Comm_size(comm,&nprocs);
-
-#ifndef FFT_CUFFT
-    error->all(FLERR,"ERROR: Trying to use cuda fft without FFT_CUFFT set. Recompile with make option 'cufft=1'.");
-#endif
-  // compute division of procs in 2 dimensions not on-processor
-  bifactor_cuda(nprocs,&np1,&np2);
-  ip1 = me % np1;
-  ip2 = me/np1;
-
-  // in case of CUDA FFT every proc does the full FFT in order to avoid data transfers (the problem is other wise heavily bandwidth limited)
-
-  int ip1out = ip1;
-  int ip2out = ip2;
-  int np1out = np1;
-  int np2out = np2;
-
-  ip1 = 0;
-  ip2 = 0;
-  np1 = 1;
-  np2 = 1;
-
-  // allocate memory for plan data struct
-
-  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
-  if (plan == NULL) return NULL;
-  plan->init=ainit;
-
-  // remap from initial distribution to layout needed for 1st set of 1d FFTs
-  // not needed if all procs own entire fast axis initially
-  // first indices = distribution after 1st set of FFTs
-
-  if (in_ilo == 0 && in_ihi == nfast-1)
-    flag = 0;
-  else
-    flag = 1;
-
-  if(nprocs>1)flag=1;
-
-  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
-
-  if (remapflag == 0) {
-    first_ilo = in_ilo;
-    first_ihi = in_ihi;
-    first_jlo = in_jlo;
-    first_jhi = in_jhi;
-    first_klo = in_klo;
-    first_khi = in_khi;
-    plan->pre_plan = NULL;
-  }
-  else {
-    first_ilo = 0;
-    first_ihi = nfast - 1;
-    first_jlo = ip1*nmid/np1;
-    first_jhi = (ip1+1)*nmid/np1 - 1;
-    first_klo = ip2*nslow/np2;
-    first_khi = (ip2+1)*nslow/np2 - 1;
-    int members=2;
-    if(plan->init) members=1;
-    plan->pre_plan =
-      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
-                           first_ilo,first_ihi,first_jlo,first_jhi,
-                           first_klo,first_khi,
-                           members,0,0,2,0);
-    if (plan->pre_plan == NULL) return NULL;
-  }
-
-  // 1d FFTs along fast axis
-
-  plan->length1 = nfast;
-  plan->total1 = nfast * nmid * nslow;
-
-  // remap from 1st to 2nd FFT
-  // choose which axis is split over np1 vs np2 to minimize communication
-  // second indices = distribution after 2nd set of FFTs
-
-  second_ilo = ip1*nfast/np1;
-  second_ihi = (ip1+1)*nfast/np1 - 1;
-  second_jlo = 0;
-  second_jhi = nmid - 1;
-  second_klo = ip2*nslow/np2;
-  second_khi = (ip2+1)*nslow/np2 - 1;
-  plan->mid1_plan =
-      remap_3d_create_plan(comm,
-                           first_ilo,first_ihi,first_jlo,first_jhi,
-                           first_klo,first_khi,
-                           second_ilo,second_ihi,second_jlo,second_jhi,
-                           second_klo,second_khi,
-                           2,1,0,2,0);
-  if (plan->mid1_plan == NULL) return NULL;
-
-  // 1d FFTs along mid axis
-
-  plan->length2 = nmid;
-  plan->total2 = nfast * nmid * nslow;
-
-  // remap from 2nd to 3rd FFT
-  // if final distribution is permute=2 with all procs owning entire slow axis
-  //   then this remapping goes directly to final distribution
-  //  third indices = distribution after 3rd set of FFTs
-
-  flag=1;
-
-  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
-
-  if (remapflag == 0) {
-    third_ilo = out_ilo;
-    third_ihi = out_ihi;
-    third_jlo = out_jlo;
-    third_jhi = out_jhi;
-    third_klo = out_klo;
-    third_khi = out_khi;
-  }
-  else {
-    third_ilo = ip1*nfast/np1;
-    third_ihi = (ip1+1)*nfast/np1 - 1;
-    third_jlo = ip2*nmid/np2;
-    third_jhi = (ip2+1)*nmid/np2 - 1;
-    third_klo = 0;
-    third_khi = nslow - 1;
-  }
-
-  plan->mid2_plan =
-    remap_3d_create_plan(comm,
-                         second_jlo,second_jhi,second_klo,second_khi,
-                         second_ilo,second_ihi,
-                         third_jlo,third_jhi,third_klo,third_khi,
-                         third_ilo,third_ihi,
-                         2,1,0,2,0);
-  if (plan->mid2_plan == NULL) return NULL;
-
-  // 1d FFTs along slow axis
-
-  plan->length3 = nslow;
-  plan->total3 = nfast * nmid * nslow;
-
-  // remap from 3rd FFT to final distribution
-  //  not needed if permute = 2 and third indices = out indices on all procs
-
-  flag=1;
-
-  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
-
-  if (remapflag == 0)
-    plan->post_plan = NULL;
-  else {
-    plan->post_plan =
-      remap_3d_create_plan(comm,
-                           third_klo,third_khi,third_ilo,third_ihi,
-                           third_jlo,third_jhi,
-                           out_klo,out_khi,out_ilo,out_ihi,
-                           out_jlo,out_jhi,
-                           2,(permute+1)%3,0,2,0);
-    if (plan->post_plan == NULL) return NULL;
-  }
-
-  // configure plan memory pointers and allocate work space
-  // out_size = amount of memory given to FFT by user
-  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
-  // copy_size = amount needed internally for extra copy of data
-  // scratch_size = amount needed internally for remap scratch space
-  // for each remap:
-  //   out space used for result if big enough, else require copy buffer
-  //   accumulate largest required remap scratch space
-
-  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
-  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
-    (first_khi-first_klo+1);
-  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
-    (second_khi-second_klo+1);
-  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
-    (third_khi-third_klo+1);
-
-  plan->ihi_out=out_ihi;
-  plan->ilo_out=out_ilo;
-  plan->jhi_out=out_jhi;
-  plan->jlo_out=out_jlo;
-  plan->khi_out=out_khi;
-  plan->klo_out=out_klo;
-
-  copy_size = 0;
-  scratch_size = 0;
-
-  if (plan->pre_plan) {
-    if (first_size <= out_size)
-      plan->pre_target = 0;
-    else {
-      plan->pre_target = 1;
-      copy_size = MAX(copy_size,first_size);
-    }
-    scratch_size = MAX(scratch_size,first_size);
-  }
-
-  if (plan->mid1_plan) {
-    if (second_size <= out_size)
-      plan->mid1_target = 0;
-    else {
-      plan->mid1_target = 1;
-      copy_size = MAX(copy_size,second_size);
-    }
-    scratch_size = MAX(scratch_size,second_size);
-  }
-
-  if (plan->mid2_plan) {
-    if (third_size <= out_size)
-      plan->mid2_target = 0;
-    else {
-      plan->mid2_target = 1;
-      copy_size = MAX(copy_size,third_size);
-    }
-    scratch_size = MAX(scratch_size,third_size);
-  }
-
-  if (plan->post_plan)
-    scratch_size = MAX(scratch_size,out_size);
-
-  *nbuf = copy_size + scratch_size;
-
-  if (copy_size) {
-    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
-    if (plan->copy == NULL) return NULL;
-  }
-  else plan->copy = NULL;
-
-  if (scratch_size) {
-    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
-    if (plan->scratch == NULL) return NULL;
-  }
-  else plan->scratch = NULL;
-
-  // system specific pre-computation of 1d FFT coeffs
-  // and scaling normalization
-
-  cufftResult retvalc;
-  int nfft = (in_ihi-in_ilo+1) * (in_jhi-in_jlo+1) *
-    (in_khi-in_klo+1);
-  int nfft_brick = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
-    (out_khi-out_klo+1);
-
-  int nfft_both = MAX(nfft,nfft_brick);
-  nfft_both=nfast*nmid*nslow;
-
-  plan->cudatasize=nfft_both*sizeof(FFT_DATA);
-
-  //retvalc=cufftPlan1d(&(plan->plan_fast), nfast, CUFFT_PLAN,plan->total1/nfast);
-  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT1: %i\n",retvalc);
-  plan->nfast=nfast;
-
-  //retvalc=cufftPlan1d(&(plan->plan_mid), nmid, CUFFT_PLAN,plan->total2/nmid);
-  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT2: %i\n",retvalc);
-  plan->nmid=nmid;
-
-  //retvalc=cufftPlan1d(&(plan->plan_slow), nslow, CUFFT_PLAN,plan->total3/nslow);
-  //if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);
-  plan->nslow=nslow;
-
-  retvalc=cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN);
-  if(retvalc!=CUFFT_SUCCESS) printf("ErrorCUFFT3: %i\n",retvalc);
-
-  plan->nprocs=nprocs;
-  plan->me=me;
-  if (scaled == 0)
-    plan->scaled = 0;
-  else {
-    plan->scaled = 1;
-    plan->norm = 1.0/(nfast*nmid*nslow);
-    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
-      (out_khi-out_klo+1);
-  }
-
-  plan->coretime=0;
-  plan->iterate=0;
-  plan->ffttime=0;
-  return plan;
-  #endif
-}
-
-/* ----------------------------------------------------------------------
-   Destroy a 3d fft plan
-------------------------------------------------------------------------- */
-
-void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan)
-{
-#ifdef FFT_CUFFT
-  if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan);
-  if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan);
-  if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan);
-  if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan);
-
-  if (plan->copy) free(plan->copy);
-  if (plan->scratch) free(plan->scratch);
-
-
-  //cufftDestroy(plan->plan_fast);
-  //cufftDestroy(plan->plan_mid);
-  //cufftDestroy(plan->plan_slow);
-  cufftDestroy(plan->plan_3d);
-  free(plan);
-#endif
-}
-
-/* ----------------------------------------------------------------------
-   recursively divide n into small factors, return them in list
-------------------------------------------------------------------------- */
-
-void factor_cuda(int n, int *num, int *list)
-{
-  if (n == 1) {
-    return;
-  }
-  else if (n % 2 == 0) {
-    *list = 2;
-    (*num)++;
-    factor_cuda(n/2,num,list+1);
-  }
-  else if (n % 3 == 0) {
-    *list = 3;
-    (*num)++;
-    factor_cuda(n/3,num,list+1);
-  }
-  else if (n % 5 == 0) {
-    *list = 5;
-    (*num)++;
-    factor_cuda(n/5,num,list+1);
-  }
-  else if (n % 7 == 0) {
-    *list = 7;
-    (*num)++;
-    factor_cuda(n/7,num,list+1);
-  }
-  else if (n % 11 == 0) {
-    *list = 11;
-    (*num)++;
-    factor_cuda(n/11,num,list+1);
-  }
-  else if (n % 13 == 0) {
-    *list = 13;
-    (*num)++;
-    factor_cuda(n/13,num,list+1);
-  }
-  else {
-    *list = n;
-    (*num)++;
-    return;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   divide n into 2 factors of as equal size as possible
-------------------------------------------------------------------------- */
-
-void bifactor_cuda(int n, int *factor1, int *factor2)
-{
-  int n1,n2,facmax;
-
-  facmax = static_cast<int> (sqrt((double) n));
-
-  for (n1 = facmax; n1 > 0; n1--) {
-    n2 = n/n1;
-    if (n1*n2 == n) {
-      *factor1 = n1;
-      *factor2 = n2;
-      return;
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform just the 1d FFTs needed by a 3d FFT, no data movement
-   used for timing purposes
-
-   Arguments:
-   in           starting address of input data on this proc, all set to 0.0
-   nsize        size of in
-   flag         1 for forward FFT, -1 for inverse FFT
-   plan         plan returned by previous call to fft_3d_create_plan
-------------------------------------------------------------------------- */
-
-void fft_1d_only_cuda(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan)
-{
-#ifdef FFT_CUFFT
-  int i,total,length,offset,num;
-  double norm;
-
-  // system specific constants
-
-
-
-  // total = size of data needed in each dim
-  // length = length of 1d FFT in each dim
-  // total/length = # of 1d FFTs in each dim
-  // if total > nsize, limit # of 1d FFTs to available size of data
-
-  int total1 = plan->total1;
-  int length1 = plan->length1;
-  int total2 = plan->total2;
-  int length2 = plan->length2;
-  int total3 = plan->total3;
-  int length3 = plan->length3;
-
-  if (total1 > nsize) total1 = (nsize/length1) * length1;
-  if (total2 > nsize) total2 = (nsize/length2) * length2;
-  if (total3 > nsize) total3 = (nsize/length3) * length3;
-
-  // perform 1d FFTs in each of 3 dimensions
-  // data is just an array of 0.0
-
-
-  cudaMemcpy((void**) &(plan->cudata), (void*) data, plan->cudatasize,cudaMemcpyHostToDevice);
-  if (flag == -1) {
-    cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_FORWARD);
-    /*cufft(plan->plan_fast, plan->cudata, plan->cudata,CUFFT_FORWARD);
-    cufft(plan->plan_mid, plan->cudata, plan->cudata,CUFFT_FORWARD);
-    cufft(plan->plan_slow, plan->cudata, plan->cudata,CUFFT_FORWARD);*/
-  } else {
-    cufft(plan->plan_3d, plan->cudata, plan->cudata,CUFFT_FORWARD);
-    /*cufft(plan->plan_fast, plan->cudata, plan->cudata,CUFFT_INVERSE);
-    cufft(plan->plan_mid,plan->cudata, plan->cudata,CUFFT_INVERSE);
-    cufft(plan->plan_slow, plan->cudata, plan->cudata,CUFFT_INVERSE);*/
-  }
-  cudaMemcpy((void*) data, (void**) &(plan->cudata), plan->cudatasize,cudaMemcpyDeviceToHost);
-
-  // scaling if required
-  // limit num to size of data
-
-#endif
-}
diff --git a/src/USER-CUDA/fft3d_cuda.h b/src/USER-CUDA/fft3d_cuda.h
deleted file mode 100644
index 059ac977f7..0000000000
--- a/src/USER-CUDA/fft3d_cuda.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-// User-settable FFT precision
-
-// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
-// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
-#include "cuda_precision.h"
-//#define FFT_PRECISION 2
-
-// -------------------------------------------------------------------------
-
-// Data types for single-precision complex
-
-#if FFT_PRECISION_CU == 1
-
-#ifdef FFT_CUFFT
-#include "cuda_runtime.h"
-#include "cufft.h"
-typedef struct {
-  float re;
-  float im;
-} FFT_DATA;
-typedef cufftComplex cufftData;
-typedef cufftReal cufftDataInit;
-#define cufft cufftExecC2C
-#define cufftinit cufftExecR2C
-#define CUFFT_PLAN CUFFT_C2C
-#define CUFFT_PLAN_INIT CUFFT_R2C
-#else
-typedef struct {
-  float re;
-  float im;
-} FFT_DATA;
-#endif
-
-#endif
-
-// -------------------------------------------------------------------------
-
-// Data types for double-precision complex
-
-#if FFT_PRECISION_CU == 2
-
-
-#ifdef FFT_CUFFT
-#include "cuda_runtime.h"
-#include "cufft.h"
-typedef cufftDoubleComplex cufftData;
-typedef cufftDoubleReal cufftDataInit;
-typedef struct {
-  double re;
-  double im;
-} FFT_DATA;
-#define cufft cufftExecZ2Z
-#define cufftinit cufftExecD2Z
-#define CUFFT_PLAN CUFFT_Z2Z
-#define CUFFT_PLAN_INIT CUFFT_D2Z
-#endif
-
-#endif
-
-// -------------------------------------------------------------------------
-
-// details of how to do a 3d FFT
-
-struct fft_plan_3d {
-  struct remap_plan_3d *pre_plan;       // remap from input -> 1st FFTs
-  struct remap_plan_3d *mid1_plan;      // remap from 1st -> 2nd FFTs
-  struct remap_plan_3d *mid2_plan;      // remap from 2nd -> 3rd FFTs
-  struct remap_plan_3d *post_plan;      // remap from 3rd FFTs -> output
-  FFT_DATA *copy;                   // memory for remap results (if needed)
-  FFT_DATA *scratch;                // scratch space for remaps
-  int total1,total2,total3;         // # of 1st,2nd,3rd FFTs (times length)
-  int length1,length2,length3;      // length of 1st,2nd,3rd FFTs
-  int pre_target;                   // where to put remap results
-  int mid1_target,mid2_target;
-  int scaled;                       // whether to scale FFT results
-  int normnum;                      // # of values to rescale
-  double norm;                      // normalization factor for rescaling
-
-  double coretime;
-  double ffttime;
-  int iterate;
-                                    // system specific 1d FFT info
-
-#ifdef FFT_CUFFT
-  //CUdeviceptr cudata;
-  cufftData* cudata;
-  cufftData* cudata2;
-  unsigned int cudatasize;
-  cufftHandle plan_fast;
-  cufftHandle plan_mid;
-  cufftHandle plan_slow;
-  cufftHandle plan_3d;
-  int nfast;
-  int nmid;
-  int nslow;
-  int ihi_out,ilo_out,jhi_out,jlo_out,khi_out,klo_out;
-  int me,nprocs;
-#endif
-  int init;
-};
-
-// function prototypes
-
-void fft_3d_destroy_plan_cuda(struct fft_plan_3d *);
-void factor_cuda(int, int *, int *);
-void bifactor_cuda(int, int *, int *);
-void fft_1d_only_cuda(FFT_DATA *, int, int, struct fft_plan_3d *);
-void fft_3d_cudaA(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);
-void fft_3d_cuda(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);
-struct fft_plan_3d *fft_3d_create_plan_cuda(MPI_Comm, int, int, int,
-  int, int, int, int, int, int, int, int, int, int, int, int,
-  int, int, int *,bool init);
diff --git a/src/USER-CUDA/fft3d_wrap_cuda.cpp b/src/USER-CUDA/fft3d_wrap_cuda.cpp
deleted file mode 100644
index f02c38d831..0000000000
--- a/src/USER-CUDA/fft3d_wrap_cuda.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include "fft3d_wrap_cuda.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-FFT3dCuda::FFT3dCuda(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
-             int in_ilo, int in_ihi, int in_jlo, int in_jhi,
-             int in_klo, int in_khi,
-             int out_ilo, int out_ihi, int out_jlo, int out_jhi,
-             int out_klo, int out_khi,
-             int scaled, int permute, int *nbuf,bool init) : Pointers(lmp)
-{
-#ifdef FFT_CUFFT
-  plan = fft_3d_create_plan_cuda(comm,nfast,nmid,nslow,
-                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
-                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,init);
-#endif
-#ifndef FFT_CUFFT
-  plan = fft_3d_create_plan(comm,nfast,nmid,nslow,
-                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
-                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
-                            scaled,permute,nbuf,0);
-#endif
-  if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
-}
-
-/* ---------------------------------------------------------------------- */
-
-FFT3dCuda::~FFT3dCuda()
-{
-#ifdef FFT_CUFFT
-  fft_3d_destroy_plan_cuda(plan);
-#endif
-#ifndef FFT_CUFFT
-   fft_3d_destroy_plan(plan);
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FFT3dCuda::compute(double *in, double *out, int flag)
-{
-#ifdef FFT_CUFFT
-  fft_3d_cuda((FFT_DATA *) in,(FFT_DATA *) out,flag,plan);
-#endif
-#ifndef FFT_CUFFT
-  fft_3d((FFT_DATA *) in,(FFT_DATA *) out,flag,plan);
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FFT3dCuda::timing1d(double *in, int nsize, int flag)
-{
-#ifdef FFT_CUFFT
-  fft_1d_only_cuda((FFT_DATA *) in,nsize,flag,plan);
-#endif
-#ifndef FFT_CUFFT
-  fft_1d_only((FFT_DATA *) in,nsize,flag,plan);
-#endif
-}
-
-#ifdef FFT_CUFFT
-void FFT3dCuda::set_cudata(void* cudata,void* cudata2)
-{
-
-  plan->cudata=(cufftData*) cudata;
-  plan->cudata2=(cufftData*) cudata2;
-
-}
-#endif
diff --git a/src/USER-CUDA/fft3d_wrap_cuda.h b/src/USER-CUDA/fft3d_wrap_cuda.h
deleted file mode 100644
index cc6baa9ebe..0000000000
--- a/src/USER-CUDA/fft3d_wrap_cuda.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef FFT3D_WRAP_CUDA_H_
-#define FFT3D_WRAP_CUDA_H_
-
-#include "pointers.h"
-
-#ifdef FFT_CUFFT
-  #include "fft3d_cuda.h"
-#endif
-#ifndef FFT_CUFFT
-  #include "fft3d.h"
-#endif
-
-namespace LAMMPS_NS {
-
-class FFT3dCuda : protected Pointers {
- public:
-  FFT3dCuda(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,int,int,int,
-        int,int,int,int,int,int,int,int,int *,bool);
-  ~FFT3dCuda();
-  void compute(double *, double *, int);
-  void timing1d(double *, int, int);
-
-#ifdef FFT_CUFFT
-  void set_cudata(void* cudata,void* cudata2);
-#endif
- private:
-  struct fft_plan_3d *plan;
-};
-
-}
-
-#endif /*FFT3D_WRAP_CUDA_H_*/
diff --git a/src/USER-CUDA/fix_addforce_cuda.cpp b/src/USER-CUDA/fix_addforce_cuda.cpp
deleted file mode 100644
index 5462668f4f..0000000000
--- a/src/USER-CUDA/fix_addforce_cuda.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-
-#include <cstring>
-#include <cstdlib>
-#include "fix_addforce_cuda.h"
-#include "fix_addforce_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "error.h"
-#include "force.h"
-#include "domain.h"
-#include "user_cuda.h"
-#include "memory.h"
-#include "cuda_modify_flags.h"
-
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixAddForceCuda::FixAddForceCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg < 6) error->all(FLERR,"Illegal fix addforce/cuda command");
-
-  scalar_flag = 1;
-  vector_flag = 1;
-  size_vector = 3;
-  global_freq = 1;
-  extscalar = 1;
-  extvector = 1;
-
-  xvalue = force->numeric(FLERR,arg[3]);
-  yvalue = force->numeric(FLERR,arg[4]);
-  zvalue = force->numeric(FLERR,arg[5]);
-
-  // optional args
-
-  iregion = -1;
-
-  int iarg = 6;
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"region") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix addforce/cuda command");
-      iregion = domain->find_region(arg[iarg+1]);
-      if (iregion == -1) error->all(FLERR,"Fix addforce/cuda region ID does not exist");
-      iarg += 2;
-    } else error->all(FLERR,"Illegal fix addforce/cuda command");
-  }
-
-  if(iregion!=-1) error->all(FLERR,"Error: fix addforce/cuda does not currently support 'region' option");
-
-  force_flag = 0;
-  foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0;
-  cu_foriginal = NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixAddForceCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  mask |= POST_FORCE_RESPA;
-  mask |= MIN_POST_FORCE_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::init()
-{
-  if(not cu_foriginal)
-  cu_foriginal = new cCudaData<double, F_CFLOAT, x> (foriginal,4);
-  if (strstr(update->integrate_style,"respa"))
-    nlevels_respa = ((Respa *) update->integrate)->nlevels;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::setup(int vflag)
-{
-  MYDBG( printf("# CUDA: FixAddForceCuda::setup\n"); )
-
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixAddForceCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-
-  }
-  else {
-    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
-    cuda->cu_f->download();
-    post_force_respa(vflag,nlevels_respa-1,0);
-    cuda->cu_f->upload();
-    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
-  }
-  MYDBG( printf("# CUDA: FixAddForceCuda::setup done\n"); )
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::min_setup(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::post_force(int vflag)
-{
-  MYDBG( printf("# CUDA: FixAddForceCuda::postforce start\n"); )
-  force_flag = 0;
-  cu_foriginal->memset_device(0);
-  Cuda_FixAddForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue,zvalue,(F_CFLOAT*) cu_foriginal->dev_data());
-  cu_foriginal->download();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
-{
-  if (ilevel == nlevels_respa-1) post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAddForceCuda::min_post_force(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ----------------------------------------------------------------------
-   potential energy of added force
-------------------------------------------------------------------------- */
-
-double FixAddForceCuda::compute_scalar()
-{
-  // only sum across procs one time
-
-  if (force_flag == 0) {
-    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
-    force_flag = 1;
-  }
-  return foriginal_all[0];
-}
-
-/* ----------------------------------------------------------------------
-   return components of total force on fix group before force was changed
-------------------------------------------------------------------------- */
-
-double FixAddForceCuda::compute_vector(int n)
-{
-  // only sum across procs one time
-
-  if (force_flag == 0) {
-    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
-    force_flag = 1;
-  }
-  return foriginal_all[n+1];
-}
diff --git a/src/USER-CUDA/fix_addforce_cuda.h b/src/USER-CUDA/fix_addforce_cuda.h
deleted file mode 100644
index 043cae6d21..0000000000
--- a/src/USER-CUDA/fix_addforce_cuda.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(addforce/cuda,FixAddForceCuda)
-
-#else
-
-#ifndef LMP_FIX_ADD_FORCE_CUDA_H
-#define LMP_FIX_ADD_FORCE_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixAddForceCuda : public Fix {
- public:
-  FixAddForceCuda(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
-  void min_setup(int);
-  void post_force(int);
-  void post_force_respa(int, int, int);
-  void min_post_force(int);
-  double compute_scalar();
-  double compute_vector(int);
-
- private:
-  class Cuda *cuda;
-  int iregion;
-  double xvalue,yvalue,zvalue;
-  double foriginal[4],foriginal_all[4];
-  cCudaData<double     , F_CFLOAT                   , x>* cu_foriginal;
-  int force_flag;
-  int nlevels_respa;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_aveforce_cuda.cpp b/src/USER-CUDA/fix_aveforce_cuda.cpp
deleted file mode 100644
index 9b4ceaa67c..0000000000
--- a/src/USER-CUDA/fix_aveforce_cuda.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-
-#include <mpi.h>
-#include <cstring>
-#include <cstdlib>
-#include "fix_aveforce_cuda.h"
-#include "fix_aveforce_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "domain.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-#include "variable.h"
-#include "input.h"
-#include "modify.h"
-#include "atom_masks.h"
-#include "error.h"
-#include "force.h"
-
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-enum{NONE,CONSTANT,EQUAL};
-
-/* ---------------------------------------------------------------------- */
-
-FixAveForceCuda::FixAveForceCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 6) error->all(FLERR,"Illegal fix aveforce command");
-
-  vector_flag = 1;
-  size_vector = 3;
-  global_freq = 1;
-  extvector = 1;
-
-  xstr = ystr = zstr = NULL;
-  xvalue = yvalue = zvalue = 0;
-
-  if (strstr(arg[3],"v_") == arg[3]) {
-    int n = strlen(&arg[3][2]) + 1;
-    xstr = new char[n];
-    strcpy(xstr,&arg[3][2]);
-  } else if (strcmp(arg[3],"NULL") == 0) {
-    xstyle = NONE;
-  } else {
-    xvalue = force->numeric(FLERR,arg[3]);
-    xstyle = CONSTANT;
-  }
-  if (strstr(arg[4],"v_") == arg[4]) {
-    int n = strlen(&arg[4][2]) + 1;
-    ystr = new char[n];
-    strcpy(ystr,&arg[4][2]);
-  } else if (strcmp(arg[4],"NULL") == 0) {
-    ystyle = NONE;
-  } else {
-    yvalue = force->numeric(FLERR,arg[4]);
-    ystyle = CONSTANT;
-  }
-  if (strstr(arg[5],"v_") == arg[5]) {
-    int n = strlen(&arg[5][2]) + 1;
-    zstr = new char[n];
-    strcpy(zstr,&arg[5][2]);
-  } else if (strcmp(arg[5],"NULL") == 0) {
-    zstyle = NONE;
-  } else {
-    zvalue = force->numeric(FLERR,arg[5]);
-    zstyle = CONSTANT;
-  }
-
-  // optional args
-
-  iregion = -1;
-
-  int iarg = 6;
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"region") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix aveforce command");
-      iregion = domain->find_region(arg[iarg+1]);
-      if (iregion == -1) error->all(FLERR,"Fix aveforce region ID does not exist");
-      iarg += 2;
-    } else error->all(FLERR,"Illegal fix aveforce command");
-
-  }
-
-  if(iregion!=-1) error->all(FLERR,"Error: fix aveforce/cuda does not currently support 'region' option");
-
-  foriginal_all[0] = foriginal_all[1] = foriginal_all[2] = foriginal_all[3] = 0.0;
-  foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0;
-  cu_foriginal = NULL;
-
-}
-
-FixAveForceCuda::~FixAveForceCuda()
-{
-  delete [] xstr;
-  delete [] ystr;
-  delete [] zstr;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixAveForceCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  mask |= POST_FORCE_RESPA;
-  mask |= MIN_POST_FORCE_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::init()
-{
-  if(not cu_foriginal)
-  cu_foriginal = new cCudaData<double, F_CFLOAT, x> (foriginal,4);
-
-  if (xstr) {
-    xvar = input->variable->find(xstr);
-    if (xvar < 0)
-      error->all(FLERR,"Variable name for fix aveforce does not exist");
-    if (input->variable->equalstyle(xvar)) xstyle = EQUAL;
-    else error->all(FLERR,"Variable for fix aveforce is invalid style");
-  }
-  if (ystr) {
-    yvar = input->variable->find(ystr);
-    if (yvar < 0)
-      error->all(FLERR,"Variable name for fix aveforce does not exist");
-    if (input->variable->equalstyle(yvar)) ystyle = EQUAL;
-    else error->all(FLERR,"Variable for fix aveforce is invalid style");
-  }
-  if (zstr) {
-    zvar = input->variable->find(zstr);
-    if (zvar < 0)
-      error->all(FLERR,"Variable name for fix aveforce does not exist");
-    if (input->variable->equalstyle(zvar)) zstyle = EQUAL;
-    else error->all(FLERR,"Variable for fix aveforce is invalid style");
-  }
-
-  if (xstyle == EQUAL || ystyle == EQUAL || zstyle == EQUAL) varflag = EQUAL;
-  else varflag = CONSTANT;
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::setup(int vflag)
-{
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixAveForceCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-
-  }
-  else
-  {
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::min_setup(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::post_force(int vflag)
-{
-  // sum forces on participating atoms
-
-  cu_foriginal->memset_device(0);
-  Cuda_FixAveForceCuda_PostForce_FOrg(&cuda->shared_data, groupbit,(F_CFLOAT*) cu_foriginal->dev_data());
-  cu_foriginal->download();
-
-  // average the force on participating atoms
-  // add in requested amount
-
-  MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
-  int ncount = static_cast<int> (foriginal_all[3]);
-  if (ncount == 0) return;
-
-  if (varflag == EQUAL) {
-    unsigned int datamask = EMPTY_MASK;
-    if (xstyle == EQUAL) datamask &= input->variable->data_mask(xstr);
-    if (ystyle == EQUAL) datamask &= input->variable->data_mask(ystr);
-    if (zstyle == EQUAL) datamask &= input->variable->data_mask(zstr);
-
-    cuda->download(datamask);
-    modify->clearstep_compute();
-    if (xstyle == EQUAL) xvalue = input->variable->compute_equal(xvar);
-    if (ystyle == EQUAL) yvalue = input->variable->compute_equal(yvar);
-    if (zstyle == EQUAL) zvalue = input->variable->compute_equal(zvar);
-    modify->addstep_compute(update->ntimestep + 1);
-  }
-
-  double fave[3];
-  fave[0] = foriginal_all[0]/ncount + xvalue;
-  fave[1] = foriginal_all[1]/ncount + yvalue;
-  fave[2] = foriginal_all[2]/ncount + zvalue;
-
-  // set force of all participating atoms to same value
-  // only for active dimensions
-
-  Cuda_FixAveForceCuda_PostForce_Set(&cuda->shared_data, groupbit,!(xstyle==NONE),!(ystyle==NONE),!(zstyle==NONE),fave[0],fave[1],fave[2]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
-{
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixAveForceCuda::min_post_force(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ----------------------------------------------------------------------
-   return components of total force on fix group before force was changed
-------------------------------------------------------------------------- */
-
-double FixAveForceCuda::compute_vector(int n)
-{
-  return foriginal_all[n];
-}
diff --git a/src/USER-CUDA/fix_aveforce_cuda.h b/src/USER-CUDA/fix_aveforce_cuda.h
deleted file mode 100644
index c22e702ee2..0000000000
--- a/src/USER-CUDA/fix_aveforce_cuda.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(aveforce/cuda,FixAveForceCuda)
-
-#else
-
-
-#ifndef LMP_FIX_AVE_FORCE_CUDA_H
-#define LMP_FIX_AVE_FORCE_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixAveForceCuda : public Fix {
- public:
-  FixAveForceCuda(class LAMMPS *, int, char **);
-  ~FixAveForceCuda();
-  int setmask();
-  void init();
-  void setup(int);
-  void min_setup(int);
-  void post_force(int);
-  void post_force_respa(int, int, int);
-  void min_post_force(int);
-  double compute_vector(int);
-
- private:
-  class Cuda *cuda;
-  char *xstr,*ystr,*zstr;
-  int iregion;
-  double xvalue,yvalue,zvalue;
-  double foriginal_all[4];
-  double foriginal[4];
-  cCudaData<double     , F_CFLOAT                   , x>* cu_foriginal;
-  int varflag;
-  int xvar,yvar,zvar,xstyle,ystyle,zstyle;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.cpp b/src/USER-CUDA/fix_enforce2d_cuda.cpp
deleted file mode 100644
index d10edf1cce..0000000000
--- a/src/USER-CUDA/fix_enforce2d_cuda.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include "fix_enforce2d_cuda.h"
-#include "fix_enforce2d_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "domain.h"
-#include "respa.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixEnforce2DCuda::FixEnforce2DCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 3) error->all(FLERR,"Illegal fix enforce2d command");
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixEnforce2DCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  mask |= POST_FORCE_RESPA;
-  mask |= MIN_POST_FORCE_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::init()
-{
-  if (domain->dimension == 3)
-    error->all(FLERR,"Cannot use fix enforce2d/cuda with 3d simulation");
-  if (atom->omega_flag)
-    error->warning(FLERR,"Enforce2d/cuda does not support omega_flag on gpu yet. Will be handled on cpu.");
-
-  if (atom->angmom_flag)
-    error->warning(FLERR,"Enforce2d/cuda does not support angmom_flag (angular momentum) on gpu yet. Will be handled on cpu.");
-
-  if (atom->torque_flag)
-    error->warning(FLERR,"Enforce2d/cuda does not support torque_flag on gpu yet. Will be handled on cpu.");
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::setup(int vflag)
-{
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixEnforce2dCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    cuda->cu_v->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-    cuda->cu_v->download();
-  }
-  else {
-    int nlevels_respa = ((Respa *) update->integrate)->nlevels;
-    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
-      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
-      post_force_respa(vflag,ilevel,0);
-      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::min_setup(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::post_force(int vflag)
-{
-  Cuda_FixEnforce2dCuda_PostForce(&cuda->shared_data, groupbit);
-
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  if (atom->omega_flag) {
-    double **omega = atom->omega;
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        omega[i][0] = 0.0;
-        omega[i][1] = 0.0;
-      }
-  }
-
-  if (atom->angmom_flag) {
-    double **angmom = atom->angmom;
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        angmom[i][0] = 0.0;
-        angmom[i][1] = 0.0;
-      }
-  }
-
-  if (atom->torque_flag) {
-    double **torque = atom->torque;
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-      }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::post_force_respa(int vflag, int ilevel, int iloop)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixEnforce2DCuda::min_post_force(int vflag)
-{
-  post_force(vflag);
-}
diff --git a/src/USER-CUDA/fix_enforce2d_cuda.h b/src/USER-CUDA/fix_enforce2d_cuda.h
deleted file mode 100644
index 63bf289d9e..0000000000
--- a/src/USER-CUDA/fix_enforce2d_cuda.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(enforce2d/cuda,FixEnforce2DCuda)
-
-#else
-
-#ifndef LMP_FIX_ENFORCE2D_CUDA_H
-#define LMP_FIX_ENFORCE2D_CUDA_H
-
-#include "fix.h"
-
-namespace LAMMPS_NS {
-
-class FixEnforce2DCuda : public Fix {
- public:
-  FixEnforce2DCuda(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
-  void min_setup(int);
-  void post_force(int);
-  void post_force_respa(int, int, int);
-  void min_post_force(int);
-
-  private:
-  class Cuda *cuda;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_freeze_cuda.cpp b/src/USER-CUDA/fix_freeze_cuda.cpp
deleted file mode 100644
index c4a04af564..0000000000
--- a/src/USER-CUDA/fix_freeze_cuda.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-#include <cstring>
-#include <cstdlib>
-#include "fix_freeze_cuda.h"
-#include "fix_freeze_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "memory.h"
-#include "modify.h"
-#include "cuda_modify_flags.h"
-
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixFreezeCuda::FixFreezeCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-  if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 3) error->all(FLERR,"Illegal fix freeze command");
-
-  if (!atom->torque_flag)
-    error->all(FLERR,"Fix freeze requires atom attribute torque");
-
-  vector_flag = 1;
-  size_vector = 3;
-  global_freq = 1;
-  extvector = 1;
-
-
-
-  force_flag = 0;
-  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
-  cu_foriginal=NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixFreezeCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixFreezeCuda::init()
-{
-  if(not cu_foriginal)
-  cu_foriginal = new cCudaData<double, F_CFLOAT, x> (foriginal,3);
-  int count = 0;
-  for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"freeze") == 0) count++;
-  if (count > 1) error->all(FLERR,"More than one fix freeze");
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixFreezeCuda::setup(int vflag)
-{
-  MYDBG( printf("# CUDA: FixFreezeCuda::setup\n"); )
-
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixFreezeCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-
-  }
-
-  MYDBG( printf("# CUDA: FixFreezeCuda::setup done\n"); )
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* ---------------------------------------------------------------------- */
-
-void FixFreezeCuda::post_force(int vflag)
-{
-  MYDBG( printf("# CUDA: FixFreezeCuda::postforce start\n"); )
-  force_flag = 0;
-  cu_foriginal->memset_device(0);
-  Cuda_FixFreezeCuda_PostForce(&cuda->shared_data, groupbit, (F_CFLOAT*) cu_foriginal->dev_data());
-  cu_foriginal->download();
-}
-
-/* ---------------------------------------------------------------------- */
-
-
-
-/* ----------------------------------------------------------------------
-   return components of total force on fix group before force was changed
-------------------------------------------------------------------------- */
-
-double FixFreezeCuda::compute_vector(int n)
-{
-  // only sum across procs one time
-
-  if (force_flag == 0) {
-    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
-    force_flag = 1;
-  }
-  return foriginal_all[n+1];
-}
diff --git a/src/USER-CUDA/fix_freeze_cuda.h b/src/USER-CUDA/fix_freeze_cuda.h
deleted file mode 100644
index 9f6a1a99f0..0000000000
--- a/src/USER-CUDA/fix_freeze_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(freeze/cuda,FixFreezeCuda)
-
-#else
-
-#ifndef LMP_FIX_FREEZE_CUDA_H
-#define LMP_FIX_FREEZE_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixFreezeCuda : public Fix {
- public:
-  FixFreezeCuda(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
-  void post_force(int);
-  double compute_vector(int);
-
- private:
-  class Cuda *cuda;
-  double foriginal[3],foriginal_all[3];
-  cCudaData<double     , F_CFLOAT                   , x>* cu_foriginal;
-  int force_flag;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_gravity_cuda.cpp b/src/USER-CUDA/fix_gravity_cuda.cpp
deleted file mode 100644
index 34107ed593..0000000000
--- a/src/USER-CUDA/fix_gravity_cuda.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <cstdlib>
-#include "fix_gravity_cuda.h"
-#include "fix_gravity_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "domain.h"
-#include "respa.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-#include "math_const.h"
-#include "error.h"
-#include "force.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-using namespace MathConst;
-
-enum{CHUTE,SPHERICAL,GRADIENT,VECTOR};
-
-/* ---------------------------------------------------------------------- */
-
-FixGravityCuda::FixGravityCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg < 5) error->all(FLERR,"Illegal fix gravity command");
-
-  magnitude = force->numeric(FLERR,arg[3]);
-
-  if (strcmp(arg[4],"chute") == 0) {
-    if (narg != 6) error->all(FLERR,"Illegal fix gravity command");
-    style = CHUTE;
-    phi = 0.0;
-    theta = 180.0 - force->numeric(FLERR,arg[5]);
-  } else if (strcmp(arg[4],"spherical") == 0) {
-    if (narg != 7) error->all(FLERR,"Illegal fix gravity command");
-    style = SPHERICAL;
-    phi = force->numeric(FLERR,arg[5]);
-    theta = force->numeric(FLERR,arg[6]);
-  } else if (strcmp(arg[4],"gradient") == 0) {
-    if (narg != 9) error->all(FLERR,"Illegal fix gravity command");
-    style = GRADIENT;
-    phi = force->numeric(FLERR,arg[5]);
-    theta = force->numeric(FLERR,arg[6]);
-    phigrad = force->numeric(FLERR,arg[7]);
-    thetagrad = force->numeric(FLERR,arg[8]);
-  } else if (strcmp(arg[4],"vector") == 0) {
-    if (narg != 8) error->all(FLERR,"Illegal fix gravity command");
-    style = VECTOR;
-    xdir = force->numeric(FLERR,arg[5]);
-    ydir = force->numeric(FLERR,arg[6]);
-    zdir = force->numeric(FLERR,arg[7]);
-  } else error->all(FLERR,"Illegal fix gravity command");
-
-  degree2rad = MY_PI/180.0;
-
-  if (style == CHUTE || style == SPHERICAL || style == GRADIENT) {
-    if (domain->dimension == 3) {
-      xgrav = sin(degree2rad * theta) * cos(degree2rad * phi);
-      ygrav = sin(degree2rad * theta) * sin(degree2rad * phi);
-      zgrav = cos(degree2rad * theta);
-    } else {
-      xgrav = sin(degree2rad * theta);
-      ygrav = cos(degree2rad * theta);
-      zgrav = 0.0;
-    }
-  } else if (style == VECTOR) {
-    if (domain->dimension == 3) {
-      double length = sqrt(xdir*xdir + ydir*ydir + zdir*zdir);
-      xgrav = xdir/length;
-      ygrav = ydir/length;
-      zgrav = zdir/length;
-    } else {
-      double length = sqrt(xdir*xdir + ydir*ydir);
-      xgrav = xdir/length;
-      ygrav = ydir/length;
-      zgrav = 0.0;
-    }
-  }
-
-  time_origin = update->ntimestep;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixGravityCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixGravityCuda::init()
-{
-  dt = update->dt;
-
-  xacc = magnitude*xgrav;
-  yacc = magnitude*ygrav;
-  zacc = magnitude*zgrav;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixGravityCuda::setup(int vflag)
-{
-  MYDBG( printf("# CUDA: FixGravityCuda::setup\n"); )
-
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixGravityCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-
-  }
-  else {
-  }
-  MYDBG( printf("# CUDA: FixGravityCuda::setup done\n"); )
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixGravityCuda::post_force(int vflag)
-{
-  // update direction of gravity vector if gradient style
-
-  if (style == GRADIENT) {
-    if (domain->dimension == 3) {
-      double phi_current = degree2rad *
-        (phi + (update->ntimestep - time_origin)*dt*phigrad*360.0);
-      double theta_current = degree2rad *
-        (theta + (update->ntimestep - time_origin)*dt*thetagrad*360.0);
-      xgrav = sin(theta_current) * cos(phi_current);
-      ygrav = sin(theta_current) * sin(phi_current);
-      zgrav = cos(theta_current);
-    } else {
-      double theta_current = degree2rad *
-        (theta + (update->ntimestep - time_origin)*dt*thetagrad*360.0);
-      xgrav = sin(theta_current);
-      ygrav = cos(theta_current);
-    }
-    xacc = magnitude*xgrav;
-    yacc = magnitude*ygrav;
-    zacc = magnitude*zgrav;
-  }
-
-  MYDBG( printf("# CUDA: FixGravityCuda::postforce start\n"); )
-  Cuda_FixGravityCuda_PostForce(&cuda->shared_data, groupbit, xacc,yacc,zacc);
-}
diff --git a/src/USER-CUDA/fix_gravity_cuda.h b/src/USER-CUDA/fix_gravity_cuda.h
deleted file mode 100644
index 98d2586660..0000000000
--- a/src/USER-CUDA/fix_gravity_cuda.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(gravity/cuda,FixGravityCuda)
-
-#else
-
-#ifndef LMP_FIX_GRAVITY_CUDA_H
-#define LMP_FIX_GRAVITY_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixGravityCuda : public Fix {
- public:
-  FixGravityCuda(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
-  void post_force(int);
-
- private:
-  class Cuda *cuda;
-  int style;
-  double magnitude,dt;
-  double phi,theta,phigrad,thetagrad;
-  double xdir,ydir,zdir;
-  double xgrav,ygrav,zgrav,xacc,yacc,zacc;
-  double degree2rad;
-  int time_origin;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_nh_cuda.cpp b/src/USER-CUDA/fix_nh_cuda.cpp
deleted file mode 100644
index 1a5092a68f..0000000000
--- a/src/USER-CUDA/fix_nh_cuda.cpp
+++ /dev/null
@@ -1,2072 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Mark Stevens (SNL), Aidan Thompson (SNL)
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include "fix_nh_cuda.h"
-#include "atom.h"
-#include "force.h"
-#include "comm.h"
-#include "modify.h"
-#include "fix_deform.h"
-#include "compute.h"
-#include "kspace.h"
-#include "update.h"
-#include "respa.h"
-#include "domain.h"
-#include "memory.h"
-#include "error.h"
-#include "math_extra.h"
-#include "user_cuda.h"
-#include "fix_nh_cuda_cu.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-enum{NOBIAS,BIAS};
-enum{NONE,XYZ,XY,YZ,XZ};
-enum{ISO,ANISO,TRICLINIC};
-
-/* ----------------------------------------------------------------------
-   NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion
- ---------------------------------------------------------------------- */
-
-FixNHCuda::FixNHCuda(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg < 4) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-
-  restart_global = 1;
-  time_integrate = 1;
-  scalar_flag = 1;
-  vector_flag = 1;
-  global_freq = 1;
-  extscalar = 1;
-  extvector = 0;
-
-  triggerneighsq = -1;
-  // default values
-
-  pcouple = NONE;
-  drag = 0.0;
-  allremap = 1;
-  mtchain = mpchain = 3;
-  nc_tchain = nc_pchain = 1;
-  mtk_flag = 1;
-  deviatoric_flag = 0;
-  nreset_h0 = 0;
-
-  // Used by FixNVTSllod to preserve non-default value
-
-  mtchain_default_flag = 1;
-
-  tstat_flag = 0;
-  double t_period = 0.0;
-
-  double p_period[6];
-  for (int i = 0; i < 6; i++) {
-    p_start[i] = p_stop[i] = p_period[i] = 0.0;
-    p_flag[i] = 0;
-  }
-
-  // process keywords
-
-  dimension = domain->dimension;
-
-  int iarg = 3;
-
-  while (iarg < narg) {
-    if (strcmp(arg[iarg],"temp") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      tstat_flag = 1;
-      t_start = force->numeric(FLERR,arg[iarg+1]);
-      t_stop = force->numeric(FLERR,arg[iarg+2]);
-      t_period = force->numeric(FLERR,arg[iarg+3]);
-      if (t_start < 0.0 || t_stop <= 0.0)
-        error->all(FLERR,"Target T for fix nvt/npt/nph cannot be 0.0");
-      iarg += 4;
-
-    } else if (strcmp(arg[iarg],"iso") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      pcouple = XYZ;
-      p_start[0] = p_start[1] = p_start[2] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[0] = p_stop[1] = p_stop[2] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[0] = p_period[1] = p_period[2] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[0] = p_flag[1] = p_flag[2] = 1;
-      if (dimension == 2) {
-        p_start[2] = p_stop[2] = p_period[2] = 0.0;
-        p_flag[2] = 0;
-      }
-      iarg += 4;
-    } else if (strcmp(arg[iarg],"aniso") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      pcouple = NONE;
-      p_start[0] = p_start[1] = p_start[2] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[0] = p_stop[1] = p_stop[2] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[0] = p_period[1] = p_period[2] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[0] = p_flag[1] = p_flag[2] = 1;
-      if (dimension == 2) {
-        p_start[2] = p_stop[2] = p_period[2] = 0.0;
-        p_flag[2] = 0;
-      }
-      iarg += 4;
-    } else if (strcmp(arg[iarg],"tri") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      pcouple = NONE;
-      p_start[0] = p_start[1] = p_start[2] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[0] = p_stop[1] = p_stop[2] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[0] = p_period[1] = p_period[2] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[0] = p_flag[1] = p_flag[2] = 1;
-      p_start[3] = p_start[4] = p_start[5] = 0.0;
-      p_stop[3] = p_stop[4] = p_stop[5] = 0.0;
-      p_period[3] = p_period[4] = p_period[5] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[3] = p_flag[4] = p_flag[5] = 1;
-      if (dimension == 2) {
-        p_start[2] = p_stop[2] = p_period[2] = 0.0;
-        p_flag[2] = 0;
-        p_start[3] = p_stop[3] = p_period[3] = 0.0;
-        p_flag[3] = 0;
-        p_start[4] = p_stop[4] = p_period[4] = 0.0;
-        p_flag[4] = 0;
-      }
-      iarg += 4;
-
-    } else if (strcmp(arg[iarg],"x") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[0] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[0] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[0] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[0] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-    } else if (strcmp(arg[iarg],"y") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[1] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[1] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[1] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[1] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-    } else if (strcmp(arg[iarg],"z") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[2] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[2] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[2] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[2] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-      if (dimension == 2)
-        error->all(FLERR,"Invalid fix nvt/npt/nph command for a 2d simulation");
-
-    } else if (strcmp(arg[iarg],"yz") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[3] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[3] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[3] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[3] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-      if (dimension == 2)
-        error->all(FLERR,"Invalid fix nvt/npt/nph command for a 2d simulation");
-    } else if (strcmp(arg[iarg],"xz") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[4] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[4] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[4] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[4] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-      if (dimension == 2)
-        error->all(FLERR,"Invalid fix nvt/npt/nph command for a 2d simulation");
-    } else if (strcmp(arg[iarg],"xy") == 0) {
-      if (iarg+4 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      p_start[5] = force->numeric(FLERR,arg[iarg+1]);
-      p_stop[5] = force->numeric(FLERR,arg[iarg+2]);
-      p_period[5] = force->numeric(FLERR,arg[iarg+3]);
-      p_flag[5] = 1;
-      deviatoric_flag = 1;
-      iarg += 4;
-
-    } else if (strcmp(arg[iarg],"couple") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      if (strcmp(arg[iarg+1],"xyz") == 0) pcouple = XYZ;
-      else if (strcmp(arg[iarg+1],"xy") == 0) pcouple = XY;
-      else if (strcmp(arg[iarg+1],"yz") == 0) pcouple = YZ;
-      else if (strcmp(arg[iarg+1],"xz") == 0) pcouple = XZ;
-      else if (strcmp(arg[iarg+1],"none") == 0) pcouple = NONE;
-      else error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-
-    } else if (strcmp(arg[iarg],"drag") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      drag = force->numeric(FLERR,arg[iarg+1]);
-      if (drag < 0.0) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"dilate") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      if (strcmp(arg[iarg+1],"all") == 0) allremap = 1;
-      else if (strcmp(arg[iarg+1],"partial") == 0) allremap = 0;
-      else error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"tchain") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      mtchain = force->inumeric(FLERR,arg[iarg+1]);
-      if (mtchain < 1) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"pchain") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      mpchain = force->inumeric(FLERR,arg[iarg+1]);
-      if (mpchain < 0) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"mtk") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      if (strcmp(arg[iarg+1],"yes") == 0) mtk_flag = 1;
-      else if (strcmp(arg[iarg+1],"no") == 0) mtk_flag = 0;
-      else error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"tloop") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      nc_tchain = force->inumeric(FLERR,arg[iarg+1]);
-      if (nc_tchain < 0) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"ploop") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      nc_pchain = force->inumeric(FLERR,arg[iarg+1]);
-      if (nc_pchain < 0) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"nreset") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      nreset_h0 = force->inumeric(FLERR,arg[iarg+1]);
-      if (nreset_h0 < 0) error->all(FLERR,"Illegal fix nvt/npt/nph command");
-      iarg += 2;
-    } else error->all(FLERR,"Illegal fix nvt/npt/nph command");
-  }
-
-  // error checks
-
-  if (dimension == 2 && (p_flag[2] || p_flag[3] || p_flag[4]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command for a 2d simulation");
-  if (dimension == 2 && (pcouple == YZ || pcouple == XZ))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command for a 2d simulation");
-
-  if (pcouple == XYZ && (p_flag[0] == 0 || p_flag[1] == 0))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command pressure settings");
-  if (pcouple == XYZ && dimension == 3 && p_flag[2] == 0)
-    error->all(FLERR,"Invalid fix nvt/npt/nph command pressure settings");
-  if (pcouple == XY && (p_flag[0] == 0 || p_flag[1] == 0))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command pressure settings");
-  if (pcouple == YZ && (p_flag[1] == 0 || p_flag[2] == 0))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command pressure settings");
-  if (pcouple == XZ && (p_flag[0] == 0 || p_flag[2] == 0))
-    error->all(FLERR,"Invalid fix nvt/npt/nph command pressure settings");
-
-  if (p_flag[0] && domain->xperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a non-periodic dimension");
-  if (p_flag[1] && domain->yperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a non-periodic dimension");
-  if (p_flag[2] && domain->zperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a non-periodic dimension");
-  if (p_flag[3] && domain->zperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
-  if (p_flag[4] && domain->zperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
-  if (p_flag[5] && domain->yperiodic == 0)
-    error->all(FLERR,"Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension");
-
-  if (!domain->triclinic && (p_flag[3] || p_flag[4] || p_flag[5]))
-    error->all(FLERR,"Can not specify Pxy/Pxz/Pyz in "
-               "fix nvt/npt/nph with non-triclinic box");
-
-  if (pcouple == XYZ && dimension == 3 &&
-      (p_start[0] != p_start[1] || p_start[0] != p_start[2] ||
-       p_stop[0] != p_stop[1] || p_stop[0] != p_stop[2] ||
-       p_period[0] != p_period[1] || p_period[0] != p_period[2]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph pressure settings");
-  if (pcouple == XYZ && dimension == 2 &&
-      (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] ||
-       p_period[0] != p_period[1]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph pressure settings");
-  if (pcouple == XY &&
-      (p_start[0] != p_start[1] || p_stop[0] != p_stop[1] ||
-       p_period[0] != p_period[1]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph pressure settings");
-  if (pcouple == YZ &&
-      (p_start[1] != p_start[2] || p_stop[1] != p_stop[2] ||
-       p_period[1] != p_period[2]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph pressure settings");
-  if (pcouple == XZ &&
-      (p_start[0] != p_start[2] || p_stop[0] != p_stop[2] ||
-       p_period[0] != p_period[2]))
-    error->all(FLERR,"Invalid fix nvt/npt/nph pressure settings");
-
-  if ((tstat_flag && t_period <= 0.0) ||
-      (p_flag[0] && p_period[0] <= 0.0) ||
-      (p_flag[1] && p_period[1] <= 0.0) ||
-      (p_flag[2] && p_period[2] <= 0.0) ||
-      (p_flag[3] && p_period[3] <= 0.0) ||
-      (p_flag[4] && p_period[4] <= 0.0) ||
-      (p_flag[5] && p_period[5] <= 0.0))
-    error->all(FLERR,"Fix nvt/npt/nph damping parameters must be > 0.0");
-
-  // set pstat_flag and box change variables
-
-  pstat_flag = 0;
-  for (int i = 0; i < 6; i++)
-    if (p_flag[i]) pstat_flag = 1;
-
-  if (pstat_flag) {
-    if (p_flag[0] || p_flag[1] || p_flag[2]) box_change_size = 1;
-    if (p_flag[3] || p_flag[4] || p_flag[5]) box_change_shape = 1;
-    no_change_box = 1;
-    if (allremap == 0) restart_pbc = 1;
-  }
-
-  // pstyle = TRICLINIC if any off-diagonal term is controlled -> 6 dof
-  // else pstyle = ISO if XYZ coupling or XY coupling in 2d -> 1 dof
-  // else pstyle = ANISO -> 3 dof
-
-  if (p_flag[3] || p_flag[4] || p_flag[5]) pstyle = TRICLINIC;
-  else if (pcouple == XYZ || (dimension == 2 && pcouple == XY)) pstyle = ISO;
-  else pstyle = ANISO;
-
-  // convert input periods to frequencies
-
-  t_freq = 0.0;
-  p_freq[0] = p_freq[1] = p_freq[2] = p_freq[3] = p_freq[4] = p_freq[5] = 0.0;
-
-  if (tstat_flag) t_freq = 1.0 / t_period;
-  if (p_flag[0]) p_freq[0] = 1.0 / p_period[0];
-  if (p_flag[1]) p_freq[1] = 1.0 / p_period[1];
-  if (p_flag[2]) p_freq[2] = 1.0 / p_period[2];
-  if (p_flag[3]) p_freq[3] = 1.0 / p_period[3];
-  if (p_flag[4]) p_freq[4] = 1.0 / p_period[4];
-  if (p_flag[5]) p_freq[5] = 1.0 / p_period[5];
-
-  // Nose/Hoover temp and pressure init
-
-  size_vector = 0;
-
-  if (tstat_flag) {
-    int ich;
-    eta = new double[mtchain];
-
-    // add one extra dummy thermostat, set to zero
-
-    eta_dot = new double[mtchain+1];
-    eta_dot[mtchain] = 0.0;
-    eta_dotdot = new double[mtchain];
-    for (ich = 0; ich < mtchain; ich++) {
-      eta[ich] = eta_dot[ich] = eta_dotdot[ich] = 0.0;
-    }
-    eta_mass = new double[mtchain];
-    size_vector += 2*2*mtchain;
-  }
-
-  if (pstat_flag) {
-    omega[0] = omega[1] = omega[2] = 0.0;
-    omega_dot[0] = omega_dot[1] = omega_dot[2] = 0.0;
-    omega_mass[0] = omega_mass[1] = omega_mass[2] = 0.0;
-    omega[3] = omega[4] = omega[5] = 0.0;
-    omega_dot[3] = omega_dot[4] = omega_dot[5] = 0.0;
-    omega_mass[3] = omega_mass[4] = omega_mass[5] = 0.0;
-    if (pstyle == ISO) size_vector += 2*2*1;
-    else if (pstyle == ANISO) size_vector += 2*2*3;
-    else if (pstyle == TRICLINIC) size_vector += 2*2*6;
-
-    if (mpchain) {
-      int ich;
-      etap = new double[mpchain];
-
-      // add one extra dummy thermostat, set to zero
-
-      etap_dot = new double[mpchain+1];
-      etap_dot[mpchain] = 0.0;
-      etap_dotdot = new double[mpchain];
-      for (ich = 0; ich < mpchain; ich++) {
-        etap[ich] = etap_dot[ich] =
-          etap_dotdot[ich] = 0.0;
-      }
-      etap_mass = new double[mpchain];
-      size_vector += 2*2*mpchain;
-    }
-
-    if (deviatoric_flag) size_vector += 1;
-  }
-
-  nrigid = 0;
-  rfix = NULL;
-
-  // initialize vol0,t0 to zero to signal uninitialized
-  // values then assigned in init(), if necessary
-
-  vol0 = t0 = 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixNHCuda::~FixNHCuda()
-{
-  delete [] rfix;
-
-  // delete temperature and pressure if fix created them
-
-  if (tflag) modify->delete_compute(id_temp);
-  delete [] id_temp;
-
-  if (tstat_flag) {
-    delete [] eta;
-    delete [] eta_dot;
-    delete [] eta_dotdot;
-    delete [] eta_mass;
-  }
-
-  if (pstat_flag) {
-    if (pflag) modify->delete_compute(id_press);
-    delete [] id_press;
-    if (mpchain) {
-      delete [] etap;
-      delete [] etap_dot;
-      delete [] etap_dotdot;
-      delete [] etap_mass;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixNHCuda::setmask()
-{
-  int mask = 0;
-  mask |= INITIAL_INTEGRATE_CUDA;
-  mask |= FINAL_INTEGRATE_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  //mask |= INITIAL_INTEGRATE_RESPA;
-  //mask |= FINAL_INTEGRATE_RESPA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::init()
-{
-  // insure no conflict with fix deform
-
-  if (pstat_flag)
-    for (int i = 0; i < modify->nfix; i++)
-      if (strcmp(modify->fix[i]->style,"deform") == 0) {
-        int *dimflag = ((FixDeform *) modify->fix[i])->dimflag;
-        if ((p_flag[0] && dimflag[0]) || (p_flag[1] && dimflag[1]) ||
-            (p_flag[2] && dimflag[2]) || (p_flag[3] && dimflag[3]) ||
-            (p_flag[4] && dimflag[4]) || (p_flag[5] && dimflag[5]))
-          error->all(FLERR,"Cannot use fix npt and fix deform on "
-                     "same component of stress tensor");
-      }
-
-  // set temperature and pressure ptrs
-
-  int icompute = modify->find_compute(id_temp);
-  if (icompute < 0)
-    error->all(FLERR,"Temperature ID for fix nvt/nph/npt does not exist");
-  temperature = modify->compute[icompute];
-
-  if (temperature->tempbias) which = BIAS;
-  else which = NOBIAS;
-
-  if (pstat_flag) {
-    icompute = modify->find_compute(id_press);
-    if (icompute < 0) error->all(FLERR,"Pressure ID for fix npt/nph does not exist");
-    pressure = modify->compute[icompute];
-  }
-
-  // set timesteps and frequencies
-
-  dtv = update->dt;
-  dtf = 0.5 * update->dt * force->ftm2v;
-  dthalf = 0.5 * update->dt;
-  dt4 = 0.25 * update->dt;
-  dt8 = 0.125 * update->dt;
-  dto = dthalf;
-
-  p_freq_max = 0.0;
-  if (pstat_flag) {
-    p_freq_max = MAX(p_freq[0],p_freq[1]);
-    p_freq_max = MAX(p_freq_max,p_freq[2]);
-    if (pstyle == TRICLINIC) {
-      p_freq_max = MAX(p_freq_max,p_freq[3]);
-      p_freq_max = MAX(p_freq_max,p_freq[4]);
-      p_freq_max = MAX(p_freq_max,p_freq[5]);
-    }
-    pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain);
-  }
-
-  if (tstat_flag)
-    tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
-
-  // tally the number of dimensions that are barostatted
-  // also compute the initial volume and reference cell
-  // set initial volume and reference cell, if not already done
-
-  if (pstat_flag) {
-    pdim = p_flag[0] + p_flag[1] + p_flag[2];
-    if (vol0 == 0.0) {
-      if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd;
-      else vol0 = domain->xprd * domain->yprd;
-      h0_inv[0] = domain->h_inv[0];
-      h0_inv[1] = domain->h_inv[1];
-      h0_inv[2] = domain->h_inv[2];
-      h0_inv[3] = domain->h_inv[3];
-      h0_inv[4] = domain->h_inv[4];
-      h0_inv[5] = domain->h_inv[5];
-    }
-  }
-
-  boltz = force->boltz;
-  nktv2p = force->nktv2p;
-
-  if (force->kspace) kspace_flag = 1;
-  else kspace_flag = 0;
-
-  if (strcmp(update->integrate_style,"respa") == 0) {
-    nlevels_respa = ((Respa *) update->integrate)->nlevels;
-    step_respa = ((Respa *) update->integrate)->step;
-    dto = 0.5*step_respa[0];
-  }
-
-  // detect if any rigid fixes exist so rigid bodies move when box is remapped
-  // rfix[] = indices to each fix rigid
-
-  delete [] rfix;
-  nrigid = 0;
-  rfix = NULL;
-
-  for (int i = 0; i < modify->nfix; i++)
-    if (modify->fix[i]->rigid_flag) nrigid++;
-  if (nrigid) {
-    rfix = new int[nrigid];
-    nrigid = 0;
-    for (int i = 0; i < modify->nfix; i++)
-      if (modify->fix[i]->rigid_flag) rfix[nrigid++] = i;
-  }
-  triggerneighsq= cuda->shared_data.atom.triggerneighsq;
-  cuda->neighbor_decide_by_integrator=1;
-  Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf);
-
-}
-
-/* ----------------------------------------------------------------------
-   compute T,P before integrator starts
-------------------------------------------------------------------------- */
-
-void FixNHCuda::setup(int vflag)
-{
-  // initialize some quantities that were not available earlier
-
-  //if (mtk_flag) mtk_factor = 1.0 + 1.0/atom->natoms;
-  //else mtk_factor = 1.0;
-  tdof = temperature->dof;
-
-  // t_target is used by compute_scalar(), even for NPH
-
-  if (tstat_flag) t_target = t_start;
-  else if (pstat_flag) {
-
-    // t0 = initial value for piston mass and energy conservation
-    // cannot be done in init() b/c temperature cannot be called there
-    // is b/c Modify::init() inits computes after fixes due to dof dependence
-    // guesstimate a unit-dependent t0 if actual T = 0.0
-    // if it was read in from a restart file, leave it be
-
-    if (t0 == 0.0) {
-      t0 = temperature->compute_scalar();
-      if (t0 == 0.0) {
-        if (strcmp(update->unit_style,"lj") == 0) t0 = 1.0;
-        else t0 = 300.0;
-      }
-    }
-    t_target = t0;
-  }
-
-  if (pstat_flag) compute_press_target();
-
-  t_current = temperature->compute_scalar();
-  if (pstat_flag) {
-    if (pstyle == ISO) double tmp = pressure->compute_scalar();
-    else pressure->compute_vector();
-    couple();
-    pressure->addstep(update->ntimestep+1);
-  }
-
-  // initial forces on thermostat variables
-
-  if (tstat_flag) {
-    eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
-    for (int ich = 1; ich < mtchain; ich++)
-      eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
-    for (int ich = 1; ich < mtchain; ich++) {
-      eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1] -
-                         boltz*t_target) / eta_mass[ich];
-    }
-  }
-
-  if (pstat_flag) {
-    double kt = boltz * t_target;
-    double nkt = atom->natoms * kt;
-
-    for (int i = 0; i < 3; i++)
-      if (p_flag[i])
-        omega_mass[i] = nkt/(p_freq[i]*p_freq[i]);
-
-    if (pstyle == TRICLINIC) {
-      for (int i = 3; i < 6; i++)
-        if (p_flag[i]) omega_mass[i] = nkt/(p_freq[i]*p_freq[i]);
-    }
-
-  // initial forces on barostat thermostat variables
-
-    if (mpchain) {
-      etap_mass[0] = boltz * t_target / (p_freq_max*p_freq_max);
-      for (int ich = 1; ich < mpchain; ich++)
-        etap_mass[ich] = boltz * t_target / (p_freq_max*p_freq_max);
-      for (int ich = 1; ich < mpchain; ich++)
-        etap_dotdot[ich] =
-          (etap_mass[ich-1]*etap_dot[ich-1]*etap_dot[ich-1] -
-           boltz*t_target) / etap_mass[ich];
-    }
-
-    // compute appropriately coupled elements of mvv_current
-
-    //if (mtk_flag) couple_ke();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   1st half of Verlet update
-------------------------------------------------------------------------- */
-
-void FixNHCuda::initial_integrate(int vflag)
-{
-  if(!temperature->cudable) cuda->downloadAll();
-
-  if(triggerneighsq!=cuda->shared_data.atom.triggerneighsq)
-  {
-        triggerneighsq= cuda->shared_data.atom.triggerneighsq;
-        Cuda_FixNHCuda_Init(&cuda->shared_data,dtv,dtf);
-  }
-
-  // update eta_press_dot
-
-  if (pstat_flag && mpchain) nhc_press_integrate();
-
-  // update eta_dot
-
-  if (tstat_flag) {
-    double delta = update->ntimestep - update->beginstep;
-    delta /= update->endstep - update->beginstep;
-    t_target = t_start + delta * (t_stop-t_start);
-    eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
-    for (int ich = 1; ich < mtchain; ich++)
-      eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
-    nhc_temp_integrate();
-  }
-
-  // need to recompute pressure to account for change in KE
-  // t_current is up-to-date, but compute_temperature is not
-  // compute appropriately coupled elements of mvv_current
-
-  if (pstat_flag) {
-    if (pstyle == ISO) {
-      temperature->compute_scalar();
-      double tmp = pressure->compute_scalar();
-    } else {
-      temperature->compute_vector();
-      pressure->compute_vector();
-    }
-    couple();
-    pressure->addstep(update->ntimestep+1);
-    //if (mtk_flag) couple_ke();
-  }
-
-  if(which==NOBIAS)
-  {
-    if (pstat_flag) {
-      compute_press_target();
-      nh_omega_dot();
-      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
-            factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
-            factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
-      Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
-    }
-    else
-    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-  }
-  else if(which==BIAS)
-  {
-          if(pstat_flag)
-          {
-      compute_press_target();
-      nh_omega_dot();
-            factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
-            factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
-            factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
-            if(!temperature->cudable)
-            {
-                    nh_v_press();
-                    cuda->cu_v->upload();
-            }
-            else
-            {
-               int groupbit_org=temperature->groupbit;
-               temperature->groupbit=groupbit;
-              temperature->remove_bias_all();
-        Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
-              temperature->restore_bias_all();
-            temperature->groupbit=groupbit_org;
-            }
-          }
-    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-  }
-
-  // remap simulation box by 1/2 step
-
-  if (pstat_flag) remap();
-
-  Cuda_FixNHCuda_nve_x(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-
-  // remap simulation box by 1/2 step
-  // redo KSpace coeffs since volume has changed
-
-  if (pstat_flag) {
-    remap();
-    if (kspace_flag) force->kspace->setup();
-  }
-}
-
-/* ----------------------------------------------------------------------
-   2nd half of Verlet update
-------------------------------------------------------------------------- */
-
-void FixNHCuda::final_integrate()
-{
-  if(!temperature->cudable) cuda->downloadAll();
-
-  if(which==NOBIAS)
-  {
-    if(pstat_flag)
-    {
-      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
-      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
-      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
-
-      Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
-    }
-    else
-    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-  }
-  else if(which==BIAS)
-  {
-    Cuda_FixNHCuda_nve_v(&cuda->shared_data,groupbit,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-
-          if(pstat_flag)
-          {
-      factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
-      factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
-      factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
-            if(!temperature->cudable)
-            {
-                    cuda->cu_v->download();
-                    nh_v_press();
-                    cuda->cu_v->upload();
-            }
-            else
-            {
-            int groupbit_org=temperature->groupbit;
-            temperature->groupbit=groupbit;
-             temperature->remove_bias_all();
-        Cuda_FixNHCuda_nh_v_press(&cuda->shared_data, groupbit, factor,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal,(pstyle == TRICLINIC)?1:0);
-              temperature->restore_bias_all();
-            temperature->groupbit=groupbit_org;
-            }
-          }
-  }
-  // compute new T,P
-  // compute appropriately coupled elements of mvv_current
-
-  if(!temperature->cudable)        cuda->cu_v->download();
-  t_current = temperature->compute_scalar();
-  if (pstat_flag) {
-    if (pstyle == ISO) double tmp = pressure->compute_scalar();
-    else pressure->compute_vector();
-    couple();
-    pressure->addstep(update->ntimestep+1);
-  }
-
-  if (pstat_flag) nh_omega_dot();
-
-  // update eta_dot
-  // update eta_press_dot
-
-  if (tstat_flag) nhc_temp_integrate();
-  if (pstat_flag && mpchain) nhc_press_integrate();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::initial_integrate_respa(int vflag, int ilevel, int iloop)
-{
-  // set timesteps by level
-
-  dtv = step_respa[ilevel];
-  dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
-  dthalf = 0.5 * step_respa[ilevel];
-
-  // outermost level - update eta_dot and omega_dot, apply to v, remap box
-  // all other levels - NVE update of v
-  // x,v updates only performed for atoms in group
-
-  if (ilevel == nlevels_respa-1) {
-
-    // update eta_press_dot
-
-    if (pstat_flag && mpchain) nhc_press_integrate();
-
-    // update eta_dot
-
-    if (tstat_flag) {
-      double delta = update->ntimestep - update->beginstep;
-      delta /= update->endstep - update->beginstep;
-      t_target = t_start + delta * (t_stop-t_start);
-      eta_mass[0] = tdof * boltz * t_target / (t_freq*t_freq);
-      for (int ich = 1; ich < mtchain; ich++)
-        eta_mass[ich] = boltz * t_target / (t_freq*t_freq);
-      nhc_temp_integrate();
-    }
-
-    // recompute pressure to account for change in KE
-    // t_current is up-to-date, but compute_temperature is not
-    // compute appropriately coupled elements of mvv_current
-
-    if (pstat_flag) {
-      if (pstyle == ISO) {
-        temperature->compute_scalar();
-        double tmp = pressure->compute_scalar();
-      } else {
-               temperature->compute_vector();
-        pressure->compute_vector();
-      }
-      couple();
-      pressure->addstep(update->ntimestep+1);
-      if (mtk_flag) couple_ke();
-    }
-
-    if (pstat_flag) {
-      compute_press_target();
-      nh_omega_dot();
-      nh_v_press();
-    }
-
-    nve_v();
-
-  } else nve_v();
-
-  // innermost level - also update x only for atoms in group
-  // if barostat, perform 1/2 step remap before and after
-
-  if (ilevel == 0) {
-    if (pstat_flag) remap();
-    nve_x();
-    if (pstat_flag) remap();
-  }
-
-  // if barostat, redo KSpace coeffs at outermost level,
-  // since volume has changed
-
-  if (ilevel == nlevels_respa-1 && kspace_flag && pstat_flag)
-    force->kspace->setup();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::final_integrate_respa(int ilevel, int iloop)
-{
-  // set timesteps by level
-
-  dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
-  dthalf = 0.5 * step_respa[ilevel];
-
-  // outermost level - update eta_dot and omega_dot, apply via final_integrate
-  // all other levels - NVE update of v
-
-  if (ilevel == nlevels_respa-1) final_integrate();
-  else nve_v();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::couple()
-{
-  double *tensor = pressure->vector;
-
-  if (pstyle == ISO)
-    p_current[0] = p_current[1] = p_current[2] = pressure->scalar;
-  else if (pcouple == XYZ) {
-    double ave = 1.0/3.0 * (tensor[0] + tensor[1] + tensor[2]);
-    p_current[0] = p_current[1] = p_current[2] = ave;
-  } else if (pcouple == XY) {
-    double ave = 0.5 * (tensor[0] + tensor[1]);
-    p_current[0] = p_current[1] = ave;
-    p_current[2] = tensor[2];
-  } else if (pcouple == YZ) {
-    double ave = 0.5 * (tensor[1] + tensor[2]);
-    p_current[1] = p_current[2] = ave;
-    p_current[0] = tensor[0];
-  } else if (pcouple == XZ) {
-    double ave = 0.5 * (tensor[0] + tensor[2]);
-    p_current[0] = p_current[2] = ave;
-    p_current[1] = tensor[1];
-  } else {
-    p_current[0] = tensor[0];
-    p_current[1] = tensor[1];
-    p_current[2] = tensor[2];
-  }
-
-  // switch order from xy-xz-yz to Voigt
-
-  if (pstyle == TRICLINIC) {
-    p_current[3] = tensor[5];
-    p_current[4] = tensor[4];
-    p_current[5] = tensor[3];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::couple_ke()
-{
-  double *tensor = temperature->vector;
-  if (pstyle == ISO)
-    mvv_current[0] = mvv_current[1] = mvv_current[2] =
-      tdof * boltz * t_current/dimension;
-  else if (pcouple == XYZ) {
-    double ave = 1.0/3.0 * (tensor[0] + tensor[1] + tensor[2]);
-    mvv_current[0] = mvv_current[1] = mvv_current[2] = ave;
-  } else if (pcouple == XY) {
-    double ave = 0.5 * (tensor[0] + tensor[1]);
-    mvv_current[0] = mvv_current[1] = ave;
-    mvv_current[2] = tensor[2];
-  } else if (pcouple == YZ) {
-    double ave = 0.5 * (tensor[1] + tensor[2]);
-    mvv_current[1] = mvv_current[2] = ave;
-    mvv_current[0] = tensor[0];
-  } else if (pcouple == XZ) {
-    double ave = 0.5 * (tensor[0] + tensor[2]);
-    mvv_current[0] = mvv_current[2] = ave;
-    mvv_current[1] = tensor[1];
-  } else {
-    mvv_current[0] = tensor[0];
-    mvv_current[1] = tensor[1];
-    mvv_current[2] = tensor[2];
-  }
-}
-
-/* ----------------------------------------------------------------------
-   change box size
-   remap all atoms or fix group atoms depending on allremap flag
-   if rigid bodies exist, scale rigid body centers-of-mass
-------------------------------------------------------------------------- */
-
-void FixNHCuda::remap()
-{
-  int i;
-  double oldlo,oldhi,ctr;
-
-  double **x = atom->x;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  double *h = domain->h;
-
-  // omega is not used, except for book-keeping
-
-  for (int i = 0; i < 6; i++) omega[i] += dto*omega_dot[i];
-
-  // convert pertinent atoms and rigid bodies to lamda coords
-  if (allremap) domain->x2lamda(nlocal);
-  else {
-    for (i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        domain->x2lamda(x[i],x[i]);
-  }
-
-  if (nrigid)
-    for (i = 0; i < nrigid; i++)
-      modify->fix[rfix[i]]->deform(0);
-
-  // reset global and local box to new size/shape
-
-  // This operation corresponds to applying the
-  // translate and scale operations
-  // corresponding to the solution of the following ODE:
-  //
-  // h_dot = omega_dot * h
-  //
-  // where h_dot, omega_dot and h are all upper-triangular
-  // 3x3 tensors. In Voigt notation, the elements of the
-  // RHS product tensor are:
-  // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1]
-  //
-  // Ordering of operations preserves time symmetry.
-
-  double dto2 = dto/2.0;
-  double dto4 = dto/4.0;
-  double dto8 = dto/8.0;
-
-  if (pstyle == TRICLINIC) {
-
-    h[4] *= exp(dto8*omega_dot[0]);
-    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
-    h[4] *= exp(dto8*omega_dot[0]);
-
-    h[3] *= exp(dto4*omega_dot[1]);
-    h[3] += dto2*(omega_dot[3]*h[2]);
-    h[3] *= exp(dto4*omega_dot[1]);
-
-    h[5] *= exp(dto4*omega_dot[0]);
-    h[5] += dto2*(omega_dot[5]*h[1]);
-    h[5] *= exp(dto4*omega_dot[0]);
-
-    h[4] *= exp(dto8*omega_dot[0]);
-    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
-    h[4] *= exp(dto8*omega_dot[0]);
-
-  }
-
-  for (i = 0; i < 3; i++) {
-    if (p_flag[i]) {
-      oldlo = domain->boxlo[i];
-      oldhi = domain->boxhi[i];
-      ctr = 0.5 * (oldlo + oldhi);
-      domain->boxlo[i] = (oldlo-ctr)*exp(dto*omega_dot[i]) + ctr;
-      domain->boxhi[i] = (oldhi-ctr)*exp(dto*omega_dot[i]) + ctr;
-    }
-  }
-
-  if (pstyle == TRICLINIC) {
-
-    h[4] *= exp(dto8*omega_dot[0]);
-    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
-    h[4] *= exp(dto8*omega_dot[0]);
-
-    h[3] *= exp(dto4*omega_dot[1]);
-    h[3] += dto2*(omega_dot[3]*h[2]);
-    h[3] *= exp(dto4*omega_dot[1]);
-
-    h[5] *= exp(dto4*omega_dot[0]);
-    h[5] += dto2*(omega_dot[5]*h[1]);
-    h[5] *= exp(dto4*omega_dot[0]);
-
-    h[4] *= exp(dto8*omega_dot[0]);
-    h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]);
-    h[4] *= exp(dto8*omega_dot[0]);
-
-    domain->yz = h[3];
-    domain->xz = h[4];
-    domain->xy = h[5];
-
-    if (domain->yz < -0.5*domain->yprd || domain->yz > 0.5*domain->yprd ||
-        domain->xz < -0.5*domain->xprd || domain->xz > 0.5*domain->xprd ||
-        domain->xy < -0.5*domain->xprd || domain->xy > 0.5*domain->xprd)
-      error->all(FLERR,"Fix npt/nph has tilted box too far - "
-                 "box flips are not yet implemented");
-  }
-
-  domain->set_global_box();
-  domain->set_local_box();
-
-  // convert pertinent atoms and rigid bodies back to box coords
-
-  if (allremap) domain->lamda2x(nlocal);
-  else {
-    for (i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit)
-        domain->lamda2x(x[i],x[i]);
-  }
-
-  if (nrigid)
-    for (i = 0; i < nrigid; i++)
-      modify->fix[rfix[i]]->deform(1);
-}
-
-/* ----------------------------------------------------------------------
-   pack entire state of Fix into one write
-------------------------------------------------------------------------- */
-
-void FixNHCuda::write_restart(FILE *fp)
-{
-  int nsize = 2;
-  if (tstat_flag) nsize += 1 + 2*mtchain;
-  if (pstat_flag) {
-    nsize += 16 + 2*mpchain;
-    if (deviatoric_flag) nsize += 6;
-  }
-
-  double* list = (double *) memory->smalloc(nsize*sizeof(double),"nh:list");
-
-  int n = 0;
-
-  list[n++] = tstat_flag;
-  if (tstat_flag) {
-    list[n++] = mtchain;
-    for (int ich = 0; ich < mtchain; ich++)
-      list[n++] = eta[ich];
-    for (int ich = 0; ich < mtchain; ich++)
-      list[n++] = eta_dot[ich];
-  }
-
-  list[n++] = pstat_flag;
-  if (pstat_flag) {
-    list[n++] = omega[0];
-    list[n++] = omega[1];
-    list[n++] = omega[2];
-    list[n++] = omega[3];
-    list[n++] = omega[4];
-    list[n++] = omega[5];
-    list[n++] = omega_dot[0];
-    list[n++] = omega_dot[1];
-    list[n++] = omega_dot[2];
-    list[n++] = omega_dot[3];
-    list[n++] = omega_dot[4];
-    list[n++] = omega_dot[5];
-    list[n++] = vol0;
-    list[n++] = t0;
-    list[n++] = mpchain;
-    if (mpchain) {
-      for (int ich = 0; ich < mpchain; ich++)
-        list[n++] = etap[ich];
-      for (int ich = 0; ich < mpchain; ich++)
-        list[n++] = etap_dot[ich];
-    }
-
-    list[n++] = deviatoric_flag;
-    if (deviatoric_flag) {
-      list[n++] = h0_inv[0];
-      list[n++] = h0_inv[1];
-      list[n++] = h0_inv[2];
-      list[n++] = h0_inv[3];
-      list[n++] = h0_inv[4];
-      list[n++] = h0_inv[5];
-    }
-  }
-
-  if (comm->me == 0) {
-    int size = nsize * sizeof(double);
-    fwrite(&size,sizeof(int),1,fp);
-    fwrite(list,sizeof(double),nsize,fp);
-  }
-
-  memory->sfree(list);
-}
-
-/* ----------------------------------------------------------------------
-   use state info from restart file to restart the Fix
-------------------------------------------------------------------------- */
-
-void FixNHCuda::restart(char *buf)
-{
-  int n = 0;
-  double *list = (double *) buf;
-  int flag = static_cast<int> (list[n++]);
-  if (flag) {
-    int m = static_cast<int> (list[n++]);
-    if (tstat_flag && m == mtchain) {
-      for (int ich = 0; ich < mtchain; ich++)
-        eta[ich] = list[n++];
-      for (int ich = 0; ich < mtchain; ich++)
-        eta_dot[ich] = list[n++];
-    } else n += 2*m;
-  }
-  flag = static_cast<int> (list[n++]);
-  if (flag) {
-    omega[0] = list[n++];
-    omega[1] = list[n++];
-    omega[2] = list[n++];
-    omega[3] = list[n++];
-    omega[4] = list[n++];
-    omega[5] = list[n++];
-    omega_dot[0] = list[n++];
-    omega_dot[1] = list[n++];
-    omega_dot[2] = list[n++];
-    omega_dot[3] = list[n++];
-    omega_dot[4] = list[n++];
-    omega_dot[5] = list[n++];
-    vol0 = list[n++];
-    t0 = list[n++];
-    int m = static_cast<int> (list[n++]);
-    if (pstat_flag && m == mpchain) {
-      for (int ich = 0; ich < mpchain; ich++)
-        etap[ich] = list[n++];
-      for (int ich = 0; ich < mpchain; ich++)
-        etap_dot[ich] = list[n++];
-    } else n+=2*m;
-    flag = static_cast<int> (list[n++]);
-    if (flag) {
-      h0_inv[0] = list[n++];
-      h0_inv[1] = list[n++];
-      h0_inv[2] = list[n++];
-      h0_inv[3] = list[n++];
-      h0_inv[4] = list[n++];
-      h0_inv[5] = list[n++];
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixNHCuda::modify_param(int narg, char **arg)
-{
-  if (strcmp(arg[0],"temp") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
-    if (tflag) {
-      modify->delete_compute(id_temp);
-      tflag = 0;
-    }
-    delete [] id_temp;
-    int n = strlen(arg[1]) + 1;
-    id_temp = new char[n];
-    strcpy(id_temp,arg[1]);
-
-    int icompute = modify->find_compute(arg[1]);
-    if (icompute < 0) error->all(FLERR,"Could not find fix_modify temperature ID");
-    temperature = modify->compute[icompute];
-
-    if (temperature->tempflag == 0)
-      error->all(FLERR,"Fix_modify temperature ID does not compute temperature");
-    if (temperature->igroup != 0 && comm->me == 0)
-      error->warning(FLERR,"Temperature for fix modify is not for group all");
-
-    // reset id_temp of pressure to new temperature ID
-
-    if (pstat_flag) {
-      icompute = modify->find_compute(id_press);
-      if (icompute < 0)
-        error->all(FLERR,"Pressure ID for fix modify does not exist");
-      modify->compute[icompute]->reset_extra_compute_fix(id_temp);
-    }
-
-    return 2;
-
-  } else if (strcmp(arg[0],"press") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
-    if (!pstat_flag) error->all(FLERR,"Illegal fix_modify command");
-    if (pflag) {
-      modify->delete_compute(id_press);
-      pflag = 0;
-    }
-    delete [] id_press;
-    int n = strlen(arg[1]) + 1;
-    id_press = new char[n];
-    strcpy(id_press,arg[1]);
-
-    int icompute = modify->find_compute(arg[1]);
-    if (icompute < 0) error->all(FLERR,"Could not find fix_modify pressure ID");
-    pressure = modify->compute[icompute];
-
-    if (pressure->pressflag == 0)
-      error->all(FLERR,"Fix_modify pressure ID does not compute pressure");
-    return 2;
-  }
-
-  return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double FixNHCuda::compute_scalar()
-{
-  int i;
-  double volume;
-  double energy;
-  double kt = boltz * t_target;
-  double lkt = tdof * kt;
-  double lkt_press = kt;
-  int ich;
-  if (dimension == 3) volume = domain->xprd * domain->yprd * domain->zprd;
-  else volume = domain->xprd * domain->yprd;
-
-  energy = 0.0;
-
-  // thermostat chain energy is equivalent to Eq. (2) in
-  // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117
-  // Sum(0.5*p_eta_k^2/Q_k,k=1,M) + L*k*T*eta_1 + Sum(k*T*eta_k,k=2,M),
-  // where L = tdof
-  //       M = mtchain
-  //       p_eta_k = Q_k*eta_dot[k-1]
-  //       Q_1 = L*k*T/t_freq^2
-  //       Q_k = k*T/t_freq^2, k > 1
-
-  if (tstat_flag) {
-    energy += lkt * eta[0] + 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0];
-    for (ich = 1; ich < mtchain; ich++)
-      energy += kt * eta[ich] + 0.5*eta_mass[ich]*eta_dot[ich]*eta_dot[ich];
-  }
-
-  // barostat energy is equivalent to Eq. (8) in
-  // Martyna, Tuckerman, Tobias, Klein, Mol Phys, 87, 1117
-  // Sum(0.5*p_omega^2/W + P*V),
-  // where N = natoms
-  //       p_omega = W*omega_dot
-  //       W = N*k*T/p_freq^2
-  //       sum is over barostatted dimensions
-
-  if (pstat_flag) {
-    for (i = 0; i < 3; i++)
-      if (p_flag[i])
-        energy += 0.5*omega_dot[i]*omega_dot[i]*omega_mass[i] +
-          p_hydro*(volume-vol0) / (pdim*nktv2p);
-
-    if (pstyle == TRICLINIC) {
-      for (i = 3; i < 6; i++)
-        if (p_flag[i])
-          energy += 0.5*omega_dot[i]*omega_dot[i]*omega_mass[i];
-    }
-
-    // extra contributions from thermostat chain for barostat
-
-    if (mpchain) {
-      energy += lkt_press * etap[0] + 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0];
-      for (ich = 1; ich < mpchain; ich++)
-        energy += kt * etap[ich] +
-          0.5*etap_mass[ich]*etap_dot[ich]*etap_dot[ich];
-    }
-
-    // extra contribution from strain energy
-
-    if (deviatoric_flag) energy += compute_strain_energy();
-  }
-
-  return energy;
-}
-
-/* ----------------------------------------------------------------------
-   return a single element of the following vectors, in this order:
-      eta[tchain], eta_dot[tchain], omega[ndof], omega_dot[ndof]
-      etap[pchain], etap_dot[pchain], PE_eta[tchain], KE_eta_dot[tchain]
-      PE_omega[ndof], KE_omega_dot[ndof], PE_etap[pchain], KE_etap_dot[pchain]
-      PE_strain[1]
-  if no thermostat exists, related quantities are omitted from the list
-  if no barostat exists, related quantities are omitted from the list
-  ndof = 1,3,6 degrees of freedom for pstyle = ISO,ANISO,TRI
-------------------------------------------------------------------------- */
-
-double FixNHCuda::compute_vector(int n)
-{
-  int ilen;
-
-  if (tstat_flag) {
-    ilen = mtchain;
-    if (n < ilen) return eta[n];
-    n -= ilen;
-    ilen = mtchain;
-    if (n < ilen) return eta_dot[n];
-    n -= ilen;
-  }
-
-  if (pstat_flag) {
-    if (pstyle == ISO) {
-      ilen = 1;
-      if (n < ilen) return omega[n];
-      n -= ilen;
-    } else if (pstyle == ANISO) {
-      ilen = 3;
-      if (n < ilen) return omega[n];
-      n -= ilen;
-    } else {
-      ilen = 6;
-      if (n < ilen) return omega[n];
-      n -= ilen;
-    }
-
-    if (pstyle == ISO) {
-      ilen = 1;
-      if (n < ilen) return omega_dot[n];
-      n -= ilen;
-    } else if (pstyle == ANISO) {
-      ilen = 3;
-      if (n < ilen) return omega_dot[n];
-      n -= ilen;
-    } else {
-      ilen = 6;
-      if (n < ilen) return omega_dot[n];
-      n -= ilen;
-    }
-
-    if (mpchain) {
-      ilen = mpchain;
-      if (n < ilen) return etap[n];
-      n -= ilen;
-      ilen = mpchain;
-      if (n < ilen) return etap_dot[n];
-      n -= ilen;
-    }
-  }
-
-  double volume;
-  double kt = boltz * t_target;
-  double lkt = tdof * kt;
-  double lkt_press = kt;
-  int ich;
-  if (dimension == 3) volume = domain->xprd * domain->yprd * domain->zprd;
-  else volume = domain->xprd * domain->yprd;
-
-  if (tstat_flag) {
-    ilen = mtchain;
-    if (n < ilen) {
-      ich = n;
-      if (ich == 0)
-        return lkt * eta[0];
-      else
-        return kt * eta[ich];
-    }
-    n -= ilen;
-    ilen = mtchain;
-    if (n < ilen) {
-      ich = n;
-      if (ich == 0)
-        return 0.5*eta_mass[0]*eta_dot[0]*eta_dot[0];
-      else
-        return 0.5*eta_mass[ich]*eta_dot[ich]*eta_dot[ich];
-    }
-    n -= ilen;
-  }
-
-  if (pstat_flag) {
-    if (pstyle == ISO) {
-      ilen = 1;
-      if (n < ilen)
-        return p_hydro*(volume-vol0) / nktv2p;
-      n -= ilen;
-    } else if (pstyle == ANISO) {
-      ilen = 3;
-      if (n < ilen)
-        if (p_flag[n])
-          return p_hydro*(volume-vol0) / (pdim*nktv2p);
-        else
-          return 0.0;
-      n -= ilen;
-    } else {
-      ilen = 6;
-      if (n < ilen)
-        if (n > 2) return 0.0;
-        else if (p_flag[n])
-          return p_hydro*(volume-vol0) / (pdim*nktv2p);
-        else
-          return 0.0;
-      n -= ilen;
-    }
-
-    if (pstyle == ISO) {
-      ilen = 1;
-      if (n < ilen)
-        return pdim*0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
-      n -= ilen;
-    } else if (pstyle == ANISO) {
-      ilen = 3;
-      if (n < ilen)
-        if (p_flag[n])
-          return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
-        else return 0.0;
-      n -= ilen;
-    } else {
-      ilen = 6;
-      if (n < ilen)
-        if (p_flag[n])
-          return 0.5*omega_dot[n]*omega_dot[n]*omega_mass[n];
-        else return 0.0;
-      n -= ilen;
-    }
-
-    if (mpchain) {
-      ilen = mpchain;
-      if (n < ilen) {
-        ich = n;
-        if (ich == 0) return lkt_press * etap[0];
-        else return kt * etap[ich];
-      }
-      n -= ilen;
-      ilen = mpchain;
-      if (n < ilen) {
-        ich = n;
-        if (ich == 0)
-          return 0.5*etap_mass[0]*etap_dot[0]*etap_dot[0];
-        else
-          return 0.5*etap_mass[ich]*etap_dot[ich]*etap_dot[ich];
-      }
-      n -= ilen;
-    }
-
-    if (deviatoric_flag) {
-      ilen = 1;
-      if (n < ilen)
-        return compute_strain_energy();
-      n -= ilen;
-    }
-  }
-
-  return 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNHCuda::reset_dt()
-{
-  dtv = update->dt;
-  dtf = 0.5 * update->dt * force->ftm2v;
-  dthalf = 0.5 * update->dt;
-  dt4 = 0.25 * update->dt;
-  dt8 = 0.125 * update->dt;
-  dto = dthalf;
-
-  // If using respa, then remap is performed in innermost level
-
-  if (strcmp(update->integrate_style,"respa") == 0)
-    dto = 0.5*step_respa[0];
-
-  p_freq_max = 0.0;
-  if (pstat_flag) {
-    p_freq_max = MAX(p_freq[0],p_freq[1]);
-    p_freq_max = MAX(p_freq_max,p_freq[2]);
-    if (pstyle == TRICLINIC) {
-      p_freq_max = MAX(p_freq_max,p_freq[3]);
-      p_freq_max = MAX(p_freq_max,p_freq[4]);
-      p_freq_max = MAX(p_freq_max,p_freq[5]);
-    }
-    pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain);
-  }
-
-  if (tstat_flag)
-    tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain);
-}
-
-/* ----------------------------------------------------------------------
-   perform half-step update of chain thermostat variables
-------------------------------------------------------------------------- */
-
-void FixNHCuda::nhc_temp_integrate()
-{
-  int ich;
-  double expfac;
-
-  double lkt = tdof * boltz * t_target;
-  double kecurrent = tdof * boltz * t_current;
-  eta_dotdot[0] = (kecurrent - lkt)/eta_mass[0];
-
-  double ncfac = 1.0/nc_tchain;
-  for (int iloop = 0; iloop < nc_tchain; iloop++) {
-
-    for (ich = mtchain-1; ich > 0; ich--) {
-      expfac = exp(-ncfac*dt8*eta_dot[ich+1]);
-      eta_dot[ich] *= expfac;
-      eta_dot[ich] += eta_dotdot[ich] * ncfac*dt4;
-      eta_dot[ich] *= tdrag_factor;
-      eta_dot[ich] *= expfac;
-    }
-
-    expfac = exp(-ncfac*dt8*eta_dot[1]);
-    eta_dot[0] *= expfac;
-    eta_dot[0] += eta_dotdot[0] * ncfac*dt4;
-    eta_dot[0] *= tdrag_factor;
-    eta_dot[0] *= expfac;
-
-    factor_eta = exp(-ncfac*dthalf*eta_dot[0]);
-        if(which==NOBIAS)
-        Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-    else if(which==BIAS)
-    {
-            if(!temperature->cudable)
-            {
-                    cuda->downloadAll();
-                        nh_v_temp();
-                        cuda->cu_v->upload();
-            }
-            else
-            {
-               int groupbit_org=temperature->groupbit;
-               temperature->groupbit=groupbit;
-               temperature->remove_bias_all();
-                        Cuda_FixNHCuda_nh_v_temp(&cuda->shared_data,groupbit,factor_eta,(igroup == atom->firstgroup)?atom->nfirst:atom->nlocal);
-               temperature->restore_bias_all();
-               temperature->groupbit=groupbit_org;
-            }
-
-    }
-    // rescale temperature due to velocity scaling
-    // should not be necessary to explicitly recompute the temperature
-
-    t_current *= factor_eta*factor_eta;
-    kecurrent = tdof * boltz * t_current;
-    eta_dotdot[0] = (kecurrent - lkt)/eta_mass[0];
-
-    for (ich = 0; ich < mtchain; ich++)
-      eta[ich] += ncfac*dthalf*eta_dot[ich];
-
-    eta_dot[0] *= expfac;
-    eta_dot[0] += eta_dotdot[0] * ncfac*dt4;
-    eta_dot[0] *= expfac;
-
-    for (ich = 1; ich < mtchain; ich++) {
-      expfac = exp(-ncfac*dt8*eta_dot[ich+1]);
-      eta_dot[ich] *= expfac;
-      eta_dotdot[ich] = (eta_mass[ich-1]*eta_dot[ich-1]*eta_dot[ich-1]
-                         - boltz * t_target)/eta_mass[ich];
-      eta_dot[ich] += eta_dotdot[ich] * ncfac*dt4;
-      eta_dot[ich] *= expfac;
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform half-step update of chain thermostat variables for barostat
-   scale barostat velocities
-------------------------------------------------------------------------- */
-
-void FixNHCuda::nhc_press_integrate()
-{
-  int ich,i;
-  double expfac,factor_etap,kecurrent;
-  double kt = boltz * t_target;
-  double lkt_press = kt;
-
-  kecurrent = 0.0;
-  for (i = 0; i < 3; i++)
-    if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i];
-
-  if (pstyle == TRICLINIC) {
-    for (i = 3; i < 6; i++)
-      if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i];
-  }
-
-  etap_dotdot[0] = (kecurrent - lkt_press)/etap_mass[0];
-
-  double ncfac = 1.0/nc_pchain;
-  for (int iloop = 0; iloop < nc_pchain; iloop++) {
-
-    for (ich = mpchain-1; ich > 0; ich--) {
-      expfac = exp(-ncfac*dt8*etap_dot[ich+1]);
-      etap_dot[ich] *= expfac;
-      etap_dot[ich] += etap_dotdot[ich] * ncfac*dt4;
-      etap_dot[ich] *= pdrag_factor;
-      etap_dot[ich] *= expfac;
-    }
-
-    expfac = exp(-ncfac*dt8*etap_dot[1]);
-    etap_dot[0] *= expfac;
-    etap_dot[0] += etap_dotdot[0] * ncfac*dt4;
-    etap_dot[0] *= pdrag_factor;
-    etap_dot[0] *= expfac;
-
-    for (ich = 0; ich < mpchain; ich++)
-      etap[ich] += ncfac*dthalf*etap_dot[ich];
-
-    factor_etap = exp(-ncfac*dthalf*etap_dot[0]);
-    for (i = 0; i < 3; i++)
-      if (p_flag[i]) omega_dot[i] *= factor_etap;
-
-    if (pstyle == TRICLINIC) {
-      for (i = 3; i < 6; i++)
-        if (p_flag[i]) omega_dot[i] *= factor_etap;
-    }
-
-    kecurrent = 0.0;
-    for (i = 0; i < 3; i++)
-      if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i];
-
-    if (pstyle == TRICLINIC) {
-      for (i = 3; i < 6; i++)
-        if (p_flag[i]) kecurrent += omega_mass[i]*omega_dot[i]*omega_dot[i];
-    }
-
-    etap_dotdot[0] = (kecurrent - lkt_press)/etap_mass[0];
-
-    etap_dot[0] *= expfac;
-    etap_dot[0] += etap_dotdot[0] * ncfac*dt4;
-    etap_dot[0] *= expfac;
-
-    for (ich = 1; ich < mpchain; ich++) {
-      expfac = exp(-ncfac*dt8*etap_dot[ich+1]);
-      etap_dot[ich] *= expfac;
-      etap_dotdot[ich] =
-        (etap_mass[ich-1]*etap_dot[ich-1]*etap_dot[ich-1] - boltz*t_target) /
-        etap_mass[ich];
-      etap_dot[ich] += etap_dotdot[ich] * ncfac*dt4;
-      etap_dot[ich] *= expfac;
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform half-step barostat scaling of velocities
------------------------------------------------------------------------*/
-
-void FixNHCuda::nh_v_press()
-{
-  double factor[3];
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  factor[0] = exp(-dt4*(omega_dot[0]+mtk_term2));
-  factor[1] = exp(-dt4*(omega_dot[1]+mtk_term2));
-  factor[2] = exp(-dt4*(omega_dot[2]+mtk_term2));
-
-  if (which == NOBIAS) {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        v[i][0] *= factor[0];
-        v[i][1] *= factor[1];
-        v[i][2] *= factor[2];
-        if (pstyle == TRICLINIC) {
-          v[i][0] += -dthalf*(v[i][1]*omega_dot[5] + v[i][2]*omega_dot[4]);
-          v[i][1] += -dthalf*v[i][2]*omega_dot[3];
-        }
-        v[i][0] *= factor[0];
-        v[i][1] *= factor[1];
-        v[i][2] *= factor[2];
-      }
-    }
-  } else if (which == BIAS) {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        temperature->remove_bias(i,v[i]);
-        v[i][0] *= factor[0];
-        v[i][1] *= factor[1];
-        v[i][2] *= factor[2];
-        if (pstyle == TRICLINIC) {
-          v[i][0] += -dthalf*(v[i][1]*omega_dot[5] + v[i][2]*omega_dot[4]);
-          v[i][1] += -dthalf*v[i][2]*omega_dot[3];
-        }
-        v[i][0] *= factor[0];
-        v[i][1] *= factor[1];
-        v[i][2] *= factor[2];
-        temperature->restore_bias(i,v[i]);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform half-step update of velocities
------------------------------------------------------------------------*/
-
-void FixNHCuda::nve_v()
-{
-  double dtfm;
-  double **v = atom->v;
-  double **f = atom->f;
-  double *rmass = atom->rmass;
-  double *mass = atom->mass;
-  int *type = atom->type;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  if (rmass) {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        dtfm = dtf / rmass[i];
-        v[i][0] += dtfm*f[i][0];
-        v[i][1] += dtfm*f[i][1];
-        v[i][2] += dtfm*f[i][2];
-      }
-    }
-  } else {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        dtfm = dtf / mass[type[i]];
-        v[i][0] += dtfm*f[i][0];
-        v[i][1] += dtfm*f[i][1];
-        v[i][2] += dtfm*f[i][2];
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform full-step update of positions
------------------------------------------------------------------------*/
-
-void FixNHCuda::nve_x()
-{
-  double **x = atom->x;
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  // x update by full step only for atoms in group
-
-  for (int i = 0; i < nlocal; i++) {
-    if (mask[i] & groupbit) {
-      x[i][0] += dtv * v[i][0];
-      x[i][1] += dtv * v[i][1];
-      x[i][2] += dtv * v[i][2];
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   perform half-step thermostat scaling of velocities
------------------------------------------------------------------------*/
-
-void FixNHCuda::nh_v_temp()
-{
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-  if (igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-  if (which == NOBIAS) {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        v[i][0] *= factor_eta;
-        v[i][1] *= factor_eta;
-        v[i][2] *= factor_eta;
-      }
-    }
-  } else if (which == BIAS) {
-    for (int i = 0; i < nlocal; i++) {
-      if (mask[i] & groupbit) {
-        temperature->remove_bias(i,v[i]);
-        v[i][0] *= factor_eta;
-        v[i][1] *= factor_eta;
-        v[i][2] *= factor_eta;
-        temperature->restore_bias(i,v[i]);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   compute sigma tensor
-   needed whenever p_target or h0_inv changes
------------------------------------------------------------------------*/
-
-void FixNHCuda::compute_sigma()
-{
-  // if nreset_h0 > 0, reset vol0 and h0_inv
-  // every nreset_h0 timesteps
-
-  if (nreset_h0 > 0) {
-    int delta = update->ntimestep - update->beginstep;
-    if (delta % nreset_h0 == 0) {
-      if (dimension == 3) vol0 = domain->xprd * domain->yprd * domain->zprd;
-      else vol0 = domain->xprd * domain->yprd;
-      h0_inv[0] = domain->h_inv[0];
-      h0_inv[1] = domain->h_inv[1];
-      h0_inv[2] = domain->h_inv[2];
-      h0_inv[3] = domain->h_inv[3];
-      h0_inv[4] = domain->h_inv[4];
-      h0_inv[5] = domain->h_inv[5];
-    }
-  }
-
-  // generate upper-triangular half of
-  // sigma = vol0*h0inv*(p_target-p_hydro)*h0inv^t
-  // units of sigma are are PV/L^2 e.g. atm.A
-  //
-  // [ 0 5 4 ]   [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ]
-  // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ]
-  // [ 4 3 2 ]   [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ]
-
-  sigma[0] =
-    vol0*(h0_inv[0]*((p_target[0]-p_hydro)*h0_inv[0] +
-                     p_target[5]*h0_inv[5]+p_target[4]*h0_inv[4]) +
-          h0_inv[5]*(p_target[5]*h0_inv[0] +
-                     (p_target[1]-p_hydro)*h0_inv[5]+p_target[3]*h0_inv[4]) +
-          h0_inv[4]*(p_target[4]*h0_inv[0]+p_target[3]*h0_inv[5] +
-                     (p_target[2]-p_hydro)*h0_inv[4]));
-  sigma[1] =
-    vol0*(h0_inv[1]*((p_target[1]-p_hydro)*h0_inv[1] +
-                     p_target[3]*h0_inv[3]) +
-          h0_inv[3]*(p_target[3]*h0_inv[1] +
-                     (p_target[2]-p_hydro)*h0_inv[3]));
-  sigma[2] =
-    vol0*(h0_inv[2]*((p_target[2]-p_hydro)*h0_inv[2]));
-  sigma[3] =
-    vol0*(h0_inv[1]*(p_target[3]*h0_inv[2]) +
-          h0_inv[3]*((p_target[2]-p_hydro)*h0_inv[2]));
-  sigma[4] =
-    vol0*(h0_inv[0]*(p_target[4]*h0_inv[2]) +
-          h0_inv[5]*(p_target[3]*h0_inv[2]) +
-          h0_inv[4]*((p_target[2]-p_hydro)*h0_inv[2]));
-  sigma[5] =
-    vol0*(h0_inv[0]*(p_target[5]*h0_inv[1]+p_target[4]*h0_inv[3]) +
-          h0_inv[5]*((p_target[1]-p_hydro)*h0_inv[1]+p_target[3]*h0_inv[3]) +
-          h0_inv[4]*(p_target[3]*h0_inv[1]+(p_target[2]-p_hydro)*h0_inv[3]));
-}
-
-/* ----------------------------------------------------------------------
-   compute strain energy
------------------------------------------------------------------------*/
-
-double FixNHCuda::compute_strain_energy()
-{
-  // compute strain energy = 0.5*Tr(sigma*h*h^t) in energy units
-
-  double* h = domain->h;
-  double d0,d1,d2;
-
-  d0 =
-    sigma[0]*(h[0]*h[0]+h[5]*h[5]+h[4]*h[4]) +
-    sigma[5]*(          h[1]*h[5]+h[3]*h[4]) +
-    sigma[4]*(                    h[2]*h[4]);
-  d1 =
-    sigma[5]*(          h[5]*h[1]+h[4]*h[3]) +
-    sigma[1]*(          h[1]*h[1]+h[3]*h[3]) +
-    sigma[3]*(                    h[2]*h[3]);
-  d2 =
-    sigma[4]*(                    h[4]*h[2]) +
-    sigma[3]*(                    h[3]*h[2]) +
-    sigma[2]*(                    h[2]*h[2]);
-
-  double energy = 0.5*(d0+d1+d2)/nktv2p;
-  return energy;
-}
-
-/* ----------------------------------------------------------------------
-   compute deviatoric barostat force = h*sigma*h^t
------------------------------------------------------------------------*/
-
-void FixNHCuda::compute_deviatoric()
-{
-  // generate upper-triangular part of h*sigma*h^t
-  // units of fdev are are PV, e.g. atm*A^3
-  // [ 0 5 4 ]   [ 0 5 4 ] [ 0 5 4 ] [ 0 - - ]
-  // [ 5 1 3 ] = [ - 1 3 ] [ 5 1 3 ] [ 5 1 - ]
-  // [ 4 3 2 ]   [ - - 2 ] [ 4 3 2 ] [ 4 3 2 ]
-
-  double* h = domain->h;
-
-  fdev[0] =
-    h[0]*(sigma[0]*h[0]+sigma[5]*h[5]+sigma[4]*h[4]) +
-    h[5]*(sigma[5]*h[0]+sigma[1]*h[5]+sigma[3]*h[4]) +
-    h[4]*(sigma[4]*h[0]+sigma[3]*h[5]+sigma[2]*h[4]);
-  fdev[1] =
-    h[1]*(              sigma[1]*h[1]+sigma[3]*h[3]) +
-    h[3]*(              sigma[3]*h[1]+sigma[2]*h[3]);
-  fdev[2] =
-    h[2]*(                            sigma[2]*h[2]);
-  fdev[3] =
-    h[1]*(                            sigma[3]*h[2]) +
-    h[3]*(                            sigma[2]*h[2]);
-  fdev[4] =
-    h[0]*(                            sigma[4]*h[2]) +
-    h[5]*(                            sigma[3]*h[2]) +
-    h[4]*(                            sigma[2]*h[2]);
-  fdev[5] =
-    h[0]*(              sigma[5]*h[1]+sigma[4]*h[3]) +
-    h[5]*(              sigma[1]*h[1]+sigma[3]*h[3]) +
-    h[4]*(              sigma[3]*h[1]+sigma[2]*h[3]);
-}
-
-/* ----------------------------------------------------------------------
-   compute hydrostatic target pressure
------------------------------------------------------------------------*/
-
-void FixNHCuda::compute_press_target()
-{
-  double delta = update->ntimestep - update->beginstep;
-  if (update->endstep > update->beginstep)
-    delta /= update->endstep - update->beginstep;
-  else delta = 0.0;
-
-  p_hydro = 0.0;
-  for (int i = 0; i < 3; i++)
-    if (p_flag[i]) {
-      p_target[i] = p_start[i] + delta * (p_stop[i]-p_start[i]);
-      p_hydro += p_target[i];
-    }
-  p_hydro /= pdim;
-
-  if (pstyle == TRICLINIC)
-    for (int i = 3; i < 6; i++)
-      p_target[i] = p_start[i] + delta * (p_stop[i]-p_start[i]);
-
-  // if deviatoric, recompute sigma each time p_target changes
-
-  if (deviatoric_flag) compute_sigma();
-}
-
-/* ----------------------------------------------------------------------
-   update omega_dot, omega, dilation
------------------------------------------------------------------------*/
-
-void FixNHCuda::nh_omega_dot()
-{
-  double f_omega,volume;
-
-  if (dimension == 3) volume = domain->xprd*domain->yprd*domain->zprd;
-  else volume = domain->xprd*domain->yprd;
-
-  if (deviatoric_flag) compute_deviatoric();
-
-  mtk_term1 = 0.0;
-  if (mtk_flag)
-    if (pstyle == ISO) {
-      mtk_term1 = tdof * boltz * t_current;
-      mtk_term1 /= pdim * atom->natoms;
-    } else {
-      double *mvv_current = temperature->vector;
-      for (int i = 0; i < 3; i++)
-        if (p_flag[i])
-          mtk_term1 += mvv_current[i];
-      mtk_term1 /= pdim * atom->natoms;
-    }
-
-  for (int i = 0; i < 3; i++)
-    if (p_flag[i]) {
-      f_omega = (p_current[i]-p_hydro)*volume /
-        (omega_mass[i] * nktv2p) + mtk_term1 / omega_mass[i];
-      if (deviatoric_flag) f_omega -= fdev[i]/(omega_mass[i] * nktv2p);
-      omega_dot[i] += f_omega*dthalf;
-      omega_dot[i] *= pdrag_factor;
-    }
-
-  mtk_term2 = 0.0;
-  if (mtk_flag) {
-    for (int i = 0; i < 3; i++)
-      if (p_flag[i])
-        mtk_term2 += omega_dot[i];
-    mtk_term2 /= pdim * atom->natoms;
-  }
-
-  if (pstyle == TRICLINIC) {
-    for (int i = 3; i < 6; i++) {
-      if (p_flag[i]) {
-        f_omega = p_current[i]*volume/(omega_mass[i] * nktv2p);
-        if (deviatoric_flag)
-          f_omega -= fdev[i]/(omega_mass[i] * nktv2p);
-        omega_dot[i] += f_omega*dthalf;
-        omega_dot[i] *= pdrag_factor;
-      }
-    }
-  }
-}
diff --git a/src/USER-CUDA/fix_nh_cuda.h b/src/USER-CUDA/fix_nh_cuda.h
deleted file mode 100644
index 3cb97873c0..0000000000
--- a/src/USER-CUDA/fix_nh_cuda.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_FIX_NH_CUDA_H
-#define LMP_FIX_NH_CUDA_H
-
-#include "fix.h"
-#include "cuda_precision.h"
-
-namespace LAMMPS_NS {
-
-class FixNHCuda : public Fix {
- public:
-  FixNHCuda(class LAMMPS *, int, char **);
-  virtual ~FixNHCuda();
-  int setmask();
-  virtual void init();
-  void setup(int);
-  virtual void initial_integrate(int);
-  virtual void final_integrate();
-  void initial_integrate_respa(int, int, int);
-  void final_integrate_respa(int, int);
-  double compute_scalar();
-  double compute_vector(int);
-  void write_restart(FILE *);
-  void restart(char *);
-  int modify_param(int, char **);
-  void reset_dt();
-
- protected:
-  class Cuda *cuda;
-  int dimension,which;
-  double dtv,dtf,dthalf,dt4,dt8,dto;
-  double boltz,nktv2p,tdof;
-  double vol0,t0;
-
-  double t_start,t_stop;
-  double t_current,t_target;
-  double t_freq;
-
-  int tstat_flag;                   // 1 if control T
-  int pstat_flag;                   // 1 if control P
-
-  int pstyle,pcouple,allremap;
-  int p_flag[6];                   // 1 if control P on this dim, 0 if not
-  double p_start[6],p_stop[6];
-  double p_freq[6],p_target[6];
-  double omega[6],omega_dot[6];
-  double omega_mass[6];
-  double p_current[6],dilation[6];
-  double drag,tdrag_factor;        // drag factor on particle thermostat
-  double pdrag_factor;             // drag factor on barostat
-  double factor[6];                // velocity scaling due to barostat
-  int kspace_flag;                 // 1 if KSpace invoked, 0 if not
-  int nrigid;                      // number of rigid fixes
-  int *rfix;                       // indices of rigid fixes
-
-  int nlevels_respa;
-  double *step_respa;
-
-  char *id_temp,*id_press;
-  class Compute *temperature,*pressure;
-  int tflag,pflag;
-
-  double *eta,*eta_dot;            // chain thermostat for particles
-  double *eta_dotdot;
-  double *eta_mass;
-  int mtchain;                     // length of chain
-
-  double *etap;                    // chain thermostat for barostat
-  double *etap_dot;
-  double *etap_dotdot;
-  double *etap_mass;
-  int mpchain;                     // length of chain
-
-  int mtk_flag;                    // 0 if using Hoover barostat
-  double mtk_term1,mtk_term2;
-  int mtchain_default_flag;
-  int pdim;                        // number of barostatted dims
-  double mvv_current[3];           // diagonal of KE tensor
-  double mtk_factor;               // MTK factor
-  double p_freq_max;               // maximum barostat frequency
-
-  double p_hydro;                  // hydrostatic target pressure
-
-  int nc_tchain,nc_pchain;
-  double factor_eta;
-  double sigma[6];                 // scaled target stress
-  double fdev[6];                  // deviatoric force on barostat
-  int deviatoric_flag;             // 0 if target stress tensor is hydrostatic
-  double h0_inv[6];                // h_inv of reference (zero strain) box
-  int nreset_h0;                   // interval for resetting h0
-
-  void couple();
-  void couple_ke();
-  void remap();
-  void nhc_temp_integrate();
-  void nhc_press_integrate();
-
-  virtual void nve_x();            // may be overwritten by child classes
-  virtual void nve_v();
-  virtual void nh_v_press();
-  virtual void nh_v_temp();
-
-  void compute_sigma();
-  void compute_deviatoric();
-  double compute_strain_energy();
-  void compute_press_target();
-  void nh_omega_dot();
-
-  X_CFLOAT triggerneighsq;
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/fix_npt_cuda.cpp b/src/USER-CUDA/fix_npt_cuda.cpp
deleted file mode 100644
index ea2dd1fe04..0000000000
--- a/src/USER-CUDA/fix_npt_cuda.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include "fix_npt_cuda.h"
-#include "modify.h"
-#include "error.h"
-
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixNPTCuda::FixNPTCuda(LAMMPS *lmp, int narg, char **arg) :
-  FixNHCuda(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (!tstat_flag)
-    error->all(FLERR,"Temperature control must be used with fix npt");
-  if (!pstat_flag)
-    error->all(FLERR,"Pressure control must be used with fix npt");
-
-  // create a new compute temp style
-  // id = fix-ID + temp
-  // compute group = all since pressure is always global (group all)
-  // and thus its KE/temperature contribution should use group all
-
-  int n = strlen(id) + 6;
-  id_temp = new char[n];
-  strcpy(id_temp,id);
-  strcat(id_temp,"_temp");
-
-  char **newarg = new char*[3];
-  newarg[0] = id_temp;
-  newarg[1] = (char *) "all";
-  newarg[2] = (char *) "temp/cuda";
-
-  modify->add_compute(3,newarg);
-  delete [] newarg;
-  tflag = 1;
-
-  // create a new compute pressure style
-  // id = fix-ID + press, compute group = all
-  // pass id_temp as 4th arg to pressure constructor
-
-  n = strlen(id) + 7;
-  id_press = new char[n];
-  strcpy(id_press,id);
-  strcat(id_press,"_press");
-
-  newarg = new char*[4];
-  newarg[0] = id_press;
-  newarg[1] = (char *) "all";
-  newarg[2] = (char *) "pressure/cuda";
-  newarg[3] = id_temp;
-  modify->add_compute(4,newarg);
-  delete [] newarg;
-  pflag = 1;
-}
diff --git a/src/USER-CUDA/fix_npt_cuda.h b/src/USER-CUDA/fix_npt_cuda.h
deleted file mode 100644
index e10efb6a9a..0000000000
--- a/src/USER-CUDA/fix_npt_cuda.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(npt/cuda,FixNPTCuda)
-
-#else
-
-#ifndef LMP_FIX_NPTCuda_H
-#define LMP_FIX_NPTCuda_H
-
-#include "fix_nh_cuda.h"
-
-namespace LAMMPS_NS {
-
-class FixNPTCuda : public FixNHCuda {
- public:
-  FixNPTCuda(class LAMMPS *, int, char **);
-  ~FixNPTCuda() {}
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_nve_cuda.cpp b/src/USER-CUDA/fix_nve_cuda.cpp
deleted file mode 100644
index fbe85b176f..0000000000
--- a/src/USER-CUDA/fix_nve_cuda.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdio>
-#include <cstring>
-#include "fix_nve_cuda.h"
-#include "fix_nve_cuda_cu.h"
-#include "atom.h"
-#include "force.h"
-#include "update.h"
-#include "respa.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixNVECuda::FixNVECuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-
-  if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (strcmp(style,"nve/sphere") != 0 && narg < 3)
-                error->all(FLERR,"Illegal fix nve command");
-
-        time_integrate = 1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixNVECuda::setmask()
-{
-        int mask = 0;
-        mask |= INITIAL_INTEGRATE_CUDA;
-        mask |= FINAL_INTEGRATE_CUDA;
-        // mask |= INITIAL_INTEGRATE_RESPA_CUDA;
-        // mask |= FINAL_INTEGRATE_RESPA_CUDA;
-        return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNVECuda::init()
-{
-        dtv = update->dt;
-        dtf = 0.5 * update->dt * force->ftm2v;
-
-        if (strstr(update->integrate_style,"respa"))
-                step_respa = ((Respa *) update->integrate)->step;
-
-        triggerneighsq= cuda->shared_data.atom.triggerneighsq;
-    cuda->neighbor_decide_by_integrator=1;
-    Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
-
-}
-
-/* ----------------------------------------------------------------------
-   allow for both per-type and per-atom mass
-------------------------------------------------------------------------- */
-
-void FixNVECuda::initial_integrate(int vflag)
-{
-        if(triggerneighsq!=cuda->shared_data.atom.triggerneighsq)
-        {
-                triggerneighsq= cuda->shared_data.atom.triggerneighsq;
-                Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
-        }
-        int nlocal = atom->nlocal;
-        if(igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-    Cuda_FixNVECuda_InitialIntegrate(& cuda->shared_data, groupbit,nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNVECuda::final_integrate()
-{
-        int nlocal = atom->nlocal;
-        if(igroup == atom->firstgroup) nlocal = atom->nfirst;
-
-        Cuda_FixNVECuda_FinalIntegrate(& cuda->shared_data, groupbit,nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNVECuda::initial_integrate_respa(int vflag, int ilevel, int flag)
-{
-        //this point should not be reached yet since RESPA is not supported
-        if (flag) return;             // only used by NPT,NPH
-
-        dtv = step_respa[ilevel];
-        dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
-
-        // innermost level - NVE update of v and x
-        // all other levels - NVE update of v
-
-        if(ilevel == 0) initial_integrate(vflag);
-        else final_integrate();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNVECuda::final_integrate_respa(int ilevel, int iloop)
-{
-        //this point should not be reached yet since RESPA is not supported
-        dtf = 0.5 * step_respa[ilevel] * force->ftm2v;
-        final_integrate();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixNVECuda::reset_dt()
-{
-        dtv = update->dt;
-        dtf = 0.5 * update->dt * force->ftm2v;
-        Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
-}
diff --git a/src/USER-CUDA/fix_nve_cuda.h b/src/USER-CUDA/fix_nve_cuda.h
deleted file mode 100644
index 090d327db5..0000000000
--- a/src/USER-CUDA/fix_nve_cuda.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(nve/cuda,FixNVECuda)
-
-#else
-
-#ifndef LMP_FIX_NVE_CUDA_H
-#define LMP_FIX_NVE_CUDA_H
-
-#include "fix.h"
-#include "cuda_precision.h"
-
-namespace LAMMPS_NS {
-
-class FixNVECuda : public Fix
-{
-        public:
-                FixNVECuda(class LAMMPS *, int, char **);
-                int setmask();
-                virtual void init();
-                virtual void initial_integrate(int);
-                virtual void final_integrate();
-                void initial_integrate_respa(int, int, int);
-                void final_integrate_respa(int, int);
-                void reset_dt();
-
-                X_CFLOAT triggerneighsq;
-
-        protected:
-                class Cuda *cuda;
-                double dtv, dtf;
-                double *step_respa;
-                int mass_require;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_nvt_cuda.cpp b/src/USER-CUDA/fix_nvt_cuda.cpp
deleted file mode 100644
index e1380b0005..0000000000
--- a/src/USER-CUDA/fix_nvt_cuda.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include "fix_nvt_cuda.h"
-#include "group.h"
-#include "modify.h"
-#include "error.h"
-
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixNVTCuda::FixNVTCuda(LAMMPS *lmp, int narg, char **arg) :
-  FixNHCuda(lmp, narg, arg)
-{
-  if (!tstat_flag)
-    error->all(FLERR,"Temperature control must be used with fix nvt");
-  if (pstat_flag)
-    error->all(FLERR,"Pressure control can not be used with fix nvt");
-
-  // create a new compute temp style
-  // id = fix-ID + temp
-
-  int n = strlen(id) + 6;
-  id_temp = new char[n];
-  strcpy(id_temp,id);
-  strcat(id_temp,"_temp");
-
-  char **newarg = new char*[3];
-  newarg[0] = id_temp;
-  newarg[1] = group->names[igroup];
-  newarg[2] = (char *) "temp/cuda";
-
-  modify->add_compute(3,newarg);
-  delete [] newarg;
-  tflag = 1;
-}
diff --git a/src/USER-CUDA/fix_nvt_cuda.h b/src/USER-CUDA/fix_nvt_cuda.h
deleted file mode 100644
index 65f38e05d6..0000000000
--- a/src/USER-CUDA/fix_nvt_cuda.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(nvt/cuda,FixNVTCuda)
-
-#else
-
-#ifndef LMP_FIX_NVTCuda_H
-#define LMP_FIX_NVTCuda_H
-
-#include "fix_nh_cuda.h"
-
-namespace LAMMPS_NS {
-
-class FixNVTCuda : public FixNHCuda {
- public:
-  FixNVTCuda(class LAMMPS *, int, char **);
-  ~FixNVTCuda() {}
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_set_force_cuda.cpp b/src/USER-CUDA/fix_set_force_cuda.cpp
deleted file mode 100644
index b7000a5548..0000000000
--- a/src/USER-CUDA/fix_set_force_cuda.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-#include <cstring>
-#include <cstdlib>
-#include "fix_set_force_cuda.h"
-#include "fix_set_force_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "error.h"
-#include "force.h"
-#include "user_cuda.h"
-#include "memory.h"
-#include "cuda_modify_flags.h"
-
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixSetForceCuda::FixSetForceCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-  if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 6) error->all(FLERR,"Illegal fix setforce/cuda command");
-
-  vector_flag = 1;
-  size_vector = 3;
-  global_freq = 1;
-  extvector = 1;
-
-  flagx = flagy = flagz = 1;
-  if (strcmp(arg[3],"NULL") == 0) flagx = 0;
-  else xvalue = force->numeric(FLERR,arg[3]);
-  if (strcmp(arg[4],"NULL") == 0) flagy = 0;
-  else yvalue = force->numeric(FLERR,arg[4]);
-  if (strcmp(arg[5],"NULL") == 0) flagz = 0;
-  else zvalue = force->numeric(FLERR,arg[5]);
-
-  force_flag = 0;
-  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
-  cu_foriginal=NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixSetForceCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  mask |= POST_FORCE_RESPA;
-  mask |= MIN_POST_FORCE_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::init()
-{
-  if(not cu_foriginal)
-  cu_foriginal = new cCudaData<double, F_CFLOAT, x> (foriginal,3);
-  if (strstr(update->integrate_style,"respa"))
-    nlevels_respa = ((Respa *) update->integrate)->nlevels;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::setup(int vflag)
-{
-  MYDBG( printf("# CUDA: FixSetForceCuda::setup\n"); )
-
-  if (strstr(update->integrate_style,"verlet"))
-  {
-    Cuda_FixSetForceCuda_Init(&cuda->shared_data);
-    cuda->cu_f->upload();
-    post_force(vflag);
-    cuda->cu_f->download();
-
-  }
-  else {
-    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
-    cuda->cu_f->download();
-    post_force_respa(vflag,nlevels_respa-1,0);
-    cuda->cu_f->upload();
-    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
-  }
-  MYDBG( printf("# CUDA: FixSetForceCuda::setup done\n"); )
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::min_setup(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::post_force(int vflag)
-{
-  MYDBG( printf("# CUDA: FixSetForceCuda::postforce start\n"); )
-  force_flag = 0;
-  cu_foriginal->memset_device(0);
-  Cuda_FixSetForceCuda_PostForce(&cuda->shared_data, groupbit, xvalue, yvalue,zvalue,(F_CFLOAT*) cu_foriginal->dev_data(),flagx,flagy,flagz);
-  cu_foriginal->download();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
-{
-  if (ilevel == nlevels_respa-1) post_force(vflag);
-  else {
-          cuda->cu_f->download();
-          cuda->cu_mask->download();
-
-    double **f = atom->f;
-    int *mask = atom->mask;
-    int nlocal = atom->nlocal;
-
-    foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
-    force_flag = 0;
-
-    for (int i = 0; i < nlocal; i++)
-      if (mask[i] & groupbit) {
-        foriginal[0] += f[i][0];
-        foriginal[1] += f[i][1];
-        foriginal[2] += f[i][2];
-        if (flagx) f[i][0] = 0.0;
-        if (flagy) f[i][1] = 0.0;
-        if (flagz) f[i][2] = 0.0;
-      }
-          cuda->cu_f->upload();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixSetForceCuda::min_post_force(int vflag)
-{
-  post_force(vflag);
-}
-
-
-/* ----------------------------------------------------------------------
-   return components of total force on fix group before force was changed
-------------------------------------------------------------------------- */
-
-double FixSetForceCuda::compute_vector(int n)
-{
-  // only sum across procs one time
-
-  if (force_flag == 0) {
-    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
-    force_flag = 1;
-  }
-  return foriginal_all[n+1];
-}
diff --git a/src/USER-CUDA/fix_set_force_cuda.h b/src/USER-CUDA/fix_set_force_cuda.h
deleted file mode 100644
index a195aec0ec..0000000000
--- a/src/USER-CUDA/fix_set_force_cuda.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(setforce/cuda,FixSetForceCuda)
-
-#else
-
-#ifndef LMP_FIX_SET_FORCE_CUDA_H
-#define LMP_FIX_SET_FORCE_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixSetForceCuda : public Fix {
- public:
-  FixSetForceCuda(class LAMMPS *, int, char **);
-  int setmask();
-  void init();
-  void setup(int);
-  void min_setup(int);
-  void post_force(int);
-  void post_force_respa(int, int, int);
-  void min_post_force(int);
-  double compute_vector(int);
-
- private:
-  class Cuda *cuda;
-  int flagx,flagy,flagz;
-  double xvalue,yvalue,zvalue;
-  double foriginal[3],foriginal_all[3];
-  cCudaData<double     , F_CFLOAT                   , x>* cu_foriginal;
-  int force_flag;
-  int nlevels_respa;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_shake_cuda.cpp b/src/USER-CUDA/fix_shake_cuda.cpp
deleted file mode 100644
index 92274d1d46..0000000000
--- a/src/USER-CUDA/fix_shake_cuda.cpp
+++ /dev/null
@@ -1,2885 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include <cstdio>
-#include <ctime>
-#include "fix_shake_cuda.h"
-#include "fix_shake_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "modify.h"
-#include "domain.h"
-#include "force.h"
-#include "bond.h"
-#include "angle.h"
-#include "comm.h"
-#include "group.h"
-#include "fix_respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-#include "math_const.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-using namespace MathConst;
-
-#define BIG 1.0e20
-#define MASSDELTA 0.1
-
-/* ---------------------------------------------------------------------- */
-
-FixShakeCuda::FixShakeCuda(LAMMPS* lmp, int narg, char** arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-
-  if(cuda == NULL)
-    error->all(FLERR, "You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if(atom->map_style != 1)
-    error->all(FLERR, "Fix shake/cuda needs atom map style array. In particular it does not currently work with hash-tables.");
-
-  MPI_Comm_rank(world, &me);
-  MPI_Comm_size(world, &nprocs);
-  neighbor_step = true;
-
-  virial_flag = 1;
-  create_attribute = 1;
-  dof_flag = 1;
-
-  // error check
-
-  if(atom->molecular == 0)
-    error->all(FLERR, "Cannot use fix shake with non-molecular system");
-
-  // perform initial allocation of atom-based arrays
-  // register with Atom class
-
-  shake_flag = NULL;
-  shake_atom = shake_type = NULL;
-  xshake = NULL;
-  cu_shake_flag = NULL;
-  cu_shake_atom = NULL;
-  cu_shake_type = NULL;
-  cu_xshake = NULL;
-  cu_list = NULL;
-  cu_bond_distance = NULL;
-  cu_angle_distance = NULL;
-  cu_virial = new cCudaData<double           , ENERGY_CFLOAT , xx >(virial, 6);
-  grow_arrays(atom->nmax);
-  atom->add_callback(0);
-
-  // set comm size needed by this fix
-
-  comm_forward = 3;
-
-  // parse SHAKE args
-
-  if(narg < 8) error->all(FLERR, "Illegal fix shake command");
-
-  tolerance = force->numeric(FLERR,arg[3]);
-  max_iter = force->inumeric(FLERR,arg[4]);
-  output_every = force->inumeric(FLERR,arg[5]);
-
-  // parse SHAKE args for bond and angle types
-  // will be used by find_clusters
-  // store args for "b" "a" "t" as flags in (1:n) list for fast access
-  // store args for "m" in list of length nmass for looping over
-  // for "m" verify that atom masses have been set
-
-  bond_flag = new int[atom->nbondtypes + 1];
-
-  for(int i = 1; i <= atom->nbondtypes; i++) bond_flag[i] = 0;
-
-  angle_flag = new int[atom->nangletypes + 1];
-
-  for(int i = 1; i <= atom->nangletypes; i++) angle_flag[i] = 0;
-
-  type_flag = new int[atom->ntypes + 1];
-
-  for(int i = 1; i <= atom->ntypes; i++) type_flag[i] = 0;
-
-  mass_list = new double[atom->ntypes];
-  nmass = 0;
-
-  char mode = '\0';
-  int next = 6;
-
-  while(next < narg) {
-
-    if(strcmp(arg[next], "b") == 0) mode = 'b';
-    else if(strcmp(arg[next], "a") == 0) mode = 'a';
-    else if(strcmp(arg[next], "t") == 0) mode = 't';
-    else if(strcmp(arg[next], "m") == 0) {
-      mode = 'm';
-      atom->check_mass();
-
-    } else if(mode == 'b') {
-      int i = force->inumeric(FLERR,arg[next]);
-
-      if(i < 1 || i > atom->nbondtypes)
-        error->all(FLERR, "Invalid bond type index for fix shake");
-
-      bond_flag[i] = 1;
-
-    } else if(mode == 'a') {
-      int i = force->inumeric(FLERR,arg[next]);
-
-      if(i < 1 || i > atom->nangletypes)
-        error->all(FLERR, "Invalid angle type index for fix shake");
-
-      angle_flag[i] = 1;
-
-    } else if(mode == 't') {
-      int i = force->inumeric(FLERR,arg[next]);
-
-      if(i < 1 || i > atom->ntypes)
-        error->all(FLERR, "Invalid atom type index for fix shake");
-
-      type_flag[i] = 1;
-
-    } else if(mode == 'm') {
-      double massone = force->numeric(FLERR,arg[next]);
-
-      if(massone == 0.0) error->all(FLERR, "Invalid atom mass for fix shake");
-
-      if(nmass == atom->ntypes) error->all(FLERR, "Too many masses for fix shake");
-
-      mass_list[nmass++] = massone;
-
-    } else error->all(FLERR, "Illegal fix shake command");
-
-    next++;
-  }
-
-  // allocate bond and angle distance arrays, indexed from 1 to n
-
-  bond_distance = new double[atom->nbondtypes + 1];
-  angle_distance = new double[atom->nangletypes + 1];
-
-  cu_bond_distance = new cCudaData<double, X_CFLOAT, xx> (bond_distance, atom->nbondtypes + 1);
-  cu_angle_distance = new cCudaData<double, X_CFLOAT, xx> (angle_distance, atom->nangletypes + 1);
-
-  // allocate statistics arrays
-
-  if(output_every) {
-    int nb = atom->nbondtypes + 1;
-    b_count = new int[nb];
-    b_count_all = new int[nb];
-    b_ave = new double[nb];
-    b_ave_all = new double[nb];
-    b_max = new double[nb];
-    b_max_all = new double[nb];
-    b_min = new double[nb];
-    b_min_all = new double[nb];
-
-    int na = atom->nangletypes + 1;
-    a_count = new int[na];
-    a_count_all = new int[na];
-    a_ave = new double[na];
-    a_ave_all = new double[na];
-    a_max = new double[na];
-    a_max_all = new double[na];
-    a_min = new double[na];
-    a_min_all = new double[na];
-  }
-
-  cudable_comm = true;
-  // identify all SHAKE clusters
-
-  find_clusters();
-
-  // initialize list of SHAKE clusters to constrain
-
-  maxlist = 0;
-  list = NULL;
-  Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                         cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                         cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                         max_iter, tolerance);
-
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixShakeCuda::~FixShakeCuda()
-{
-  // unregister callbacks to this fix from Atom class
-
-  atom->delete_callback(id, 0);
-
-  // set bond_type and angle_type back to positive for SHAKE clusters
-  // must set for all SHAKE bonds and angles stored by each atom
-
-  int** bond_type = atom->bond_type;
-  int** angle_type = atom->angle_type;
-  int nlocal = atom->nlocal;
-
-  int n;
-
-  for(int i = 0; i < nlocal; i++) {
-    if(shake_flag[i] == 0) continue;
-    else if(shake_flag[i] == 1) {
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = anglefind(i, shake_atom[i][1], shake_atom[i][2]);
-
-      if(n >= 0) angle_type[i][n] = -angle_type[i][n];
-    } else if(shake_flag[i] == 2) {
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    } else if(shake_flag[i] == 3) {
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    } else if(shake_flag[i] == 4) {
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][3]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    }
-  }
-
-  // delete locally stored arrays
-
-  memory->destroy(shake_flag);
-  memory->destroy(shake_atom);
-  memory->destroy(shake_type);
-  memory->destroy(xshake);
-
-  delete [] bond_flag;
-  delete [] angle_flag;
-  delete [] type_flag;
-  delete [] mass_list;
-
-  delete [] bond_distance;
-  delete [] angle_distance;
-
-  if(output_every) {
-    delete [] b_count;
-    delete [] b_count_all;
-    delete [] b_ave;
-    delete [] b_ave_all;
-    delete [] b_max;
-    delete [] b_max_all;
-    delete [] b_min;
-    delete [] b_min_all;
-
-    delete [] a_count;
-    delete [] a_count_all;
-    delete [] a_ave;
-    delete [] a_ave_all;
-    delete [] a_max;
-    delete [] a_max_all;
-    delete [] a_min;
-    delete [] a_min_all;
-  }
-
-  memory->destroy(list);
-
-  delete cu_shake_flag;
-  delete cu_shake_atom;
-  delete cu_shake_type;
-  delete cu_xshake;
-  delete cu_list;
-  delete cu_bond_distance;
-  delete cu_angle_distance;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixShakeCuda::setmask()
-{
-  int mask = 0;
-  mask |= PRE_NEIGHBOR_CUDA;
-  mask |= POST_FORCE_CUDA;
-  mask |= POST_FORCE_RESPA;
-  return mask;
-}
-
-/* ----------------------------------------------------------------------
-   set bond and angle distances
-   this init must happen after force->bond and force->angle inits
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::init()
-{
-  int i, m, flag, flag_all, type1, type2, bond1_type, bond2_type;
-  double rsq, angle;
-
-  // error if more than one shake fix
-
-  int count = 0;
-
-  for(i = 0; i < modify->nfix; i++)
-    if(strcmp(modify->fix[i]->style, "shake") == 0) count++;
-
-  if(count > 1) error->all(FLERR, "More than one fix shake");
-
-  // cannot use with minimization since SHAKE turns off bonds
-  // that should contribute to potential energy
-
-  if(update->whichflag == 2)
-    error->all(FLERR, "Fix shake cannot be used with minimization");
-
-  // error if npt,nph fix comes before shake fix
-
-  for(i = 0; i < modify->nfix; i++) {
-    if(strcmp(modify->fix[i]->style, "npt") == 0) break;
-
-    if(strcmp(modify->fix[i]->style, "nph") == 0) break;
-  }
-
-  if(i < modify->nfix) {
-    for(int j = i; j < modify->nfix; j++)
-      if(strcmp(modify->fix[j]->style, "shake") == 0)
-        error->all(FLERR, "Shake fix must come before NPT/NPH fix");
-  }
-
-  // if rRESPA, find associated fix that must exist
-  // could have changed locations in fix list since created
-  // set ptrs to rRESPA variables
-
-  if(strstr(update->integrate_style, "respa")) {
-    for(i = 0; i < modify->nfix; i++)
-      if(strcmp(modify->fix[i]->style, "RESPA") == 0) ifix_respa = i;
-
-    nlevels_respa = ((Respa*) update->integrate)->nlevels;
-    loop_respa = ((Respa*) update->integrate)->loop;
-    step_respa = ((Respa*) update->integrate)->step;
-  }
-
-  // set equilibrium bond distances
-
-  if(force->bond == NULL)
-    error->all(FLERR, "Bond potential must be defined for SHAKE");
-
-  for(i = 1; i <= atom->nbondtypes; i++)
-    bond_distance[i] = force->bond->equilibrium_distance(i);
-
-  // set equilibrium angle distances
-
-  int nlocal = atom->nlocal;
-
-  for(i = 1; i <= atom->nangletypes; i++) {
-    if(angle_flag[i] == 0) continue;
-
-    if(force->angle == NULL)
-      error->all(FLERR, "Angle potential must be defined for SHAKE");
-
-    // scan all atoms for a SHAKE angle cluster
-    // extract bond types for the 2 bonds in the cluster
-    // bond types must be same in all clusters of this angle type,
-    //   else set error flag
-
-    flag = 0;
-    bond1_type = bond2_type = 0;
-
-    for(m = 0; m < nlocal; m++) {
-      if(shake_flag[m] != 1) continue;
-
-      if(shake_type[m][2] != i) continue;
-
-      type1 = MIN(shake_type[m][0], shake_type[m][1]);
-      type2 = MAX(shake_type[m][0], shake_type[m][1]);
-
-      if(bond1_type > 0) {
-        if(type1 != bond1_type || type2 != bond2_type) {
-          flag = 1;
-          break;
-        }
-      }
-
-      bond1_type = type1;
-      bond2_type = type2;
-    }
-
-    // error check for any bond types that are not the same
-
-    MPI_Allreduce(&flag, &flag_all, 1, MPI_INT, MPI_MAX, world);
-
-    if(flag_all) error->all(FLERR, "Shake angles have different bond types");
-
-    // insure all procs have bond types
-
-    MPI_Allreduce(&bond1_type, &flag_all, 1, MPI_INT, MPI_MAX, world);
-    bond1_type = flag_all;
-    MPI_Allreduce(&bond2_type, &flag_all, 1, MPI_INT, MPI_MAX, world);
-    bond2_type = flag_all;
-
-    // if bond types are 0, no SHAKE angles of this type exist
-    // just skip this angle
-
-    if(bond1_type == 0) {
-      angle_distance[i] = 0.0;
-      continue;
-    }
-
-    // compute the angle distance as a function of 2 bond distances
-
-    angle = force->angle->equilibrium_angle(i);
-    rsq = 2.0 * bond_distance[bond1_type] * bond_distance[bond2_type] *
-          (1.0 - cos(angle));
-    angle_distance[i] = sqrt(rsq);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   SHAKE as pre-integrator constraint
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::setup(int vflag)
-{
-  pre_neighbor();
-
-  if(output_every) stats();
-
-  // setup SHAKE output
-
-  int ntimestep = update->ntimestep;
-  next_output = ntimestep + output_every;
-
-  if(output_every == 0) next_output = update->laststep + 1;
-
-  if(output_every && ntimestep % output_every != 0)
-    next_output = (ntimestep / output_every) * output_every + output_every;
-
-  // half timestep constraint on pre-step, full timestep thereafter
-
-  if(strstr(update->integrate_style, "verlet")) {
-    dtv = update->dt;
-    dtfsq = 0.5 * update->dt * update->dt * force->ftm2v;
-    post_force(vflag);
-    dtfsq = update->dt * update->dt * force->ftm2v;
-  } else {
-    dtv = step_respa[0];
-    dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
-    dtf_inner = dtf_innerhalf;
-    ((Respa*) update->integrate)->copy_flevel_f(nlevels_respa - 1);
-    post_force_respa(vflag, nlevels_respa - 1, 0);
-    ((Respa*) update->integrate)->copy_f_flevel(nlevels_respa - 1);
-    dtf_inner = step_respa[0] * force->ftm2v;
-  }
-
-  Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                         cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                         cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                         max_iter, tolerance);
-}
-
-/* ----------------------------------------------------------------------
-   build list of SHAKE clusters to constrain
-   if one or more atoms in cluster are on this proc,
-     this proc lists the cluster exactly once
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::pre_neighbor()
-{
-  int atom1, atom2, atom3, atom4;
-
-  // local copies of atom quantities
-  // used by SHAKE until next re-neighboring
-
-  x = atom->x;
-  v = atom->v;
-  f = atom->f;
-  mass = atom->mass;
-  rmass = atom->rmass;
-  type = atom->type;
-  nlocal = atom->nlocal;
-
-  // extend size of SHAKE list if necessary
-
-  if(nlocal > maxlist) {
-    maxlist = nlocal;
-    memory->destroy(list);
-    memory->create(list, maxlist, "shake:list");
-    delete cu_list;
-    cu_list = new cCudaData<int           , int            , xx >(list, maxlist);
-  }
-
-  // build list of SHAKE clusters I compute
-
-  nlist = 0;
-  int count2 = 0, count3 = 0, count4 = 0, count3a = 0;
-
-  for(int i = 0; i < nlocal; i++)
-    if(shake_flag[i]) {
-      if(shake_flag[i] == 2) count2++;
-
-      if(shake_flag[i] == 3) count3++;
-
-      if(shake_flag[i] == 4) count4++;
-
-      if(shake_flag[i] == 1) count3a++;
-
-      if(shake_flag[i] == 2) {
-        atom1 = atom->map(shake_atom[i][0]);
-        atom2 = atom->map(shake_atom[i][1]);
-
-        if(atom1 == -1 || atom2 == -1) {
-          char str[128];
-          sprintf(str,
-                  "Shake atoms %d %d missing on proc %d at step " BIGINT_FORMAT,
-                  shake_atom[i][0], shake_atom[i][1], me, update->ntimestep);
-          error->one(FLERR, str);
-        }
-
-        if(i <= atom1 && i <= atom2) list[nlist++] = i;
-      } else if(shake_flag[i] % 2 == 1) {
-        atom1 = atom->map(shake_atom[i][0]);
-        atom2 = atom->map(shake_atom[i][1]);
-        atom3 = atom->map(shake_atom[i][2]);
-
-        if(atom1 == -1 || atom2 == -1 || atom3 == -1) {
-          char str[128];
-          sprintf(str,
-                  "Shake atoms %d %d %d missing on proc %d at step "
-                  BIGINT_FORMAT,
-                  shake_atom[i][0], shake_atom[i][1], shake_atom[i][2],
-                  me, update->ntimestep);
-          error->one(FLERR, str);
-        }
-
-        if(i <= atom1 && i <= atom2 && i <= atom3) list[nlist++] = i;
-      } else {
-        atom1 = atom->map(shake_atom[i][0]);
-        atom2 = atom->map(shake_atom[i][1]);
-        atom3 = atom->map(shake_atom[i][2]);
-        atom4 = atom->map(shake_atom[i][3]);
-
-        if(atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1) {
-          char str[128];
-          sprintf(str,
-                  "Shake atoms %d %d %d %d missing on proc %d at step "
-                  BIGINT_FORMAT,
-                  shake_atom[i][0], shake_atom[i][1],
-                  shake_atom[i][2], shake_atom[i][3],
-                  me, update->ntimestep);
-          error->one(FLERR, str);
-        }
-
-        if(i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4)
-          list[nlist++] = i;
-      }
-    }
-
-  count2 /= 2;
-  count3 /= 3;
-  count4 /= 4;
-  count3a /= 3;
-  count3 += count2;
-  count4 += count3;
-  count3a += count4;
-
-  for(int k = 0, l = count2; k < count2; k++) {
-    if(shake_flag[list[k]] != 2) {
-      while(shake_flag[list[l]] != 2 && l < nlist - 1) l++;
-
-      if(shake_flag[list[l]] != 2) {
-        printf("FixShakeCuda: Error in List SortA %i %i\n", k, l);
-        return;
-      }
-
-      int tmp = list[k];
-      list[k] = list[l];
-      list[l] = tmp;
-    }
-  }
-
-  for(int k = count2, l = count3; k < count3; k++) {
-    if(shake_flag[list[k]] != 3) {
-      while(shake_flag[list[l]] != 3 && l < nlist - 1) l++;
-
-      if(shake_flag[list[l]] != 3) {
-        printf("FixShakeCuda: Error in List SortB %i %i\n", k, l);
-        return;
-      }
-
-      int tmp = list[k];
-      list[k] = list[l];
-      list[l] = tmp;
-    }
-  }
-
-  for(int k = count3, l = count4; k < count4; k++) {
-    if(shake_flag[list[k]] != 4) {
-      while(shake_flag[list[l]] != 4 && l < nlist - 1) l++;
-
-      if(shake_flag[list[l]] != 4) {
-        printf("FixShakeCuda: Error in List SortC %i %i\n", k, l);
-        return;
-      }
-
-      int tmp = list[k];
-      list[k] = list[l];
-      list[l] = tmp;
-    }
-  }
-
-  cu_list->upload();
-  cu_bond_distance->upload();
-  cu_angle_distance->upload();
-  cu_shake_flag->upload();
-  cu_shake_atom->upload();
-  cu_shake_type->upload();
-
-  neighbor_step = true;
-}
-
-/* ----------------------------------------------------------------------
-   compute the force adjustment for SHAKE constraint
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::post_force(int vflag)
-{
-  my_times starttime;
-  my_times endtime;
-
-
-  if(cuda->finished_setup && neighbor_step) {
-    Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                           cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                           cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                           max_iter, tolerance);
-
-  }
-
-  if(not cuda->finished_setup)
-    cuda->downloadAll();
-
-  if(update->ntimestep == next_output) {
-    if(cuda->finished_setup)
-      cuda->cu_x->download();
-
-    stats();
-  }
-
-  // xshake = unconstrained move with current v,f
-
-  unconstrained_update();
-
-  // communicate results if necessary
-
-  //if(cuda->finished_setup) cu_xshake->download();
-
-  if(nprocs > 1) {
-    //if(cuda->finished_setup)
-    //cu_xshake->download();
-    comm->forward_comm_fix(this);
-    //if(cuda->finished_setup)
-    //cu_xshake->upload();
-  }
-
-  // virial setup
-
-  if(vflag) v_setup(vflag);
-  else evflag = 0;
-
-  // loop over clusters
-
-  my_gettime(CLOCK_REALTIME, &starttime);
-
-  if(cuda->finished_setup) {
-    cu_virial->upload();
-
-    if(vflag_atom) cuda->cu_vatom->upload();
-
-    Cuda_FixShakeCuda_Shake(&cuda->shared_data, vflag, vflag_atom, (int*)cu_list->dev_data(), nlist);
-    cu_virial->download();
-
-    if(vflag_atom) cuda->cu_vatom->download();
-
-  } else
-    for(int i = 0; i < nlist; i++) {
-      int m = list[i];
-
-      if(shake_flag[m] == 2) shake2(m);
-      else if(shake_flag[m] == 3) shake3(m);
-      else if(shake_flag[m] == 4) shake4(m);
-      else shake3angle(m);
-    }
-
-  if((not cuda->finished_setup))  cuda->cu_f->upload();
-
-  my_gettime(CLOCK_REALTIME, &endtime);
-
-  if(cuda->finished_setup)
-    time_postforce += (endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000);
-  else
-    time_postforce = 0.0;
-
-  //printf("Postforce time: %lf\n",time_postforce);
-}
-
-/* ----------------------------------------------------------------------
-   count # of degrees-of-freedom removed by SHAKE for atoms in igroup
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::dof(int igroup)
-{
-  int groupbit = group->bitmask[igroup];
-
-  int* mask = atom->mask;
-  int* tag = atom->tag;
-  int nlocal = atom->nlocal;
-
-  // count dof in a cluster if and only if
-  // the central atom is in group and atom i is the central atom
-
-  int n = 0;
-
-  for(int i = 0; i < nlocal; i++) {
-    if(!(mask[i] & groupbit)) continue;
-
-    if(shake_flag[i] == 0) continue;
-
-    if(shake_atom[i][0] != tag[i]) continue;
-
-    if(shake_flag[i] == 1) n += 3;
-    else if(shake_flag[i] == 2) n += 1;
-    else if(shake_flag[i] == 3) n += 2;
-    else if(shake_flag[i] == 4) n += 3;
-  }
-
-  int nall;
-  MPI_Allreduce(&n, &nall, 1, MPI_INT, MPI_SUM, world);
-  return nall;
-}
-
-/* ----------------------------------------------------------------------
-   identify whether each atom is in a SHAKE cluster
-   only include atoms in fix group and those bonds/angles specified in input
-   test whether all clusters are valid
-   set shake_flag, shake_atom, shake_type values
-   set bond,angle types negative so will be ignored in neighbor lists
-   collective: uses MPI_Allreduce on world and passes buffers around a ring
-     of all procs 3 separate times (messtag 1,2,3), so every proc must call it
-   calls error->all() if partner info is missing, if an atom has more than
-     3 SHAKE bonds, or if two connected atoms both have more than 1 SHAKE bond
-   at the end prints cluster counts on proc 0, calls upload() on the
-     cu_shake_* arrays and calls Cuda_FixShakeCuda_Init()
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::find_clusters()
-{
-  int i, j, m, n;
-  int flag, flag_all, messtag, loop, nbuf, nbufmax, size;
-  double massone;
-  int* buf, *bufcopy;
-  MPI_Request request;
-  MPI_Status status;
-
-  if(me == 0 && screen) fprintf(screen, "Finding SHAKE clusters ...\n");
-
-  // local copies of atom ptrs
-
-  int* tag = atom->tag;
-  int* type = atom->type;
-  int* mask = atom->mask;
-  double* mass = atom->mass;
-  double* rmass = atom->rmass;
-  int** bond_type = atom->bond_type;
-  int** angle_type = atom->angle_type;
-  int** nspecial = atom->nspecial;
-  int** special = atom->special;
-  int nlocal = atom->nlocal;
-
-  // setup ring of procs
-
-  int next = me + 1;
-  int prev = me - 1;
-
-  if(next == nprocs) next = 0;  // last proc sends to proc 0
-
-  if(prev < 0) prev = nprocs - 1;  // proc 0 receives from last proc
-
-  // -----------------------------------------------------
-  // allocate arrays for self (1d) and bond partners (2d)
-  // max = max # of bond partners for owned atoms = 2nd dim of partner arrays
-  // npartner[i] = # of bonds attached to atom i
-  // nshake[i] = # of SHAKE bonds attached to atom i
-  // partner_tag[i][] = global IDs of each partner
-  // partner_mask[i][] = mask of each partner
-  // partner_type[i][] = type of each partner
-  // partner_massflag[i][] = 1 if partner meets mass criterion, 0 if not
-  // partner_bondtype[i][] = type of bond attached to each partner
-  // partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not
-  // partner_nshake[i][] = nshake value for each partner
-  // -----------------------------------------------------
-
-  int max = 0;
-
-  for(i = 0; i < nlocal; i++) max = MAX(max, nspecial[i][0]);  // nspecial[i][0] is used below as the partner count of atom i
-
-  int* npartner, *nshake;
-  memory->create(npartner, nlocal, "shake:npartner");
-  memory->create(nshake, nlocal, "shake:nshake");
-
-  int** partner_tag, **partner_mask, **partner_type, **partner_massflag;
-  int** partner_bondtype, **partner_shake, **partner_nshake;
-  memory->create(partner_tag, nlocal, max, "shake:partner_tag");
-  memory->create(partner_mask, nlocal, max, "shake:partner_mask");
-  memory->create(partner_type, nlocal, max, "shake:partner_type");
-  memory->create(partner_massflag, nlocal, max, "shake:partner_massflag");
-  memory->create(partner_bondtype, nlocal, max, "shake:partner_bondtype");
-  memory->create(partner_shake, nlocal, max, "shake:partner_shake");
-  memory->create(partner_nshake, nlocal, max, "shake:partner_nshake");
-
-  // -----------------------------------------------------
-  // set npartner and partner_tag from special arrays
-  // -----------------------------------------------------
-
-  for(i = 0; i < nlocal; i++) {
-    npartner[i] = nspecial[i][0];
-
-    for(j = 0; j < npartner[i]; j++) partner_tag[i][j] = special[i][j];
-  }
-
-  // -----------------------------------------------------
-  // set partner_mask, partner_type, partner_massflag, partner_bondtype
-  //   for bonded partners
-  // requires communication for off-proc partners
-  // -----------------------------------------------------
-
-  // fill in mask, type, massflag, bondtype if own bond partner
-  // info to store in buf for each off-proc bond = nper = 6
-  //   2 atoms IDs in bond, space for mask, type, massflag, bondtype
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  int nper = 6;
-
-  nbuf = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    for(j = 0; j < npartner[i]; j++) {
-      partner_mask[i][j] = 0;
-      partner_type[i][j] = 0;
-      partner_massflag[i][j] = 0;
-      partner_bondtype[i][j] = 0;
-
-      m = atom->map(partner_tag[i][j]);  // local index of partner, treated as owned only if 0 <= m < nlocal
-
-      if(m >= 0 && m < nlocal) {
-        partner_mask[i][j] = mask[m];
-        partner_type[i][j] = type[m];
-
-        if(nmass) {
-          if(rmass) massone = rmass[m];
-          else massone = mass[type[m]];
-
-          partner_massflag[i][j] = masscheck(massone);
-        }
-
-        n = bondfind(i, tag[i], partner_tag[i][j]);  // first try atom i; n >= 0 indexes bond_type[i]
-
-        if(n >= 0) partner_bondtype[i][j] = bond_type[i][n];
-        else {
-          n = bondfind(m, tag[i], partner_tag[i][j]);  // else try the partner; n >= 0 indexes bond_type[m]
-
-          if(n >= 0) partner_bondtype[i][j] = bond_type[m][n];
-        }
-      } else nbuf += nper;  // off-proc partner: one nper-sized record is needed
-    }
-  }
-
-  MPI_Allreduce(&nbuf, &nbufmax, 1, MPI_INT, MPI_MAX, world);  // all procs allocate the largest size so any buffer fits
-
-  buf = new int[nbufmax];
-  bufcopy = new int[nbufmax];
-
-  // fill buffer with info
-
-  size = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    for(j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-
-      if(m < 0 || m >= nlocal) {
-        buf[size] = tag[i];
-        buf[size + 1] = partner_tag[i][j];
-        buf[size + 2] = 0;  // mask slot, filled by the proc that owns the partner
-        buf[size + 3] = 0;  // type slot, filled by the proc that owns the partner
-        buf[size + 4] = 0;  // massflag slot, filled by the proc that owns the partner
-        n = bondfind(i, tag[i], partner_tag[i][j]);
-
-        if(n >= 0) buf[size + 5] = bond_type[i][n];
-        else buf[size + 5] = 0;  // 0 = not found with atom i, owner of partner searches its own list
-
-        size += nper;
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-  // when receive buffer, scan bond partner IDs for atoms I own
-  // if I own partner:
-  //   fill in mask and type and massflag
-  //   search for bond with 1st atom and fill in bondtype
-
-  messtag = 1;
-
-  for(loop = 0; loop < nprocs; loop++) {
-    i = 0;
-
-    while(i < size) {
-      m = atom->map(buf[i + 1]);
-
-      if(m >= 0 && m < nlocal) {
-        buf[i + 2] = mask[m];
-        buf[i + 3] = type[m];
-
-        if(nmass) {
-          if(rmass) massone = rmass[m];
-          else massone = mass[type[m]];
-
-          buf[i + 4] = masscheck(massone);
-        }
-
-        if(buf[i + 5] == 0) {  // sender did not find the bond, search with my atom m
-          n = bondfind(m, buf[i], buf[i + 1]);
-
-          if(n >= 0) buf[i + 5] = bond_type[m][n];
-        }
-      }
-
-      i += nper;
-    }
-
-    if(me != next) {  // me == next only when nprocs == 1, then there is nothing to exchange
-      MPI_Irecv(bufcopy, nbufmax, MPI_INT, prev, messtag, world, &request);  // receive is posted before the blocking send
-      MPI_Send(buf, size, MPI_INT, next, messtag, world);
-      MPI_Wait(&request, &status);
-      MPI_Get_count(&status, MPI_INT, &size);  // size = # of ints received from prev
-
-      for(j = 0; j < size; j++) buf[j] = bufcopy[j];
-    }
-  }
-
-  // store partner info returned to me
-
-  m = 0;
-
-  while(m < size) {
-    i = atom->map(buf[m]);  // buf[m] is the tag[i] value stored when the record was built
-
-    for(j = 0; j < npartner[i]; j++)  // NOTE(review): no check that i >= 0 or that a match was found (j < npartner[i])
-      if(buf[m + 1] == partner_tag[i][j]) break;
-
-    partner_mask[i][j] = buf[m + 2];
-    partner_type[i][j] = buf[m + 3];
-    partner_massflag[i][j] = buf[m + 4];
-    partner_bondtype[i][j] = buf[m + 5];
-    m += nper;
-  }
-
-  delete [] buf;
-  delete [] bufcopy;
-
-  // error check for unfilled partner info
-  // if partner_type not set, is an error
-  // partner_bondtype may not be set if special list is not consistent
-  //   with bondatom (e.g. due to delete_bonds command)
-  // this is OK if one or both atoms are not in fix group, since
-  //   bond won't be SHAKEn anyway
-  // else it's an error
-
-  flag = 0;
-
-  for(i = 0; i < nlocal; i++)
-    for(j = 0; j < npartner[i]; j++) {
-      if(partner_type[i][j] == 0) flag = 1;  // still the initial 0: no proc filled in this partner
-
-      if(!(mask[i] & groupbit)) continue;
-
-      if(!(partner_mask[i][j] & groupbit)) continue;
-
-      if(partner_bondtype[i][j] == 0) flag = 1;
-    }
-
-  MPI_Allreduce(&flag, &flag_all, 1, MPI_INT, MPI_SUM, world);  // flag_all > 0 if any proc set its flag
-
-  if(flag_all) error->all(FLERR, "Did not find fix shake partner info");
-
-  // -----------------------------------------------------
-  // identify SHAKEable bonds
-  // set nshake[i] = # of SHAKE bonds attached to atom i
-  // set partner_shake[i][] = 1 if SHAKE bonded to partner, 0 if not
-  // both atoms must be in group, bondtype must be > 0
-  // check if bondtype is in input bond_flag
-  // check if type of either atom is in input type_flag
-  // check if mass of either atom is in input mass_list
-  // -----------------------------------------------------
-
-  int np;
-
-  for(i = 0; i < nlocal; i++) {
-    nshake[i] = 0;
-    np = npartner[i];
-
-    for(j = 0; j < np; j++) {
-      partner_shake[i][j] = 0;
-
-      if(!(mask[i] & groupbit)) continue;
-
-      if(!(partner_mask[i][j] & groupbit)) continue;
-
-      if(partner_bondtype[i][j] <= 0) continue;
-
-      if(bond_flag[partner_bondtype[i][j]]) {  // criterion 1: bond type
-        partner_shake[i][j] = 1;
-        nshake[i]++;
-        continue;
-      }
-
-      if(type_flag[type[i]] || type_flag[partner_type[i][j]]) {  // criterion 2: atom type of either end
-        partner_shake[i][j] = 1;
-        nshake[i]++;
-        continue;
-      }
-
-      if(nmass) {  // criterion 3: mass of either end, only if mass_list is non-empty
-        if(partner_massflag[i][j]) {
-          partner_shake[i][j] = 1;
-          nshake[i]++;
-          continue;
-        } else {
-          if(rmass) massone = rmass[i];
-          else massone = mass[type[i]];
-
-          if(masscheck(massone)) {
-            partner_shake[i][j] = 1;
-            nshake[i]++;
-            continue;
-          }
-        }
-      }
-    }
-  }
-
-  // -----------------------------------------------------
-  // set partner_nshake for bonded partners
-  // requires communication for off-proc partners
-  // -----------------------------------------------------
-
-  // fill in partner_nshake if own bond partner
-  // info to store in buf for each off-proc bond =
-  //   2 atoms IDs in bond, space for nshake value
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  nbuf = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    for(j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-
-      if(m >= 0 && m < nlocal) partner_nshake[i][j] = nshake[m];
-      else nbuf += 3;
-    }
-  }
-
-  MPI_Allreduce(&nbuf, &nbufmax, 1, MPI_INT, MPI_MAX, world);
-
-  buf = new int[nbufmax];
-  bufcopy = new int[nbufmax];
-
-  // fill buffer with info
-
-  size = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    for(j = 0; j < npartner[i]; j++) {
-      m = atom->map(partner_tag[i][j]);
-
-      if(m < 0 || m >= nlocal) {
-        buf[size] = tag[i];
-        buf[size + 1] = partner_tag[i][j];
-        size += 3;  // NOTE(review): buf[size + 2] is not written here, only by the proc that owns the partner
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-  // when receive buffer, scan bond partner IDs for atoms I own
-  // if I own partner, fill in nshake value
-
-  messtag = 2;
-
-  for(loop = 0; loop < nprocs; loop++) {
-    i = 0;
-
-    while(i < size) {
-      m = atom->map(buf[i + 1]);
-
-      if(m >= 0 && m < nlocal) buf[i + 2] = nshake[m];
-
-      i += 3;
-    }
-
-    if(me != next) {  // same exchange pattern as the messtag 1 ring above
-      MPI_Irecv(bufcopy, nbufmax, MPI_INT, prev, messtag, world, &request);
-      MPI_Send(buf, size, MPI_INT, next, messtag, world);
-      MPI_Wait(&request, &status);
-      MPI_Get_count(&status, MPI_INT, &size);
-
-      for(j = 0; j < size; j++) buf[j] = bufcopy[j];
-    }
-  }
-
-  // store partner info returned to me
-
-  m = 0;
-
-  while(m < size) {
-    i = atom->map(buf[m]);
-
-    for(j = 0; j < npartner[i]; j++)  // NOTE(review): as above, the result of this search is not checked
-      if(buf[m + 1] == partner_tag[i][j]) break;
-
-    partner_nshake[i][j] = buf[m + 2];
-    m += 3;
-  }
-
-  delete [] buf;
-  delete [] bufcopy;
-
-  // -----------------------------------------------------
-  // error checks
-  // no atom with nshake > 3
-  // no connected atoms which both have nshake > 1
-  // -----------------------------------------------------
-
-  flag = 0;
-
-  for(i = 0; i < nlocal; i++) if(nshake[i] > 3) flag = 1;
-
-  MPI_Allreduce(&flag, &flag_all, 1, MPI_INT, MPI_SUM, world);
-
-  if(flag_all) error->all(FLERR, "Shake cluster of more than 4 atoms");
-
-  flag = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    if(nshake[i] <= 1) continue;  // only atoms with nshake > 1 are tested
-
-    for(j = 0; j < npartner[i]; j++)
-      if(partner_shake[i][j] && partner_nshake[i][j] > 1) flag = 1;
-  }
-
-  MPI_Allreduce(&flag, &flag_all, 1, MPI_INT, MPI_SUM, world);
-
-  if(flag_all) error->all(FLERR, "Shake clusters are connected");
-
-  // -----------------------------------------------------
-  // set SHAKE arrays that are stored with atoms & add angle constraints
-  // zero shake arrays for all owned atoms
-  // if I am central atom set shake_flag & shake_atom & shake_type
-  // for 2-atom clusters, I am central atom if my atom ID < partner ID
-  // for 3-atom clusters, test for angle constraint
-  //   angle will be stored by this atom if it exists
-  //   if angle type matches angle_flag, then it is angle-constrained
-  // shake_flag[] = 0 if atom not in SHAKE cluster
-  //                2,3,4 = size of bond-only cluster
-  //                1 = 3-atom angle cluster
-  // shake_atom[][] = global IDs of 2,3,4 atoms in cluster
-  //                  central atom is 1st
-  //                  for 2-atom cluster, lowest ID is 1st
-  // shake_type[][] = bondtype of each bond in cluster
-  //                  for 3-atom angle cluster, 3rd value is angletype
-  // -----------------------------------------------------
-
-  for(i = 0; i < nlocal; i++) {
-    shake_flag[i] = 0;
-    shake_atom[i][0] = 0;
-    shake_atom[i][1] = 0;
-    shake_atom[i][2] = 0;
-    shake_atom[i][3] = 0;
-    shake_type[i][0] = 0;
-    shake_type[i][1] = 0;
-    shake_type[i][2] = 0;
-
-    if(nshake[i] == 1) {
-      for(j = 0; j < npartner[i]; j++)
-        if(partner_shake[i][j]) break;  // j = index of the first SHAKE partner
-
-      if(partner_nshake[i][j] == 1 && tag[i] < partner_tag[i][j]) {  // only the lower ID of the pair stores the cluster
-        shake_flag[i] = 2;
-        shake_atom[i][0] = tag[i];
-        shake_atom[i][1] = partner_tag[i][j];
-        shake_type[i][0] = partner_bondtype[i][j];
-      }
-    }
-
-    if(nshake[i] > 1) {
-      shake_flag[i] = 1;  // used as the next free slot index below, ends as 1 + # of SHAKE partners
-      shake_atom[i][0] = tag[i];
-
-      for(j = 0; j < npartner[i]; j++)
-        if(partner_shake[i][j]) {
-          m = shake_flag[i];
-          shake_atom[i][m] = partner_tag[i][j];
-          shake_type[i][m - 1] = partner_bondtype[i][j];
-          shake_flag[i]++;
-        }
-    }
-
-    if(nshake[i] == 2) {
-      n = anglefind(i, shake_atom[i][1], shake_atom[i][2]);  // n >= 0 indexes angle_type[i]
-
-      if(n < 0) continue;
-
-      if(angle_type[i][n] < 0) continue;
-
-      if(angle_flag[angle_type[i][n]]) {
-        shake_flag[i] = 1;  // overrides the 3 set above: 3-atom cluster with constrained angle
-        shake_type[i][2] = angle_type[i][n];
-      }
-    }
-  }
-
-  // -----------------------------------------------------
-  // set shake_flag,shake_atom,shake_type for non-central atoms
-  // requires communication for off-proc atoms
-  // -----------------------------------------------------
-
-  // fill in shake arrays for each bond partner I own
-  // info to store in buf for each off-proc bond =
-  //   all values from shake_flag, shake_atom, shake_type
-  // nbufmax = largest buffer needed to hold info from any proc
-
-  nbuf = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    if(shake_flag[i] == 0) continue;
-
-    for(j = 0; j < npartner[i]; j++) {
-      if(partner_shake[i][j] == 0) continue;
-
-      m = atom->map(partner_tag[i][j]);
-
-      if(m >= 0 && m < nlocal) {  // owned partner: copy the cluster record of atom i to it
-        shake_flag[m] = shake_flag[i];
-        shake_atom[m][0] = shake_atom[i][0];
-        shake_atom[m][1] = shake_atom[i][1];
-        shake_atom[m][2] = shake_atom[i][2];
-        shake_atom[m][3] = shake_atom[i][3];
-        shake_type[m][0] = shake_type[i][0];
-        shake_type[m][1] = shake_type[i][1];
-        shake_type[m][2] = shake_type[i][2];
-      } else nbuf += 9;  // off-proc: partner ID + flag + 4 atom IDs + 3 types
-    }
-  }
-
-  MPI_Allreduce(&nbuf, &nbufmax, 1, MPI_INT, MPI_MAX, world);
-
-  buf = new int[nbufmax];
-  bufcopy = new int[nbufmax];
-
-  // fill buffer with info
-
-  size = 0;
-
-  for(i = 0; i < nlocal; i++) {
-    if(shake_flag[i] == 0) continue;
-
-    for(j = 0; j < npartner[i]; j++) {
-      if(partner_shake[i][j] == 0) continue;
-
-      m = atom->map(partner_tag[i][j]);
-
-      if(m < 0 || m >= nlocal) {
-        buf[size] = partner_tag[i][j];  // ID of the atom that should receive this record
-        buf[size + 1] = shake_flag[i];
-        buf[size + 2] = shake_atom[i][0];
-        buf[size + 3] = shake_atom[i][1];
-        buf[size + 4] = shake_atom[i][2];
-        buf[size + 5] = shake_atom[i][3];
-        buf[size + 6] = shake_type[i][0];
-        buf[size + 7] = shake_type[i][1];
-        buf[size + 8] = shake_type[i][2];
-        size += 9;
-      }
-    }
-  }
-
-  // cycle buffer around ring of procs back to self
-  // when receive buffer, scan for ID that I own
-  // if I own ID, fill in shake array values
-
-  messtag = 3;
-
-  for(loop = 0; loop < nprocs; loop++) {
-    i = 0;
-
-    while(i < size) {
-      m = atom->map(buf[i]);
-
-      if(m >= 0 && m < nlocal) {  // I own the target atom, store the record with it
-        shake_flag[m] = buf[i + 1];
-        shake_atom[m][0] = buf[i + 2];
-        shake_atom[m][1] = buf[i + 3];
-        shake_atom[m][2] = buf[i + 4];
-        shake_atom[m][3] = buf[i + 5];
-        shake_type[m][0] = buf[i + 6];
-        shake_type[m][1] = buf[i + 7];
-        shake_type[m][2] = buf[i + 8];
-      }
-
-      i += 9;
-    }
-
-    if(me != next) {  // same exchange pattern as the messtag 1 ring above
-      MPI_Irecv(bufcopy, nbufmax, MPI_INT, prev, messtag, world, &request);
-      MPI_Send(buf, size, MPI_INT, next, messtag, world);
-      MPI_Wait(&request, &status);
-      MPI_Get_count(&status, MPI_INT, &size);
-
-      for(j = 0; j < size; j++) buf[j] = bufcopy[j];
-    }
-  }
-
-  delete [] buf;
-  delete [] bufcopy;
-
-  // -----------------------------------------------------
-  // free local memory
-  // -----------------------------------------------------
-
-  memory->destroy(npartner);
-  memory->destroy(nshake);
-  memory->destroy(partner_tag);
-  memory->destroy(partner_mask);
-  memory->destroy(partner_type);
-  memory->destroy(partner_massflag);
-  memory->destroy(partner_bondtype);
-  memory->destroy(partner_shake);
-  memory->destroy(partner_nshake);
-
-  // -----------------------------------------------------
-  // set bond_type and angle_type negative for SHAKE clusters
-  // must set for all SHAKE bonds and angles stored by each atom
-  // -----------------------------------------------------
-
-  for(i = 0; i < nlocal; i++) {
-    if(shake_flag[i] == 0) continue;
-    else if(shake_flag[i] == 1) {  // 3-atom angle cluster: 2 bonds + 1 angle
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];  // n < 0 is skipped, nothing to negate for atom i
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = anglefind(i, shake_atom[i][1], shake_atom[i][2]);
-
-      if(n >= 0) angle_type[i][n] = -angle_type[i][n];
-    } else if(shake_flag[i] == 2) {  // 1 bond
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    } else if(shake_flag[i] == 3) {  // 2 bonds
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    } else if(shake_flag[i] == 4) {  // 3 bonds
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][1]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][2]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-
-      n = bondfind(i, shake_atom[i][0], shake_atom[i][3]);
-
-      if(n >= 0) bond_type[i][n] = -bond_type[i][n];
-    }
-  }
-
-  // -----------------------------------------------------
-  // print info on SHAKE clusters
-  // -----------------------------------------------------
-
-  int count1, count2, count3, count4;
-  count1 = count2 = count3 = count4 = 0;
-
-  for(i = 0; i < nlocal; i++) {  // countN = # of owned atoms whose shake_flag is N
-    if(shake_flag[i] == 1) count1++;
-    else if(shake_flag[i] == 2) count2++;
-    else if(shake_flag[i] == 3) count3++;
-    else if(shake_flag[i] == 4) count4++;
-  }
-
-  for(int i = 0; i < nlocal; i++) {  // NOTE(review): empty loop body, has no effect
-  }
-
-
-  int tmp;  // holds the local count while MPI_Allreduce writes the global sum into countN
-  tmp = count1;
-  MPI_Allreduce(&tmp, &count1, 1, MPI_INT, MPI_SUM, world);
-  tmp = count2;
-  MPI_Allreduce(&tmp, &count2, 1, MPI_INT, MPI_SUM, world);
-  tmp = count3;
-  MPI_Allreduce(&tmp, &count3, 1, MPI_INT, MPI_SUM, world);
-  tmp = count4;
-  MPI_Allreduce(&tmp, &count4, 1, MPI_INT, MPI_SUM, world);
-
-  if(me == 0) {
-    if(screen) {
-      fprintf(screen, "  %d = # of size 2 clusters\n", count2 / 2);  // counts are per atom, so divide by atoms per cluster
-      fprintf(screen, "  %d = # of size 3 clusters\n", count3 / 3);
-      fprintf(screen, "  %d = # of size 4 clusters\n", count4 / 4);
-      fprintf(screen, "  %d = # of frozen angles\n", count1 / 3);
-    }
-
-    if(logfile) {
-      fprintf(logfile, "  %d = # of size 2 clusters\n", count2 / 2);
-      fprintf(logfile, "  %d = # of size 3 clusters\n", count3 / 3);
-      fprintf(logfile, "  %d = # of size 4 clusters\n", count4 / 4);
-      fprintf(logfile, "  %d = # of frozen angles\n", count1 / 3);
-    }
-  }
-
-  cu_shake_flag->upload();  // presumably copies the host array to its device buffer - TODO confirm in cuda_data.h
-  cu_shake_atom->upload();
-  cu_shake_type->upload();
-  Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                         cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                         cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                         max_iter, tolerance);
-
-}
-
-/* ----------------------------------------------------------------------
-   exchange the SHAKE cluster records stored with atoms i and j
-   swaps shake_flag, all 4 shake_atom entries and all 3 shake_type entries
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::swap_clusters(int i, int j)
-{
-  int held;
-
-  // cluster flag
-
-  held = shake_flag[i];
-  shake_flag[i] = shake_flag[j];
-  shake_flag[j] = held;
-
-  // global IDs of the cluster atoms
-
-  for(int k = 0; k < 4; k++) {
-    held = shake_atom[i][k];
-    shake_atom[i][k] = shake_atom[j][k];
-    shake_atom[j][k] = held;
-  }
-
-  // bond/angle types of the cluster
-
-  for(int k = 0; k < 3; k++) {
-    held = shake_type[i][k];
-    shake_type[i][k] = shake_type[j][k];
-    shake_type[j][k] = held;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   compare massone against the nmass entries of mass_list
-   return 1 as soon as one entry differs from it by no more than MASSDELTA
-   return 0 if no entry matches, which includes the case nmass = 0
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::masscheck(double massone)
-{
-  int found = 0;
-  int k = 0;
-
-  // stop scanning at the first entry that is close enough
-
-  while(!found && k < nmass) {
-    if(fabs(mass_list[k] - massone) <= MASSDELTA) found = 1;
-
-    k++;
-  }
-
-  return found;
-}
-
-/* ----------------------------------------------------------------------
-   compute xshake = position each atom would have without constraints
-     xshake = x + dtv*v + (dtfsq/mass)*f for atoms with shake_flag set
-     xshake = 0.0 for all other atoms
-   once cuda->finished_setup is set, only Cuda_FixShakeCuda_UnconstrainedUpdate()
-     is called; before that the loop below runs and cu_xshake->upload() follows
-   assumes NVE update, seems to be accurate enough for NVT,NPT,NPH as well
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::unconstrained_update()
-{
-  if(!cuda->finished_setup) {
-    for(int n = 0; n < nlocal; n++) {
-      if(!shake_flag[n]) {
-        xshake[n][0] = 0.0;
-        xshake[n][1] = 0.0;
-        xshake[n][2] = 0.0;
-        continue;
-      }
-
-      // per-atom mass if rmass is set, else per-type mass
-
-      double dtfmsq;
-
-      if(rmass) dtfmsq = dtfsq / rmass[n];
-      else dtfmsq = dtfsq / mass[type[n]];
-
-      for(int d = 0; d < 3; d++)
-        xshake[n][d] = x[n][d] + dtv * v[n][d] + dtfmsq * f[n][d];
-    }
-
-    cu_xshake->upload();
-    return;
-  }
-
-  Cuda_FixShakeCuda_UnconstrainedUpdate(&cuda->shared_data);
-}
-
-/* ----------------------------------------------------------------------
-   SHAKE for the 2-atom cluster stored with local atom m
-   one bond constraint -> one multiplier lamda from a quadratic equation
-   of the two roots, the one with the smaller magnitude is used
-   forces are only added for atoms whose local index is < nlocal
-   virial contribution is passed to v_tally() if evflag is set
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::shake2(int m)
-{
-  // local indices of both atoms and the constraint length of their bond
-
-  const int ia = atom->map(shake_atom[m][0]);
-  const int ib = atom->map(shake_atom[m][1]);
-  const double target = bond_distance[shake_type[m][0]];
-
-  // rold = separation at current positions
-  // rnew = separation after unconstrained update
-  // both are passed through minimum_image()
-
-  double rold[3], rnew[3];
-
-  for(int d = 0; d < 3; d++) rold[d] = x[ia][d] - x[ib][d];
-
-  domain->minimum_image(rold);
-
-  for(int d = 0; d < 3; d++) rnew[d] = xshake[ia][d] - xshake[ib][d];
-
-  domain->minimum_image(rnew);
-
-  const double roldsq = rold[0] * rold[0] + rold[1] * rold[1] + rold[2] * rold[2];
-  const double rnewsq = rnew[0] * rnew[0] + rnew[1] * rnew[1] + rnew[2] * rnew[2];
-
-  // inverse masses: per-atom if rmass is set, else per-type
-
-  double inva, invb;
-
-  if(rmass) {
-    inva = 1.0 / rmass[ia];
-    invb = 1.0 / rmass[ib];
-  } else {
-    inva = 1.0 / mass[type[ia]];
-    invb = 1.0 / mass[type[ib]];
-  }
-
-  // qa,qb,qc = coeffs of the quadratic equation for lamda
-
-  const double invsum = inva + invb;
-  const double qa = invsum * invsum * roldsq;
-  const double qb = 2.0 * invsum *
-                    (rnew[0] * rold[0] + rnew[1] * rold[1] + rnew[2] * rold[2]);
-  const double qc = rnewsq - target * target;
-
-  // a negative discriminant is reported and then treated as zero
-
-  double disc = qb * qb - 4.0 * qa * qc;
-
-  if(disc < 0.0) {
-    error->warning(FLERR, "Shake determinant < 0.0");
-    disc = 0.0;
-  }
-
-  // both roots, keep the one closer to zero (first root wins a tie)
-
-  const double root = sqrt(disc);
-  const double root_plus = (-qb + root) / (2.0 * qa);
-  const double root_minus = (-qb - root) / (2.0 * qa);
-
-  double lamda;
-
-  if(fabs(root_plus) <= fabs(root_minus)) lamda = root_plus;
-  else lamda = root_minus;
-
-  lamda /= dtfsq;
-
-  // equal and opposite force along rold, applied only where index < nlocal
-
-  if(ia < nlocal) {
-    for(int d = 0; d < 3; d++) f[ia][d] += lamda * rold[d];
-  }
-
-  if(ib < nlocal) {
-    for(int d = 0; d < 3; d++) f[ib][d] -= lamda * rold[d];
-  }
-
-  // virial: 6 terms in the order xx,yy,zz,xy,xz,yz of rold
-
-  if(evflag) {
-    int owned[2];
-    int nowned = 0;
-
-    if(ia < nlocal) owned[nowned++] = ia;
-
-    if(ib < nlocal) owned[nowned++] = ib;
-
-    double vterm[6];
-    vterm[0] = lamda * rold[0] * rold[0];
-    vterm[1] = lamda * rold[1] * rold[1];
-    vterm[2] = lamda * rold[2] * rold[2];
-    vterm[3] = lamda * rold[0] * rold[1];
-    vterm[4] = lamda * rold[0] * rold[2];
-    vterm[5] = lamda * rold[1] * rold[2];
-
-    v_tally(nowned, owned, 2.0, vterm);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   SHAKE for the cluster stored with local atom m, using 3 atoms
-     (shake_atom[m][0..2]) and 2 bond constraints (bond_distance of
-     shake_type[m][0] and shake_type[m][1]), both bonds share atom 0
-   iterates on lamda01,lamda02 until both change by no more than tolerance
-     or max_iter iterations have been done
-   calls error->one() if the 2x2 matrix has determinant exactly 0.0
-   adds forces to f only for atoms whose local index is < nlocal
-   virial contribution is passed to v_tally() if evflag is set
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::shake3(int m)
-{
-  int nlist, list[3];
-  double v[6];  // local virial terms; inside this function the name v refers to this array
-  double invmass0, invmass1, invmass2;
-
-  // local atom IDs and constraint distances
-
-  int i0 = atom->map(shake_atom[m][0]);
-  int i1 = atom->map(shake_atom[m][1]);
-  int i2 = atom->map(shake_atom[m][2]);
-  double bond1 = bond_distance[shake_type[m][0]];
-  double bond2 = bond_distance[shake_type[m][1]];
-
-  // r01,r02 = distance vec between atoms, with PBC
-
-  double r01[3];
-  r01[0] = x[i0][0] - x[i1][0];
-  r01[1] = x[i0][1] - x[i1][1];
-  r01[2] = x[i0][2] - x[i1][2];
-  domain->minimum_image(r01);
-
-  double r02[3];
-  r02[0] = x[i0][0] - x[i2][0];
-  r02[1] = x[i0][1] - x[i2][1];
-  r02[2] = x[i0][2] - x[i2][2];
-  domain->minimum_image(r02);
-
-  // s01,s02 = distance vec after unconstrained update, with PBC
-
-  double s01[3];
-  s01[0] = xshake[i0][0] - xshake[i1][0];
-  s01[1] = xshake[i0][1] - xshake[i1][1];
-  s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
-
-  double s02[3];
-  s02[0] = xshake[i0][0] - xshake[i2][0];
-  s02[1] = xshake[i0][1] - xshake[i2][1];
-  s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
-
-  // scalar distances between atoms
-
-  double r01sq = r01[0] * r01[0] + r01[1] * r01[1] + r01[2] * r01[2];
-  double r02sq = r02[0] * r02[0] + r02[1] * r02[1] + r02[2] * r02[2];
-  double s01sq = s01[0] * s01[0] + s01[1] * s01[1] + s01[2] * s01[2];
-  double s02sq = s02[0] * s02[0] + s02[1] * s02[1] + s02[2] * s02[2];
-
-  // matrix coeffs and rhs for lamda equations
-
-  if(rmass) {  // per-atom masses if rmass is set, else per-type masses
-    invmass0 = 1.0 / rmass[i0];
-    invmass1 = 1.0 / rmass[i1];
-    invmass2 = 1.0 / rmass[i2];
-  } else {
-    invmass0 = 1.0 / mass[type[i0]];
-    invmass1 = 1.0 / mass[type[i1]];
-    invmass2 = 1.0 / mass[type[i2]];
-  }
-
-  double a11 = 2.0 * (invmass0 + invmass1) *
-               (s01[0] * r01[0] + s01[1] * r01[1] + s01[2] * r01[2]);
-  double a12 = 2.0 * invmass0 *
-               (s01[0] * r02[0] + s01[1] * r02[1] + s01[2] * r02[2]);
-  double a21 = 2.0 * invmass0 *
-               (s02[0] * r01[0] + s02[1] * r01[1] + s02[2] * r01[2]);
-  double a22 = 2.0 * (invmass0 + invmass2) *
-               (s02[0] * r02[0] + s02[1] * r02[1] + s02[2] * r02[2]);
-
-  // inverse of matrix
-
-  double determ = a11 * a22 - a12 * a21;  // determinant of the 2x2 matrix [a11 a12; a21 a22]
-
-  if(determ == 0.0) error->one(FLERR, "Shake determinant = 0.0");  // only an exact zero is caught
-
-  double determinv = 1.0 / determ;
-
-  double a11inv = a22 * determinv;
-  double a12inv = -a12 * determinv;
-  double a21inv = -a21 * determinv;
-  double a22inv = a11 * determinv;
-
-  // quadratic correction coeffs
-
-  double r0102 = (r01[0] * r02[0] + r01[1] * r02[1] + r01[2] * r02[2]);
-
-  double quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
-  double quad1_0202 = invmass0 * invmass0 * r02sq;
-  double quad1_0102 = 2.0 * (invmass0 + invmass1) * invmass0 * r0102;
-
-  double quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
-  double quad2_0101 = invmass0 * invmass0 * r01sq;
-  double quad2_0102 = 2.0 * (invmass0 + invmass2) * invmass0 * r0102;
-
-  // iterate until converged
-
-  double lamda01 = 0.0;
-  double lamda02 = 0.0;
-  int niter = 0;
-  int done = 0;
-
-  double quad1, quad2, b1, b2, lamda01_new, lamda02_new;
-
-  while(!done && niter < max_iter) {  // NOTE(review): leaving the loop at max_iter without convergence is not reported
-    quad1 = quad1_0101 * lamda01 * lamda01 + quad1_0202 * lamda02 * lamda02 +
-            quad1_0102 * lamda01 * lamda02;
-    quad2 = quad2_0101 * lamda01 * lamda01 + quad2_0202 * lamda02 * lamda02 +
-            quad2_0102 * lamda01 * lamda02;
-
-    b1 = bond1 * bond1 - s01sq - quad1;  // rhs uses the quadratic terms of the previous iterate
-    b2 = bond2 * bond2 - s02sq - quad2;
-
-    lamda01_new = a11inv * b1 + a12inv * b2;
-    lamda02_new = a21inv * b1 + a22inv * b2;
-
-    done = 1;  // assume converged, reset below if either lamda moved by more than tolerance
-
-    if(fabs(lamda01_new - lamda01) > tolerance) done = 0;
-
-    if(fabs(lamda02_new - lamda02) > tolerance) done = 0;
-
-    lamda01 = lamda01_new;
-    lamda02 = lamda02_new;
-    niter++;
-  }
-
-  // update forces if atom is owned by this processor
-
-  lamda01 = lamda01 / dtfsq;  // scaled by 1/dtfsq before being applied to forces and virial
-  lamda02 = lamda02 / dtfsq;
-
-  if(i0 < nlocal) {  // atom 0 is part of both bonds, so it gets both contributions
-    f[i0][0] += lamda01 * r01[0] + lamda02 * r02[0];
-    f[i0][1] += lamda01 * r01[1] + lamda02 * r02[1];
-    f[i0][2] += lamda01 * r01[2] + lamda02 * r02[2];
-  }
-
-  if(i1 < nlocal) {
-    f[i1][0] -= lamda01 * r01[0];
-    f[i1][1] -= lamda01 * r01[1];
-    f[i1][2] -= lamda01 * r01[2];
-  }
-
-  if(i2 < nlocal) {
-    f[i2][0] -= lamda02 * r02[0];
-    f[i2][1] -= lamda02 * r02[1];
-    f[i2][2] -= lamda02 * r02[2];
-  }
-
-  if(evflag) {
-    nlist = 0;  // list = the cluster atoms with index < nlocal
-
-    if(i0 < nlocal) list[nlist++] = i0;
-
-    if(i1 < nlocal) list[nlist++] = i1;
-
-    if(i2 < nlocal) list[nlist++] = i2;
-
-    v[0] = lamda01 * r01[0] * r01[0] + lamda02 * r02[0] * r02[0];
-    v[1] = lamda01 * r01[1] * r01[1] + lamda02 * r02[1] * r02[1];
-    v[2] = lamda01 * r01[2] * r01[2] + lamda02 * r02[2] * r02[2];
-    v[3] = lamda01 * r01[0] * r01[1] + lamda02 * r02[0] * r02[1];
-    v[4] = lamda01 * r01[0] * r01[2] + lamda02 * r02[0] * r02[2];
-    v[5] = lamda01 * r01[1] * r01[2] + lamda02 * r02[1] * r02[2];
-
-    v_tally(nlist, list, 3.0, v);  // presumably 3.0 = total # of atoms in the cluster - TODO confirm against v_tally()
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShakeCuda::shake4(int m)
-{
-  int nlist, list[4];
-  double v[6];
-  double invmass0, invmass1, invmass2, invmass3;
-
-  // local atom IDs and constraint distances
-
-  int i0 = atom->map(shake_atom[m][0]);
-  int i1 = atom->map(shake_atom[m][1]);
-  int i2 = atom->map(shake_atom[m][2]);
-  int i3 = atom->map(shake_atom[m][3]);
-  double bond1 = bond_distance[shake_type[m][0]];
-  double bond2 = bond_distance[shake_type[m][1]];
-  double bond3 = bond_distance[shake_type[m][2]];
-
-  // r01,r02,r03 = distance vec between atoms, with PBC
-
-  double r01[3];
-  r01[0] = x[i0][0] - x[i1][0];
-  r01[1] = x[i0][1] - x[i1][1];
-  r01[2] = x[i0][2] - x[i1][2];
-  domain->minimum_image(r01);
-
-  double r02[3];
-  r02[0] = x[i0][0] - x[i2][0];
-  r02[1] = x[i0][1] - x[i2][1];
-  r02[2] = x[i0][2] - x[i2][2];
-  domain->minimum_image(r02);
-
-  double r03[3];
-  r03[0] = x[i0][0] - x[i3][0];
-  r03[1] = x[i0][1] - x[i3][1];
-  r03[2] = x[i0][2] - x[i3][2];
-  domain->minimum_image(r03);
-
-  // s01,s02,s03 = distance vec after unconstrained update, with PBC
-
-  double s01[3];
-  s01[0] = xshake[i0][0] - xshake[i1][0];
-  s01[1] = xshake[i0][1] - xshake[i1][1];
-  s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
-
-  double s02[3];
-  s02[0] = xshake[i0][0] - xshake[i2][0];
-  s02[1] = xshake[i0][1] - xshake[i2][1];
-  s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
-
-  double s03[3];
-  s03[0] = xshake[i0][0] - xshake[i3][0];
-  s03[1] = xshake[i0][1] - xshake[i3][1];
-  s03[2] = xshake[i0][2] - xshake[i3][2];
-  domain->minimum_image(s03);
-
-  // scalar distances between atoms
-
-  double r01sq = r01[0] * r01[0] + r01[1] * r01[1] + r01[2] * r01[2];
-  double r02sq = r02[0] * r02[0] + r02[1] * r02[1] + r02[2] * r02[2];
-  double r03sq = r03[0] * r03[0] + r03[1] * r03[1] + r03[2] * r03[2];
-  double s01sq = s01[0] * s01[0] + s01[1] * s01[1] + s01[2] * s01[2];
-  double s02sq = s02[0] * s02[0] + s02[1] * s02[1] + s02[2] * s02[2];
-  double s03sq = s03[0] * s03[0] + s03[1] * s03[1] + s03[2] * s03[2];
-
-  // matrix coeffs and rhs for lamda equations
-
-  if(rmass) {
-    invmass0 = 1.0 / rmass[i0];
-    invmass1 = 1.0 / rmass[i1];
-    invmass2 = 1.0 / rmass[i2];
-    invmass3 = 1.0 / rmass[i3];
-  } else {
-    invmass0 = 1.0 / mass[type[i0]];
-    invmass1 = 1.0 / mass[type[i1]];
-    invmass2 = 1.0 / mass[type[i2]];
-    invmass3 = 1.0 / mass[type[i3]];
-  }
-
-  double a11 = 2.0 * (invmass0 + invmass1) *
-               (s01[0] * r01[0] + s01[1] * r01[1] + s01[2] * r01[2]);
-  double a12 = 2.0 * invmass0 *
-               (s01[0] * r02[0] + s01[1] * r02[1] + s01[2] * r02[2]);
-  double a13 = 2.0 * invmass0 *
-               (s01[0] * r03[0] + s01[1] * r03[1] + s01[2] * r03[2]);
-  double a21 = 2.0 * invmass0 *
-               (s02[0] * r01[0] + s02[1] * r01[1] + s02[2] * r01[2]);
-  double a22 = 2.0 * (invmass0 + invmass2) *
-               (s02[0] * r02[0] + s02[1] * r02[1] + s02[2] * r02[2]);
-  double a23 = 2.0 * invmass0 *
-               (s02[0] * r03[0] + s02[1] * r03[1] + s02[2] * r03[2]);
-  double a31 = 2.0 * invmass0 *
-               (s03[0] * r01[0] + s03[1] * r01[1] + s03[2] * r01[2]);
-  double a32 = 2.0 * invmass0 *
-               (s03[0] * r02[0] + s03[1] * r02[1] + s03[2] * r02[2]);
-  double a33 = 2.0 * (invmass0 + invmass3) *
-               (s03[0] * r03[0] + s03[1] * r03[1] + s03[2] * r03[2]);
-
-  // inverse of matrix;
-
-  double determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
-                  a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
-
-  if(determ == 0.0) error->one(FLERR, "Shake determinant = 0.0");
-
-  double determinv = 1.0 / determ;
-
-  double a11inv = determinv * (a22 * a33 - a23 * a32);
-  double a12inv = -determinv * (a12 * a33 - a13 * a32);
-  double a13inv = determinv * (a12 * a23 - a13 * a22);
-  double a21inv = -determinv * (a21 * a33 - a23 * a31);
-  double a22inv = determinv * (a11 * a33 - a13 * a31);
-  double a23inv = -determinv * (a11 * a23 - a13 * a21);
-  double a31inv = determinv * (a21 * a32 - a22 * a31);
-  double a32inv = -determinv * (a11 * a32 - a12 * a31);
-  double a33inv = determinv * (a11 * a22 - a12 * a21);
-
-  // quadratic correction coeffs
-
-  double r0102 = (r01[0] * r02[0] + r01[1] * r02[1] + r01[2] * r02[2]);
-  double r0103 = (r01[0] * r03[0] + r01[1] * r03[1] + r01[2] * r03[2]);
-  double r0203 = (r02[0] * r03[0] + r02[1] * r03[1] + r02[2] * r03[2]);
-
-  double quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
-  double quad1_0202 = invmass0 * invmass0 * r02sq;
-  double quad1_0303 = invmass0 * invmass0 * r03sq;
-  double quad1_0102 = 2.0 * (invmass0 + invmass1) * invmass0 * r0102;
-  double quad1_0103 = 2.0 * (invmass0 + invmass1) * invmass0 * r0103;
-  double quad1_0203 = 2.0 * invmass0 * invmass0 * r0203;
-
-  double quad2_0101 = invmass0 * invmass0 * r01sq;
-  double quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
-  double quad2_0303 = invmass0 * invmass0 * r03sq;
-  double quad2_0102 = 2.0 * (invmass0 + invmass2) * invmass0 * r0102;
-  double quad2_0103 = 2.0 * invmass0 * invmass0 * r0103;
-  double quad2_0203 = 2.0 * (invmass0 + invmass2) * invmass0 * r0203;
-
-  double quad3_0101 = invmass0 * invmass0 * r01sq;
-  double quad3_0202 = invmass0 * invmass0 * r02sq;
-  double quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
-  double quad3_0102 = 2.0 * invmass0 * invmass0 * r0102;
-  double quad3_0103 = 2.0 * (invmass0 + invmass3) * invmass0 * r0103;
-  double quad3_0203 = 2.0 * (invmass0 + invmass3) * invmass0 * r0203;
-
-  // iterate until converged
-
-  double lamda01 = 0.0;
-  double lamda02 = 0.0;
-  double lamda03 = 0.0;
-  int niter = 0;
-  int done = 0;
-
-  double quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
-
-  while(!done && niter < max_iter) {
-    quad1 = quad1_0101 * lamda01 * lamda01 +
-            quad1_0202 * lamda02 * lamda02 +
-            quad1_0303 * lamda03 * lamda03 +
-            quad1_0102 * lamda01 * lamda02 +
-            quad1_0103 * lamda01 * lamda03 +
-            quad1_0203 * lamda02 * lamda03;
-
-    quad2 = quad2_0101 * lamda01 * lamda01 +
-            quad2_0202 * lamda02 * lamda02 +
-            quad2_0303 * lamda03 * lamda03 +
-            quad2_0102 * lamda01 * lamda02 +
-            quad2_0103 * lamda01 * lamda03 +
-            quad2_0203 * lamda02 * lamda03;
-
-    quad3 = quad3_0101 * lamda01 * lamda01 +
-            quad3_0202 * lamda02 * lamda02 +
-            quad3_0303 * lamda03 * lamda03 +
-            quad3_0102 * lamda01 * lamda02 +
-            quad3_0103 * lamda01 * lamda03 +
-            quad3_0203 * lamda02 * lamda03;
-
-    b1 = bond1 * bond1 - s01sq - quad1;
-    b2 = bond2 * bond2 - s02sq - quad2;
-    b3 = bond3 * bond3 - s03sq - quad3;
-
-    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
-    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
-    lamda03_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
-
-    done = 1;
-
-    if(fabs(lamda01_new - lamda01) > tolerance) done = 0;
-
-    if(fabs(lamda02_new - lamda02) > tolerance) done = 0;
-
-    if(fabs(lamda03_new - lamda03) > tolerance) done = 0;
-
-    lamda01 = lamda01_new;
-    lamda02 = lamda02_new;
-    lamda03 = lamda03_new;
-    niter++;
-  }
-
-  // update forces if atom is owned by this processor
-
-  lamda01 = lamda01 / dtfsq;
-  lamda02 = lamda02 / dtfsq;
-  lamda03 = lamda03 / dtfsq;
-
-  if(i0 < nlocal) {
-    f[i0][0] += lamda01 * r01[0] + lamda02 * r02[0] + lamda03 * r03[0];
-    f[i0][1] += lamda01 * r01[1] + lamda02 * r02[1] + lamda03 * r03[1];
-    f[i0][2] += lamda01 * r01[2] + lamda02 * r02[2] + lamda03 * r03[2];
-  }
-
-  if(i1 < nlocal) {
-    f[i1][0] -= lamda01 * r01[0];
-    f[i1][1] -= lamda01 * r01[1];
-    f[i1][2] -= lamda01 * r01[2];
-  }
-
-  if(i2 < nlocal) {
-    f[i2][0] -= lamda02 * r02[0];
-    f[i2][1] -= lamda02 * r02[1];
-    f[i2][2] -= lamda02 * r02[2];
-  }
-
-  if(i3 < nlocal) {
-    f[i3][0] -= lamda03 * r03[0];
-    f[i3][1] -= lamda03 * r03[1];
-    f[i3][2] -= lamda03 * r03[2];
-  }
-
-  if(evflag) {
-    nlist = 0;
-
-    if(i0 < nlocal) list[nlist++] = i0;
-
-    if(i1 < nlocal) list[nlist++] = i1;
-
-    if(i2 < nlocal) list[nlist++] = i2;
-
-    if(i3 < nlocal) list[nlist++] = i3;
-
-    v[0] = lamda01 * r01[0] * r01[0] + lamda02 * r02[0] * r02[0] + lamda03 * r03[0] * r03[0];
-    v[1] = lamda01 * r01[1] * r01[1] + lamda02 * r02[1] * r02[1] + lamda03 * r03[1] * r03[1];
-    v[2] = lamda01 * r01[2] * r01[2] + lamda02 * r02[2] * r02[2] + lamda03 * r03[2] * r03[2];
-    v[3] = lamda01 * r01[0] * r01[1] + lamda02 * r02[0] * r02[1] + lamda03 * r03[0] * r03[1];
-    v[4] = lamda01 * r01[0] * r01[2] + lamda02 * r02[0] * r02[2] + lamda03 * r03[0] * r03[2];
-    v[5] = lamda01 * r01[1] * r01[2] + lamda02 * r02[1] * r02[2] + lamda03 * r03[1] * r03[2];
-    //if(i0==7271) printf("%lf %lf %lf %lf %lf %lf\n",v[0],v[1],v[2],v[3],v[4],v[5]);
-
-    v_tally(nlist, list, 4.0, v);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShakeCuda::shake3angle(int m)
-{
-  int nlist, list[3];
-  double v[6];
-  double invmass0, invmass1, invmass2;
-
-  // local atom IDs and constraint distances
-
-  int i0 = atom->map(shake_atom[m][0]);
-  int i1 = atom->map(shake_atom[m][1]);
-  int i2 = atom->map(shake_atom[m][2]);
-  double bond1 = bond_distance[shake_type[m][0]];
-  double bond2 = bond_distance[shake_type[m][1]];
-  double bond12 = angle_distance[shake_type[m][2]];
-
-  // r01,r02,r12 = distance vec between atoms, with PBC
-
-  double r01[3];
-  r01[0] = x[i0][0] - x[i1][0];
-  r01[1] = x[i0][1] - x[i1][1];
-  r01[2] = x[i0][2] - x[i1][2];
-  domain->minimum_image(r01);
-
-  double r02[3];
-  r02[0] = x[i0][0] - x[i2][0];
-  r02[1] = x[i0][1] - x[i2][1];
-  r02[2] = x[i0][2] - x[i2][2];
-  domain->minimum_image(r02);
-
-  double r12[3];
-  r12[0] = x[i1][0] - x[i2][0];
-  r12[1] = x[i1][1] - x[i2][1];
-  r12[2] = x[i1][2] - x[i2][2];
-  domain->minimum_image(r12);
-
-  // s01,s02,s12 = distance vec after unconstrained update, with PBC
-
-  double s01[3];
-  s01[0] = xshake[i0][0] - xshake[i1][0];
-  s01[1] = xshake[i0][1] - xshake[i1][1];
-  s01[2] = xshake[i0][2] - xshake[i1][2];
-  domain->minimum_image(s01);
-
-  double s02[3];
-  s02[0] = xshake[i0][0] - xshake[i2][0];
-  s02[1] = xshake[i0][1] - xshake[i2][1];
-  s02[2] = xshake[i0][2] - xshake[i2][2];
-  domain->minimum_image(s02);
-
-  double s12[3];
-  s12[0] = xshake[i1][0] - xshake[i2][0];
-  s12[1] = xshake[i1][1] - xshake[i2][1];
-  s12[2] = xshake[i1][2] - xshake[i2][2];
-  domain->minimum_image(s12);
-
-  // scalar distances between atoms
-
-  double r01sq = r01[0] * r01[0] + r01[1] * r01[1] + r01[2] * r01[2];
-  double r02sq = r02[0] * r02[0] + r02[1] * r02[1] + r02[2] * r02[2];
-  double r12sq = r12[0] * r12[0] + r12[1] * r12[1] + r12[2] * r12[2];
-  double s01sq = s01[0] * s01[0] + s01[1] * s01[1] + s01[2] * s01[2];
-  double s02sq = s02[0] * s02[0] + s02[1] * s02[1] + s02[2] * s02[2];
-  double s12sq = s12[0] * s12[0] + s12[1] * s12[1] + s12[2] * s12[2];
-
-  // matrix coeffs and rhs for lamda equations
-
-  if(rmass) {
-    invmass0 = 1.0 / rmass[i0];
-    invmass1 = 1.0 / rmass[i1];
-    invmass2 = 1.0 / rmass[i2];
-  } else {
-    invmass0 = 1.0 / mass[type[i0]];
-    invmass1 = 1.0 / mass[type[i1]];
-    invmass2 = 1.0 / mass[type[i2]];
-  }
-
-  double a11 = 2.0 * (invmass0 + invmass1) *
-               (s01[0] * r01[0] + s01[1] * r01[1] + s01[2] * r01[2]);
-  double a12 = 2.0 * invmass0 *
-               (s01[0] * r02[0] + s01[1] * r02[1] + s01[2] * r02[2]);
-  double a13 = - 2.0 * invmass1 *
-               (s01[0] * r12[0] + s01[1] * r12[1] + s01[2] * r12[2]);
-  double a21 = 2.0 * invmass0 *
-               (s02[0] * r01[0] + s02[1] * r01[1] + s02[2] * r01[2]);
-  double a22 = 2.0 * (invmass0 + invmass2) *
-               (s02[0] * r02[0] + s02[1] * r02[1] + s02[2] * r02[2]);
-  double a23 = 2.0 * invmass2 *
-               (s02[0] * r12[0] + s02[1] * r12[1] + s02[2] * r12[2]);
-  double a31 = - 2.0 * invmass1 *
-               (s12[0] * r01[0] + s12[1] * r01[1] + s12[2] * r01[2]);
-  double a32 = 2.0 * invmass2 *
-               (s12[0] * r02[0] + s12[1] * r02[1] + s12[2] * r02[2]);
-  double a33 = 2.0 * (invmass1 + invmass2) *
-               (s12[0] * r12[0] + s12[1] * r12[1] + s12[2] * r12[2]);
-
-  // inverse of matrix
-
-  double determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
-                  a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
-
-  if(determ == 0.0) error->one(FLERR, "Shake determinant = 0.0");
-
-  double determinv = 1.0 / determ;
-
-  double a11inv = determinv * (a22 * a33 - a23 * a32);
-  double a12inv = -determinv * (a12 * a33 - a13 * a32);
-  double a13inv = determinv * (a12 * a23 - a13 * a22);
-  double a21inv = -determinv * (a21 * a33 - a23 * a31);
-  double a22inv = determinv * (a11 * a33 - a13 * a31);
-  double a23inv = -determinv * (a11 * a23 - a13 * a21);
-  double a31inv = determinv * (a21 * a32 - a22 * a31);
-  double a32inv = -determinv * (a11 * a32 - a12 * a31);
-  double a33inv = determinv * (a11 * a22 - a12 * a21);
-
-  // quadratic correction coeffs
-
-  double r0102 = (r01[0] * r02[0] + r01[1] * r02[1] + r01[2] * r02[2]);
-  double r0112 = (r01[0] * r12[0] + r01[1] * r12[1] + r01[2] * r12[2]);
-  double r0212 = (r02[0] * r12[0] + r02[1] * r12[1] + r02[2] * r12[2]);
-
-  double quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
-  double quad1_0202 = invmass0 * invmass0 * r02sq;
-  double quad1_1212 = invmass1 * invmass1 * r12sq;
-  double quad1_0102 = 2.0 * (invmass0 + invmass1) * invmass0 * r0102;
-  double quad1_0112 = - 2.0 * (invmass0 + invmass1) * invmass1 * r0112;
-  double quad1_0212 = - 2.0 * invmass0 * invmass1 * r0212;
-
-  double quad2_0101 = invmass0 * invmass0 * r01sq;
-  double quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
-  double quad2_1212 = invmass2 * invmass2 * r12sq;
-  double quad2_0102 = 2.0 * (invmass0 + invmass2) * invmass0 * r0102;
-  double quad2_0112 = 2.0 * invmass0 * invmass2 * r0112;
-  double quad2_0212 = 2.0 * (invmass0 + invmass2) * invmass2 * r0212;
-
-  double quad3_0101 = invmass1 * invmass1 * r01sq;
-  double quad3_0202 = invmass2 * invmass2 * r02sq;
-  double quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
-  double quad3_0102 = - 2.0 * invmass1 * invmass2 * r0102;
-  double quad3_0112 = - 2.0 * (invmass1 + invmass2) * invmass1 * r0112;
-  double quad3_0212 = 2.0 * (invmass1 + invmass2) * invmass2 * r0212;
-
-  // iterate until converged
-
-  double lamda01 = 0.0;
-  double lamda02 = 0.0;
-  double lamda12 = 0.0;
-  int niter = 0;
-  int done = 0;
-
-  double quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
-
-  while(!done && niter < max_iter) {
-    quad1 = quad1_0101 * lamda01 * lamda01 +
-            quad1_0202 * lamda02 * lamda02 +
-            quad1_1212 * lamda12 * lamda12 +
-            quad1_0102 * lamda01 * lamda02 +
-            quad1_0112 * lamda01 * lamda12 +
-            quad1_0212 * lamda02 * lamda12;
-
-    quad2 = quad2_0101 * lamda01 * lamda01 +
-            quad2_0202 * lamda02 * lamda02 +
-            quad2_1212 * lamda12 * lamda12 +
-            quad2_0102 * lamda01 * lamda02 +
-            quad2_0112 * lamda01 * lamda12 +
-            quad2_0212 * lamda02 * lamda12;
-
-    quad3 = quad3_0101 * lamda01 * lamda01 +
-            quad3_0202 * lamda02 * lamda02 +
-            quad3_1212 * lamda12 * lamda12 +
-            quad3_0102 * lamda01 * lamda02 +
-            quad3_0112 * lamda01 * lamda12 +
-            quad3_0212 * lamda02 * lamda12;
-
-    b1 = bond1 * bond1 - s01sq - quad1;
-    b2 = bond2 * bond2 - s02sq - quad2;
-    b3 = bond12 * bond12 - s12sq - quad3;
-
-    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
-    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
-    lamda12_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
-
-    done = 1;
-
-    if(fabs(lamda01_new - lamda01) > tolerance) done = 0;
-
-    if(fabs(lamda02_new - lamda02) > tolerance) done = 0;
-
-    if(fabs(lamda12_new - lamda12) > tolerance) done = 0;
-
-    lamda01 = lamda01_new;
-    lamda02 = lamda02_new;
-    lamda12 = lamda12_new;
-    niter++;
-  }
-
-  // update forces if atom is owned by this processor
-
-  lamda01 = lamda01 / dtfsq;
-  lamda02 = lamda02 / dtfsq;
-  lamda12 = lamda12 / dtfsq;
-
-  if(i0 < nlocal) {
-    f[i0][0] += lamda01 * r01[0] + lamda02 * r02[0];
-    f[i0][1] += lamda01 * r01[1] + lamda02 * r02[1];
-    f[i0][2] += lamda01 * r01[2] + lamda02 * r02[2];
-  }
-
-  if(i1 < nlocal) {
-    f[i1][0] -= lamda01 * r01[0] - lamda12 * r12[0];
-    f[i1][1] -= lamda01 * r01[1] - lamda12 * r12[1];
-    f[i1][2] -= lamda01 * r01[2] - lamda12 * r12[2];
-  }
-
-  if(i2 < nlocal) {
-    f[i2][0] -= lamda02 * r02[0] + lamda12 * r12[0];
-    f[i2][1] -= lamda02 * r02[1] + lamda12 * r12[1];
-    f[i2][2] -= lamda02 * r02[2] + lamda12 * r12[2];
-  }
-
-  if(evflag) {
-    nlist = 0;
-
-    if(i0 < nlocal) list[nlist++] = i0;
-
-    if(i1 < nlocal) list[nlist++] = i1;
-
-    if(i2 < nlocal) list[nlist++] = i2;
-
-    v[0] = lamda01 * r01[0] * r01[0] + lamda02 * r02[0] * r02[0] + lamda12 * r12[0] * r12[0];
-    v[1] = lamda01 * r01[1] * r01[1] + lamda02 * r02[1] * r02[1] + lamda12 * r12[1] * r12[1];
-    v[2] = lamda01 * r01[2] * r01[2] + lamda02 * r02[2] * r02[2] + lamda12 * r12[2] * r12[2];
-    v[3] = lamda01 * r01[0] * r01[1] + lamda02 * r02[0] * r02[1] + lamda12 * r12[0] * r12[1];
-    v[4] = lamda01 * r01[0] * r01[2] + lamda02 * r02[0] * r02[2] + lamda12 * r12[0] * r12[2];
-    v[5] = lamda01 * r01[1] * r01[2] + lamda02 * r02[1] * r02[2] + lamda12 * r12[1] * r12[2];
-
-    v_tally(nlist, list, 3.0, v);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   print-out bond & angle statistics
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::stats()
-{
-  int i, j, m, n, iatom, jatom, katom;
-  double delx, dely, delz;
-  double r, r1, r2, r3, angle;
-
-  // zero out accumulators
-
-  int nb = atom->nbondtypes + 1;
-  int na = atom->nangletypes + 1;
-
-  for(i = 0; i < nb; i++) {
-    b_count[i] = 0;
-    b_ave[i] = b_max[i] = 0.0;
-    b_min[i] = BIG;
-  }
-
-  for(i = 0; i < na; i++) {
-    a_count[i] = 0;
-    a_ave[i] = a_max[i] = 0.0;
-    a_min[i] = BIG;
-  }
-
-  // log stats for each bond & angle
-  // OK to double count since are just averaging
-
-  double** x = atom->x;
-  int nlocal = atom->nlocal;
-
-  for(i = 0; i < nlocal; i++) {
-    if(shake_flag[i] == 0) continue;
-
-    // bond stats
-
-    n = shake_flag[i];
-
-    if(n == 1) n = 3;
-
-    iatom = atom->map(shake_atom[i][0]);
-
-    for(j = 1; j < n; j++) {
-      jatom = atom->map(shake_atom[i][j]);
-      delx = x[iatom][0] - x[jatom][0];
-      dely = x[iatom][1] - x[jatom][1];
-      delz = x[iatom][2] - x[jatom][2];
-      domain->minimum_image(delx, dely, delz);
-      r = sqrt(delx * delx + dely * dely + delz * delz);
-
-      m = shake_type[i][j - 1];
-      b_count[m]++;
-      b_ave[m] += r;
-      b_max[m] = MAX(b_max[m], r);
-      b_min[m] = MIN(b_min[m], r);
-    }
-
-    // angle stats
-
-    if(shake_flag[i] == 1) {
-      iatom = atom->map(shake_atom[i][0]);
-      jatom = atom->map(shake_atom[i][1]);
-      katom = atom->map(shake_atom[i][2]);
-
-      delx = x[iatom][0] - x[jatom][0];
-      dely = x[iatom][1] - x[jatom][1];
-      delz = x[iatom][2] - x[jatom][2];
-      domain->minimum_image(delx, dely, delz);
-      r1 = sqrt(delx * delx + dely * dely + delz * delz);
-
-      delx = x[iatom][0] - x[katom][0];
-      dely = x[iatom][1] - x[katom][1];
-      delz = x[iatom][2] - x[katom][2];
-      domain->minimum_image(delx, dely, delz);
-      r2 = sqrt(delx * delx + dely * dely + delz * delz);
-
-      delx = x[jatom][0] - x[katom][0];
-      dely = x[jatom][1] - x[katom][1];
-      delz = x[jatom][2] - x[katom][2];
-      domain->minimum_image(delx, dely, delz);
-      r3 = sqrt(delx * delx + dely * dely + delz * delz);
-
-      angle = acos((r1 * r1 + r2 * r2 - r3 * r3) / (2.0 * r1 * r2));
-      angle *= 180.0 / MY_PI;
-      m = shake_type[i][2];
-      a_count[m]++;
-      a_ave[m] += angle;
-      a_max[m] = MAX(a_max[m], angle);
-      a_min[m] = MIN(a_min[m], angle);
-    }
-  }
-
-  // sum across all procs
-
-  MPI_Allreduce(b_count, b_count_all, nb, MPI_INT, MPI_SUM, world);
-  MPI_Allreduce(b_ave, b_ave_all, nb, MPI_DOUBLE, MPI_SUM, world);
-  MPI_Allreduce(b_max, b_max_all, nb, MPI_DOUBLE, MPI_MAX, world);
-  MPI_Allreduce(b_min, b_min_all, nb, MPI_DOUBLE, MPI_MIN, world);
-
-  MPI_Allreduce(a_count, a_count_all, na, MPI_INT, MPI_SUM, world);
-  MPI_Allreduce(a_ave, a_ave_all, na, MPI_DOUBLE, MPI_SUM, world);
-  MPI_Allreduce(a_max, a_max_all, na, MPI_DOUBLE, MPI_MAX, world);
-  MPI_Allreduce(a_min, a_min_all, na, MPI_DOUBLE, MPI_MIN, world);
-
-  // print stats only for non-zero counts
-
-  if(me == 0) {
-    if(screen) {
-      fprintf(screen,
-              "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n",
-              update->ntimestep);
-
-      for(i = 1; i < nb; i++)
-        if(b_count_all[i])
-          fprintf(screen, "  %d %g %g\n", i,
-                  b_ave_all[i] / b_count_all[i], b_max_all[i] - b_min_all[i]);
-
-      for(i = 1; i < na; i++)
-        if(a_count_all[i])
-          fprintf(screen, "  %d %g %g\n", i,
-                  a_ave_all[i] / a_count_all[i], a_max_all[i] - a_min_all[i]);
-    }
-
-    if(logfile) {
-      fprintf(logfile,
-              "SHAKE stats (type/ave/delta) on step " BIGINT_FORMAT "\n",
-              update->ntimestep);
-
-      for(i = 0; i < nb; i++)
-        if(b_count_all[i])
-          fprintf(logfile, "  %d %g %g\n", i,
-                  b_ave_all[i] / b_count_all[i], b_max_all[i] - b_min_all[i]);
-
-      for(i = 0; i < na; i++)
-        if(a_count_all[i])
-          fprintf(logfile, "  %d %g %g\n", i,
-                  a_ave_all[i] / a_count_all[i], a_max_all[i] - a_min_all[i]);
-    }
-  }
-
-  // next timestep for stats
-
-  next_output += output_every;
-}
-
-/* ----------------------------------------------------------------------
-   find a bond between global tags n1 and n2 stored with local atom i
-   return -1 if don't find it
-   return bond index if do find it
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::bondfind(int i, int n1, int n2)
-{
-  int* tag = atom->tag;
-  int** bond_atom = atom->bond_atom;
-  int nbonds = atom->num_bond[i];
-
-  int m;
-
-  for(m = 0; m < nbonds; m++) {
-    if(n1 == tag[i] && n2 == bond_atom[i][m]) break;
-
-    if(n1 == bond_atom[i][m] && n2 == tag[i]) break;
-  }
-
-  if(m < nbonds) return m;
-
-  return -1;
-}
-
-/* ----------------------------------------------------------------------
-   find an angle with global end atoms n1 and n2 stored with local atom i
-   return -1 if don't find it
-   return angle index if do find it
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::anglefind(int i, int n1, int n2)
-{
-  int** angle_atom1 = atom->angle_atom1;
-  int** angle_atom3 = atom->angle_atom3;
-  int nangles = atom->num_angle[i];
-
-  int m;
-
-  for(m = 0; m < nangles; m++) {
-    if(n1 == angle_atom1[i][m] && n2 == angle_atom3[i][m]) break;
-
-    if(n1 == angle_atom3[i][m] && n2 == angle_atom1[i][m]) break;
-  }
-
-  if(m < nangles) return m;
-
-  return -1;
-}
-
-/* ----------------------------------------------------------------------
-   memory usage of local atom-based arrays
-------------------------------------------------------------------------- */
-
-double FixShakeCuda::memory_usage()
-{
-  int nmax = atom->nmax;
-  double bytes = nmax * sizeof(int);
-  bytes += nmax * 4 * sizeof(int);
-  bytes += nmax * 3 * sizeof(int);
-  bytes += nmax * 3 * sizeof(double);
-  bytes += maxvatom * 6 * sizeof(double);
-  return bytes;
-}
-
-/* ----------------------------------------------------------------------
-   allocate local atom-based arrays
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::grow_arrays(int nmax)
-{
-  memory->grow(shake_flag, nmax, "shake:shake_flag");
-  memory->grow(shake_atom, nmax, 4, "shake:shake_atom");
-  memory->grow(shake_type, nmax, 3, "shake:shake_type");
-  memory->destroy(xshake);
-  memory->create(xshake, nmax, 3, "shake:xshake");
-
-  delete cu_shake_flag;
-  cu_shake_flag = new cCudaData<int, int, xx > (shake_flag, nmax);
-  delete cu_shake_atom;
-  cu_shake_atom = new cCudaData<int, int, yx> ((int*)shake_atom, nmax, 4);
-  delete cu_shake_type;
-  cu_shake_type = new cCudaData<int, int, yx> ((int*)shake_type, nmax, 3);
-  delete cu_xshake;
-  cu_xshake = new cCudaData<double, X_CFLOAT, xy> ((double*)xshake, nmax, 3);
-  cu_shake_flag->upload();
-  cu_shake_atom->upload();
-  cu_shake_type->upload();
-
-  if(cu_bond_distance)
-    Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                           cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                           cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                           max_iter, tolerance);
-}
-
-/* ----------------------------------------------------------------------
-   copy values within local atom-based arrays
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::copy_arrays(int i, int j, int delflag)
-{
-  int flag = shake_flag[j] = shake_flag[i];
-
-  if(flag == 1) {
-    shake_atom[j][0] = shake_atom[i][0];
-    shake_atom[j][1] = shake_atom[i][1];
-    shake_atom[j][2] = shake_atom[i][2];
-    shake_type[j][0] = shake_type[i][0];
-    shake_type[j][1] = shake_type[i][1];
-    shake_type[j][2] = shake_type[i][2];
-  } else if(flag == 2) {
-    shake_atom[j][0] = shake_atom[i][0];
-    shake_atom[j][1] = shake_atom[i][1];
-    shake_type[j][0] = shake_type[i][0];
-  } else if(flag == 3) {
-    shake_atom[j][0] = shake_atom[i][0];
-    shake_atom[j][1] = shake_atom[i][1];
-    shake_atom[j][2] = shake_atom[i][2];
-    shake_type[j][0] = shake_type[i][0];
-    shake_type[j][1] = shake_type[i][1];
-  } else if(flag == 4) {
-    shake_atom[j][0] = shake_atom[i][0];
-    shake_atom[j][1] = shake_atom[i][1];
-    shake_atom[j][2] = shake_atom[i][2];
-    shake_atom[j][3] = shake_atom[i][3];
-    shake_type[j][0] = shake_type[i][0];
-    shake_type[j][1] = shake_type[i][1];
-    shake_type[j][2] = shake_type[i][2];
-  }
-}
-
-/* ----------------------------------------------------------------------
-   initialize one atom's array values, called when atom is created
-------------------------------------------------------------------------- */
-
-void FixShakeCuda::set_arrays(int i)
-{
-  shake_flag[i] = 0;
-}
-
-/* ----------------------------------------------------------------------
-   pack values in local atom-based arrays for exchange with another proc
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::pack_exchange(int i, double* buf)
-{
-  int m = 0;
-  buf[m++] = shake_flag[i];
-  int flag = shake_flag[i];
-
-  if(flag == 1) {
-    buf[m++] = shake_atom[i][0];
-    buf[m++] = shake_atom[i][1];
-    buf[m++] = shake_atom[i][2];
-    buf[m++] = shake_type[i][0];
-    buf[m++] = shake_type[i][1];
-    buf[m++] = shake_type[i][2];
-  } else if(flag == 2) {
-    buf[m++] = shake_atom[i][0];
-    buf[m++] = shake_atom[i][1];
-    buf[m++] = shake_type[i][0];
-  } else if(flag == 3) {
-    buf[m++] = shake_atom[i][0];
-    buf[m++] = shake_atom[i][1];
-    buf[m++] = shake_atom[i][2];
-    buf[m++] = shake_type[i][0];
-    buf[m++] = shake_type[i][1];
-  } else if(flag == 4) {
-    buf[m++] = shake_atom[i][0];
-    buf[m++] = shake_atom[i][1];
-    buf[m++] = shake_atom[i][2];
-    buf[m++] = shake_atom[i][3];
-    buf[m++] = shake_type[i][0];
-    buf[m++] = shake_type[i][1];
-    buf[m++] = shake_type[i][2];
-  }
-
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   unpack values in local atom-based arrays from exchange with another proc
-------------------------------------------------------------------------- */
-
-int FixShakeCuda::unpack_exchange(int nlocal, double* buf)
-{
-  int m = 0;
-  int flag = shake_flag[nlocal] = static_cast<int>(buf[m++]);
-
-  if(flag == 1) {
-    shake_atom[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][2] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][2] = static_cast<int>(buf[m++]);
-  } else if(flag == 2) {
-    shake_atom[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][0] = static_cast<int>(buf[m++]);
-  } else if(flag == 3) {
-    shake_atom[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][2] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][1] = static_cast<int>(buf[m++]);
-  } else if(flag == 4) {
-    shake_atom[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][2] = static_cast<int>(buf[m++]);
-    shake_atom[nlocal][3] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][0] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][1] = static_cast<int>(buf[m++]);
-    shake_type[nlocal][2] = static_cast<int>(buf[m++]);
-  }
-
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   enforce SHAKE constraints from rRESPA
-   prediction portion is different than Verlet
-   rRESPA updating of atom coords is done with full v, but only portions of f
-------------------------------------------------------------------------- */
-#if 0
-void FixShakeCuda::post_force_respa(int vflag, int ilevel, int iloop)
-{
-  // call stats only on outermost level
-
-  if(ilevel == nlevels_respa - 1 && update->ntimestep == next_output) stats();
-
-  // perform SHAKE on every loop iteration of every rRESPA level
-  // except last loop iteration of inner levels
-
-  if(ilevel < nlevels_respa - 1 && iloop == loop_respa[ilevel] - 1) return;
-
-  // xshake = atom coords after next x update in innermost loop
-  // depends on rRESPA level
-  // for levels > 0 this includes more than one velocity update
-  // xshake = predicted position from call to this routine at level N =
-  // x + dt0 (v + dtN/m fN + 1/2 dt(N-1)/m f(N-1) + ... + 1/2 dt0/m f0)
-
-  double** *f_level = ((FixRespa*) modify->fix[ifix_respa])->f_level;
-  dtfsq = dtf_inner * step_respa[ilevel];
-
-  double invmass, dtfmsq;
-  int jlevel;
-
-  if(rmass) {
-    for(int i = 0; i < nlocal; i++) {
-      if(shake_flag[i]) {
-        invmass = 1.0 / rmass[i];
-        dtfmsq = dtfsq * invmass;
-        xshake[i][0] = x[i][0] + dtv * v[i][0] + dtfmsq * f[i][0];
-        xshake[i][1] = x[i][1] + dtv * v[i][1] + dtfmsq * f[i][1];
-        xshake[i][2] = x[i][2] + dtv * v[i][2] + dtfmsq * f[i][2];
-
-        for(jlevel = 0; jlevel < ilevel; jlevel++) {
-          dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass;
-          xshake[i][0] += dtfmsq * f_level[i][jlevel][0];
-          xshake[i][1] += dtfmsq * f_level[i][jlevel][1];
-          xshake[i][2] += dtfmsq * f_level[i][jlevel][2];
-        }
-      } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0;
-    }
-
-  } else {
-    for(int i = 0; i < nlocal; i++) {
-      if(shake_flag[i]) {
-        invmass = 1.0 / mass[type[i]];
-        dtfmsq = dtfsq * invmass;
-        xshake[i][0] = x[i][0] + dtv * v[i][0] + dtfmsq * f[i][0];
-        xshake[i][1] = x[i][1] + dtv * v[i][1] + dtfmsq * f[i][1];
-        xshake[i][2] = x[i][2] + dtv * v[i][2] + dtfmsq * f[i][2];
-
-        for(jlevel = 0; jlevel < ilevel; jlevel++) {
-          dtfmsq = dtf_innerhalf * step_respa[jlevel] * invmass;
-          xshake[i][0] += dtfmsq * f_level[i][jlevel][0];
-          xshake[i][1] += dtfmsq * f_level[i][jlevel][1];
-          xshake[i][2] += dtfmsq * f_level[i][jlevel][2];
-        }
-      } else xshake[i][2] = xshake[i][1] = xshake[i][0] = 0.0;
-    }
-  }
-
-  // communicate results if necessary
-
-  if(nprocs > 1) comm->forward_comm_fix(this);
-
-  // virial setup
-
-  if(vflag) v_setup(vflag);
-  else evflag = 0;
-
-  // loop over clusters
-
-  int m;
-
-  for(int i = 0; i < nlist; i++) {
-    m = list[i];
-
-    if(shake_flag[m] == 2) shake2(m);
-    else if(shake_flag[m] == 3) shake3(m);
-    else if(shake_flag[m] == 4) shake4(m);
-    else shake3angle(m);
-  }
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-int FixShakeCuda::pack_forward_comm(int n, int* list, double* buf,
-                                    int pbc_flag, int* pbc)
-{
-  if(cuda->finished_setup) {
-    int iswap = *list;
-
-    if(iswap < 0) {
-      iswap = -iswap - 1;
-      int first = ((int*) buf)[0];
-      Cuda_FixShakeCuda_PackComm_Self(&cuda->shared_data, n, iswap, first, pbc, pbc_flag);
-    } else
-      Cuda_FixShakeCuda_PackComm(&cuda->shared_data, n, iswap, (void*) buf, pbc, pbc_flag);
-
-    return 3*n;
-  }
-
-  int i, j, m;
-  double dx, dy, dz;
-
-  m = 0;
-
-  if(pbc_flag == 0) {
-    for(i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = xshake[j][0];
-      buf[m++] = xshake[j][1];
-      buf[m++] = xshake[j][2];
-    }
-  } else {
-    if(domain->triclinic == 0) {
-      dx = pbc[0] * domain->xprd;
-      dy = pbc[1] * domain->yprd;
-      dz = pbc[2] * domain->zprd;
-    } else {
-      dx = pbc[0] * domain->xprd + pbc[5] * domain->xy + pbc[4] * domain->xz;
-      dy = pbc[1] * domain->yprd + pbc[3] * domain->yz;
-      dz = pbc[2] * domain->zprd;
-    }
-
-    for(i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = xshake[j][0] + dx;
-      buf[m++] = xshake[j][1] + dy;
-      buf[m++] = xshake[j][2] + dz;
-    }
-  }
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShakeCuda::unpack_forward_comm(int n, int first, double* buf)
-{
-  if(cuda->finished_setup) {
-    Cuda_FixShakeCuda_UnpackComm(&cuda->shared_data, n, first, (void*)buf);
-    return;
-  }
-
-  int i, m, last;
-
-  m = 0;
-  last = first + n;
-
-  for(i = first; i < last; i++) {
-    xshake[i][0] = buf[m++];
-    xshake[i][1] = buf[m++];
-    xshake[i][2] = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShakeCuda::reset_dt()
-{
-  if(strstr(update->integrate_style, "verlet")) {
-    dtv = update->dt;
-    dtfsq = update->dt * update->dt * force->ftm2v;
-  } else {
-    dtv = step_respa[0];
-    dtf_innerhalf = 0.5 * step_respa[0] * force->ftm2v;
-    dtf_inner = step_respa[0] * force->ftm2v;
-  }
-
-  if(cu_shake_atom)
-    Cuda_FixShakeCuda_Init(&cuda->shared_data, dtv, dtfsq,
-                           cu_shake_flag->dev_data(), cu_shake_atom->dev_data(), cu_shake_type->dev_data(), cu_xshake->dev_data(),
-                           cu_bond_distance->dev_data(), cu_angle_distance->dev_data(), cu_virial->dev_data(),
-                           max_iter, tolerance);
-}
diff --git a/src/USER-CUDA/fix_shake_cuda.h b/src/USER-CUDA/fix_shake_cuda.h
deleted file mode 100644
index 577ea1daa4..0000000000
--- a/src/USER-CUDA/fix_shake_cuda.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(shake/cuda,FixShakeCuda)
-
-#else
-
-#ifndef LMP_FIX_SHAKE_CUDA_H
-#define LMP_FIX_SHAKE_CUDA_H
-
-#include "fix.h"
-#include "cuda_data.h"
-#include "cuda_precision.h"
-
-namespace LAMMPS_NS {
-
-class FixShakeCuda : public Fix {
- public:
-  FixShakeCuda(class LAMMPS *, int, char **);
-  ~FixShakeCuda();
-  int setmask();
-  void init();
-  void setup(int);
-  void pre_neighbor();
-  void post_force(int);
-  //void post_force_respa(int, int, int);
-
-  double memory_usage();
-  void grow_arrays(int);
-  void copy_arrays(int, int, int);
-  void set_arrays(int);
-  int pack_exchange(int, double *);
-  int unpack_exchange(int, double *);
-  int pack_forward_comm(int, int *, double *, int, int *);
-  void unpack_forward_comm(int, int, double *);
-
-  int dof(int);
-  void reset_dt();
-
-  double time_postforce;
- private:
-  class Cuda *cuda;
-  int me,nprocs;
-  double tolerance;                      // SHAKE tolerance
-  int max_iter;                          // max # of SHAKE iterations
-  int output_every;                      // SHAKE stat output every so often
-  int next_output;                       // timestep for next output
-
-                                         // settings from input command
-  int *bond_flag,*angle_flag;            // bond/angle types to constrain
-  int *type_flag;                        // constrain bonds to these types
-  double *mass_list;                     // constrain bonds to these masses
-  int nmass;                             // # of masses in mass_list
-  bool neighbor_step;                                         // was neighboring done in this step -> need to run the Cuda_FixShake_Init
-
-  double *bond_distance,*angle_distance; // constraint distances
-  cCudaData<double           , X_CFLOAT , xx >* cu_bond_distance;
-  cCudaData<double           , X_CFLOAT , xx >* cu_angle_distance;
-
-  int ifix_respa;                        // rRESPA fix needed by SHAKE
-  int nlevels_respa;                     // copies of needed rRESPA variables
-  int *loop_respa;
-  double *step_respa;
-
-  double **x,**v,**f;                    // local ptrs to atom class quantities
-  double *mass,*rmass;
-  int *type;
-  int nlocal;
-                                         // atom-based arrays
-  int *shake_flag;                       // 0 if atom not in SHAKE cluster
-                                         // 1 = size 3 angle cluster
-                                         // 2,3,4 = size of bond-only cluster
-  int **shake_atom;                      // global IDs of atoms in cluster
-                                         // central atom is 1st
-                                         // lowest global ID is 1st for size 2
-
-  int **shake_type;                      // bondtype of each bond in cluster
-                                         // for angle cluster, 3rd value
-                                         //   is angletype
-  double **xshake;                       // unconstrained atom coords
-  cCudaData<int           , int            , xx >* cu_shake_flag;
-  cCudaData<int           , int            , yx >* cu_shake_atom;
-  cCudaData<int           , int            , yx >* cu_shake_type;
-  cCudaData<double           , X_CFLOAT , xy >* cu_xshake;
-  cCudaData<int           , int            , xx >* cu_list;
-  cCudaData<double           , ENERGY_CFLOAT , xx >* cu_virial;
-
-  double dtv,dtfsq;                     // timesteps for trial move
-  double dtf_inner,dtf_innerhalf;       // timesteps for rRESPA trial move
-
-  int *list;                            // list of clusters to SHAKE
-  int nlist,maxlist;                    // size and max-size of list
-
-                                        // stat quantities
-  int *b_count,*b_count_all;            // counts for each bond type
-  double *b_ave,*b_max,*b_min;          // ave/max/min dist for each bond type
-  double *b_ave_all,*b_max_all,*b_min_all;   // MPI summing arrays
-  int *a_count,*a_count_all;            // ditto for angle types
-  double *a_ave,*a_max,*a_min;
-  double *a_ave_all,*a_max_all,*a_min_all;
-
-  void find_clusters();
-  void swap_clusters(int i,int j);
-  int masscheck(double);
-  void unconstrained_update();
-  void shake2(int);
-  void shake3(int);
-  void shake4(int);
-  void shake3angle(int);
-  void stats();
-  int bondfind(int, int, int);
-  int anglefind(int, int, int);
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.cpp b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
deleted file mode 100644
index ee08aa3462..0000000000
--- a/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
+++ /dev/null
@@ -1,219 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include "fix_temp_berendsen_cuda.h"
-#include "fix_temp_berendsen_cuda_cu.h"
-#include "atom.h"
-#include "force.h"
-#include "group.h"
-#include "update.h"
-#include "comm.h"
-#include "modify.h"
-#include "compute.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-enum{NOBIAS,BIAS};
-
-/* ---------------------------------------------------------------------- */
-
-FixTempBerendsenCuda::FixTempBerendsenCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg != 6) error->all(FLERR,"Illegal fix temp/berendsen/cuda command");
-
-  // Berendsen thermostat should be applied every step
-
-  nevery = 1;
-
-  t_start = force->numeric(FLERR,arg[3]);
-  t_stop = force->numeric(FLERR,arg[4]);
-  t_period = force->numeric(FLERR,arg[5]);
-
-  // error checks
-
-  if (t_period <= 0.0) error->all(FLERR,"Fix temp/berendsen/cuda period must be > 0.0");
-
-  // create a new compute temp style
-  // id = fix-ID + temp, compute group = fix group
-
-  int n = strlen(id) + 6;
-  id_temp = new char[n];
-  strcpy(id_temp,id);
-  strcat(id_temp,"_temp");
-
-  char **newarg = new char*[3];
-  newarg[0] = id_temp;
-  newarg[1] = group->names[igroup];
-  newarg[2] = (char *) "temp/cuda";
-  modify->add_compute(3,newarg);
-  delete [] newarg;
-  tflag = 1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixTempBerendsenCuda::~FixTempBerendsenCuda()
-{
-  // delete temperature if fix created it
-
-  if (tflag) modify->delete_compute(id_temp);
-  delete [] id_temp;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempBerendsenCuda::setmask()
-{
-  int mask = 0;
-  mask |= END_OF_STEP_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempBerendsenCuda::init()
-{
-  int icompute = modify->find_compute(id_temp);
-  if (icompute < 0)
-    error->all(FLERR,"Temperature ID for fix temp/berendsen/cuda does not exist");
-  temperature = modify->compute[icompute];
-  if(not temperature->cudable)
-        error->warning(FLERR,"Fix temp/berendsen/cuda uses non cudable temperature compute");
-  if (temperature->tempbias) which = BIAS;
-  else which = NOBIAS;
-
-  //temperature->init();        //not in original berendsen possible error?
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempBerendsenCuda::end_of_step()
-{
-  double t_current;
-  if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();}
-  t_current = temperature->compute_scalar();
-  if (t_current == 0.0)
-    error->all(FLERR,"Computed temperature for fix temp/berendsen/cuda cannot be 0.0");
-
-  double delta = update->ntimestep - update->beginstep;
-  delta /= update->endstep - update->beginstep;
-  t_target = t_start + delta * (t_stop-t_start);
-
-  // rescale velocities by lamda
-
-  double lamda = sqrt(1.0 + update->dt/t_period*(t_target/t_current - 1.0));
-
-  double **v = atom->v;
-  int *mask = atom->mask;
-  int nlocal = atom->nlocal;
-
-  if (which == NOBIAS) {
-        Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
-
-    } else {
-      if(not temperature->cudable)
-      {
-              cuda->cu_x->download();cuda->cu_v->download();
-      for (int i = 0; i < nlocal; i++) {
-        if (mask[i] & groupbit) {
-          temperature->remove_bias(i,v[i]);
-           v[i][0] *= lamda;
-          v[i][1] *= lamda;
-          v[i][2] *= lamda;
-          temperature->restore_bias(i,v[i]);
-        }
-        }
-          cuda->cu_v->upload();
-      }
-      else
-          {
-              temperature->remove_bias_all();
-            Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
-            temperature->restore_bias_all();
-          }
-    }
-
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempBerendsenCuda::modify_param(int narg, char **arg)
-{
-  if (strcmp(arg[0],"temp") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
-    if (tflag) {
-      modify->delete_compute(id_temp);
-      tflag = 0;
-    }
-    delete [] id_temp;
-    int n = strlen(arg[1]) + 1;
-    id_temp = new char[n];
-    strcpy(id_temp,arg[1]);
-
-    int icompute = modify->find_compute(id_temp);
-    if (icompute < 0) error->all(FLERR,"Could not find fix_modify temperature ID");
-    temperature = modify->compute[icompute];
-
-    if (temperature->tempflag == 0)
-      error->all(FLERR,"Fix_modify temperature ID does not compute temperature");
-    if (temperature->igroup != igroup && comm->me == 0)
-      error->warning(FLERR,"Group for fix_modify temp != fix group");
-    return 2;
-  }
-  return 0;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempBerendsenCuda::reset_target(double t_new)
-{
-  t_start = t_stop = t_new;
-}
diff --git a/src/USER-CUDA/fix_temp_berendsen_cuda.h b/src/USER-CUDA/fix_temp_berendsen_cuda.h
deleted file mode 100644
index 610e5421e5..0000000000
--- a/src/USER-CUDA/fix_temp_berendsen_cuda.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-#ifdef FIX_CLASS
-
-FixStyle(temp/berendsen/cuda,FixTempBerendsenCuda)
-
-#else
-
-#ifndef LMP_FIX_TEMP_BERENDSEN_CUDA_H
-#define LMP_FIX_TEMP_BERENDSEN_CUDA_H
-
-#include "fix.h"
-
-namespace LAMMPS_NS {
-class FixTempBerendsenCuda : public Fix {
- public:
-  FixTempBerendsenCuda(class LAMMPS *, int, char **);
-  ~FixTempBerendsenCuda();
-  int setmask();
-  void init();
-  void end_of_step();
-  int modify_param(int, char **);
-  void reset_target(double);
-
- private:
-  class Cuda *cuda;
-  int which;
-  double t_start,t_stop,t_target,t_period;
-
-  char *id_temp;
-  class Compute *temperature;
-  int tflag;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
deleted file mode 100644
index a0ebb47d12..0000000000
--- a/src/USER-CUDA/fix_temp_rescale_cuda.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include "fix_temp_rescale_cuda.h"
-#include "fix_temp_rescale_cuda_cu.h"
-#include "atom.h"
-#include "force.h"
-#include "group.h"
-#include "update.h"
-#include "domain.h"
-#include "region.h"
-#include "comm.h"
-#include "modify.h"
-#include "compute.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-enum{NOBIAS,BIAS};
-
-/* ---------------------------------------------------------------------- */
-
-FixTempRescaleCuda::FixTempRescaleCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg < 8) error->all(FLERR,"Illegal fix temp/rescale/cuda command");
-
-  nevery = force->inumeric(FLERR,arg[3]);
-  if (nevery <= 0) error->all(FLERR,"Illegal fix temp/rescale/cuda command");
-
-  scalar_flag = 1;
-  global_freq = nevery;
-  extscalar = 1;
-
-  t_start = force->numeric(FLERR,arg[4]);
-  t_stop = force->numeric(FLERR,arg[5]);
-  t_window = force->numeric(FLERR,arg[6]);
-  fraction = force->numeric(FLERR,arg[7]);
-
-  // create a new compute temp
-  // id = fix-ID + temp, compute group = fix group
-
-  int n = strlen(id) + 6;
-  id_temp = new char[n];
-  strcpy(id_temp,id);
-  strcat(id_temp,"_temp");
-
-  char **newarg = new char*[6];
-  newarg[0] = id_temp;
-  newarg[1] = group->names[igroup];
-  newarg[2] = (char *) "temp/cuda";
-  modify->add_compute(3,newarg);
-  delete [] newarg;
-  tflag = 1;
-
-  energy = 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixTempRescaleCuda::~FixTempRescaleCuda()
-{
-  // delete temperature if fix created it
-
-  if (tflag) modify->delete_compute(id_temp);
-  delete [] id_temp;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempRescaleCuda::setmask()
-{
-  int mask = 0;
-  mask |= END_OF_STEP_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleCuda::init()
-{
-  int icompute = modify->find_compute(id_temp);
-  if (icompute < 0)
-    error->all(FLERR,"Temperature ID for fix temp/rescale/cuda does not exist");
-  temperature = modify->compute[icompute];
-  if(not temperature->cudable)
-        error->warning(FLERR,"Fix temp/rescale/cuda uses non cudable temperature compute");
-  if (temperature->tempbias) which = BIAS;
-  else which = NOBIAS;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleCuda::end_of_step()
-{
-  double t_current;
-  if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();}
-  t_current = temperature->compute_scalar();
-  if (t_current == 0.0)
-    error->all(FLERR,"Computed temperature for fix temp/rescale/cuda cannot be 0.0");
-
-  double delta = update->ntimestep - update->beginstep;
-  delta /= update->endstep - update->beginstep;
-  double t_target = t_start + delta * (t_stop-t_start);
-
-  // rescale velocity of appropriate atoms if outside window
-
-  if (fabs(t_current-t_target) > t_window) {
-    t_target = t_current - fraction*(t_current-t_target);
-    double factor = sqrt(t_target/t_current);
-    double efactor = 0.5 * force->boltz * temperature->dof;
-
-    double **v = atom->v;
-    int *mask = atom->mask;
-    int nlocal = atom->nlocal;
-
-    if (which == NOBIAS) {
-      energy += (t_current-t_target) * efactor;
-
-        Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
-
-    } else if (which == BIAS) {
-      energy += (t_current-t_target) * efactor;
-      if(not temperature->cudable)
-      {
-              cuda->cu_x->download();cuda->cu_v->download();
-      for (int i = 0; i < nlocal; i++) {
-        if (mask[i] & groupbit) {
-          temperature->remove_bias(i,v[i]);
-          v[i][0] *= factor;
-          v[i][1] *= factor;
-          v[i][2] *= factor;
-          temperature->restore_bias(i,v[i]);
-        }
-        }
-          cuda->cu_v->upload();
-      }
-      else
-      {
-            temperature->remove_bias_all();
-            Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
-            temperature->restore_bias_all();
-      }
-    }
-
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempRescaleCuda::modify_param(int narg, char **arg)
-{
-  if (strcmp(arg[0],"temp") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
-    if (tflag) {
-      modify->delete_compute(id_temp);
-      tflag = 0;
-    }
-    delete [] id_temp;
-    int n = strlen(arg[1]) + 1;
-    id_temp = new char[n];
-    strcpy(id_temp,arg[1]);
-
-    int icompute = modify->find_compute(id_temp);
-    if (icompute < 0) error->all(FLERR,"Could not find fix_modify temperature ID");
-    temperature = modify->compute[icompute];
-
-    if (temperature->tempflag == 0)
-      error->all(FLERR,"Fix_modify temperature ID does not compute temperature");
-    if (temperature->igroup != igroup && comm->me == 0)
-      error->warning(FLERR,"Group for fix_modify temp != fix group");
-    if(not temperature->cudable)
-          error->warning(FLERR,"Fix temp/rescale/cuda uses non cudable temperature compute");
-    return 2;
-  }
-  return 0;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleCuda::reset_target(double t_new)
-{
-  t_start = t_stop = t_new;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double FixTempRescaleCuda::compute_scalar()
-{
-  return energy;
-}
diff --git a/src/USER-CUDA/fix_temp_rescale_cuda.h b/src/USER-CUDA/fix_temp_rescale_cuda.h
deleted file mode 100644
index 3bdc71a1a0..0000000000
--- a/src/USER-CUDA/fix_temp_rescale_cuda.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(temp/rescale/cuda,FixTempRescaleCuda)
-
-#else
-
-#ifndef FIX_TEMP_RESCALE_CUDA_H
-#define FIX_TEMP_RESCALE_CUDA_H
-
-#include "fix.h"
-
-namespace LAMMPS_NS {
-class FixTempRescaleCuda : public Fix {
- public:
-  FixTempRescaleCuda(class LAMMPS *, int, char **);
-  ~FixTempRescaleCuda();
-  int setmask();
-  void init();
-  void end_of_step();
-  int modify_param(int, char **);
-  void reset_target(double);
-  double compute_scalar();
-
- private:
-  class Cuda *cuda;
-  int which;
-  double t_start,t_stop,t_window;
-  double fraction,energy,efactor;
-
-  char *id_temp;
-  class Compute *temperature;
-  int tflag;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
deleted file mode 100644
index eb8cf8d948..0000000000
--- a/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cstring>
-#include <cstdlib>
-#include <cmath>
-#include "fix_temp_rescale_limit_cuda.h"
-#include "fix_temp_rescale_limit_cuda_cu.h"
-#include "atom.h"
-#include "force.h"
-#include "group.h"
-#include "update.h"
-#include "domain.h"
-#include "region.h"
-#include "comm.h"
-#include "modify.h"
-#include "compute.h"
-#include "error.h"
-#include "user_cuda.h"
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-enum{NOBIAS,BIAS};
-
-/* ---------------------------------------------------------------------- */
-
-FixTempRescaleLimitCuda::FixTempRescaleLimitCuda(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if (narg < 9) error->all(FLERR,"Illegal fix temp/rescale/limit/cuda command");
-
-  nevery = force->inumeric(FLERR,arg[3]);
-  if (nevery <= 0) error->all(FLERR,"Illegal fix temp/rescale/limit/cuda command");
-
-  scalar_flag = 1;
-  global_freq = nevery;
-  extscalar = 1;
-
-  t_start = force->numeric(FLERR,arg[4]);
-  t_stop = force->numeric(FLERR,arg[5]);
-  t_window = force->numeric(FLERR,arg[6]);
-  fraction = force->numeric(FLERR,arg[7]);
-  limit = force->numeric(FLERR,arg[8]);
-  if (limit <= 1.0) error->all(FLERR,"Illegal fix temp/rescale/limit/cuda command (limit must be > 1.0)");
-
-
-  // create a new compute temp
-  // id = fix-ID + temp, compute group = fix group
-
-  int n = strlen(id) + 6;
-  id_temp = new char[n];
-  strcpy(id_temp,id);
-  strcat(id_temp,"_temp");
-
-  char **newarg = new char*[6];
-  newarg[0] = id_temp;
-  newarg[1] = group->names[igroup];
-  newarg[2] = (char *) "temp/cuda";
-  modify->add_compute(3,newarg);
-  delete [] newarg;
-  tflag = 1;
-
-  energy = 0.0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixTempRescaleLimitCuda::~FixTempRescaleLimitCuda()
-{
-  // delete temperature if fix created it
-
-  if (tflag) modify->delete_compute(id_temp);
-  delete [] id_temp;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempRescaleLimitCuda::setmask()
-{
-  int mask = 0;
-  mask |= END_OF_STEP_CUDA;
-  mask |= THERMO_ENERGY_CUDA;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleLimitCuda::init()
-{
-  int icompute = modify->find_compute(id_temp);
-  if (icompute < 0)
-    error->all(FLERR,"Temperature ID for fix temp/rescale/limit/cuda does not exist");
-  temperature = modify->compute[icompute];
-  if(not temperature->cudable)
-        error->warning(FLERR,"Fix temp/rescale/limit/cuda uses non cudable temperature compute");
-  if (temperature->tempbias) which = BIAS;
-  else which = NOBIAS;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleLimitCuda::end_of_step()
-{
-  double t_current;
-  if(not temperature->cudable) {cuda->cu_x->download();cuda->cu_v->download();}
-  t_current = temperature->compute_scalar();
-  if (t_current == 0.0)
-    error->all(FLERR,"Computed temperature for fix temp/rescale/limit/cuda cannot be 0.0");
-
-  double delta = update->ntimestep - update->beginstep;
-  delta /= update->endstep - update->beginstep;
-  double t_target = t_start + delta * (t_stop-t_start);
-
-  // rescale velocity of appropriate atoms if outside window
-
-  if (fabs(t_current-t_target) > t_window) {
-    t_target = t_current - fraction*(t_current-t_target);
-    double factor = sqrt(t_target/t_current);
-    double efactor = 0.5 * force->boltz * temperature->dof;
-
-    double **v = atom->v;
-    int *mask = atom->mask;
-    int nlocal = atom->nlocal;
-
-    double massone;
-    if(atom->rmass) massone = atom->rmass[0];
-    else massone = atom->mass[0];
-
-    double current_limit=sqrt(limit*force->boltz*t_target*temperature->dof/massone/force->mvv2e);
-    if (which == NOBIAS) {
-      energy += (t_current-t_target) * efactor;
-
-
-        Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
-
-    } else if (which == BIAS) {
-      energy += (t_current-t_target) * efactor;
-      if(not temperature->cudable)
-      {
-              cuda->cu_x->download();cuda->cu_v->download();
-      for (int i = 0; i < nlocal; i++) {
-        if (mask[i] & groupbit) {
-          temperature->remove_bias(i,v[i]);
-          double vx = v[i][0] * factor;
-          double vy = v[i][1] * factor;
-          double vz = v[i][2] * factor;
-          v[i][0]=vx>0?MIN(vx,current_limit):MAX(vx,-current_limit);
-          v[i][1]=vy>0?MIN(vy,current_limit):MAX(vy,-current_limit);
-          v[i][2]=vz>0?MIN(vz,current_limit):MAX(vz,-current_limit);
-
-          temperature->restore_bias(i,v[i]);
-        }
-        }
-          cuda->cu_v->upload();
-      }
-      else
-      {
-               temperature->remove_bias_all();
-            Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
-            temperature->restore_bias_all();
-      }
-    }
-
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixTempRescaleLimitCuda::modify_param(int narg, char **arg)
-{
-  if (strcmp(arg[0],"temp") == 0) {
-    if (narg < 2) error->all(FLERR,"Illegal fix_modify command");
-    if (tflag) {
-      modify->delete_compute(id_temp);
-      tflag = 0;
-    }
-    delete [] id_temp;
-    int n = strlen(arg[1]) + 1;
-    id_temp = new char[n];
-    strcpy(id_temp,arg[1]);
-
-    int icompute = modify->find_compute(id_temp);
-    if (icompute < 0) error->all(FLERR,"Could not find fix_modify temperature ID");
-    temperature = modify->compute[icompute];
-
-    if (temperature->tempflag == 0)
-      error->all(FLERR,"Fix_modify temperature ID does not compute temperature");
-    if (temperature->igroup != igroup && comm->me == 0)
-      error->warning(FLERR,"Group for fix_modify temp != fix group");
-    if(not temperature->cudable)
-          error->warning(FLERR,"Fix temp/rescale/limit/cuda uses non cudable temperature compute");
-    return 2;
-  }
-  return 0;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-void FixTempRescaleLimitCuda::reset_target(double t_new)
-{
-  t_start = t_stop = t_new;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double FixTempRescaleLimitCuda::compute_scalar()
-{
-  return energy;
-}
diff --git a/src/USER-CUDA/fix_temp_rescale_limit_cuda.h b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
deleted file mode 100644
index b2bba2049a..0000000000
--- a/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(temp/rescale/limit/cuda,FixTempRescaleLimitCuda)
-
-#else
-
-#ifndef FIX_TEMP_RESCALE_LIMIT_CUDA_H
-#define FIX_TEMP_RESCALE_LIMIT_CUDA_H
-
-#include "fix.h"
-
-namespace LAMMPS_NS {
-class FixTempRescaleLimitCuda : public Fix {
- public:
-  FixTempRescaleLimitCuda(class LAMMPS *, int, char **);
-  ~FixTempRescaleLimitCuda();
-  int setmask();
-  void init();
-  void end_of_step();
-  int modify_param(int, char **);
-  void reset_target(double);
-  double compute_scalar();
-
- private:
-  class Cuda *cuda;
-  int which;
-  double t_start,t_stop,t_window;
-  double fraction,energy,efactor;
-  double limit;
-  char *id_temp;
-  class Compute *temperature;
-  int tflag;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/fix_viscous_cuda.cpp b/src/USER-CUDA/fix_viscous_cuda.cpp
deleted file mode 100644
index 09871c86e3..0000000000
--- a/src/USER-CUDA/fix_viscous_cuda.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdlib>
-#include <cstring>
-#include "fix_viscous_cuda.h"
-#include "fix_viscous_cuda_cu.h"
-#include "atom.h"
-#include "update.h"
-#include "respa.h"
-#include "error.h"
-#include "cuda_modify_flags.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-/* ---------------------------------------------------------------------- */
-
-FixViscousCuda::FixViscousCuda(LAMMPS *lmp, int narg, char **arg) :
-  FixViscous(lmp, narg, arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        cu_gamma=NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixViscousCuda::~FixViscousCuda()
-{
-        delete cu_gamma;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixViscousCuda::setmask()
-{
-  int mask = 0;
-  mask |= POST_FORCE_CUDA;
- // mask |= POST_FORCE_RESPA;
- // mask |= MIN_POST_FORCE;
-  return mask;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-void FixViscousCuda::setup(int vflag)
-{
-   if(not cu_gamma)
-   cu_gamma = new cCudaData<double, F_CFLOAT, x> (gamma,atom->ntypes+1);
-   Cuda_FixViscousCuda_Init(&cuda->shared_data);
-   cu_gamma->upload();
- // if (strcmp(update->integrate_style,"verlet/cuda") == 0)
-    post_force(vflag);
- /* else {
-    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
-    post_force_respa(vflag,nlevels_respa-1,0);
-    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
-  }*/
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixViscousCuda::min_setup(int vflag)
-{
-  Cuda_FixViscousCuda_Init(&cuda->shared_data);
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixViscousCuda::post_force(int vflag)
-{
-  // apply drag force to atoms in group
-  // direction is opposed to velocity vector
-  // magnitude depends on atom type
-
-  Cuda_FixViscousCuda_PostForce(&cuda->shared_data, groupbit,cu_gamma->dev_data());
-}
diff --git a/src/USER-CUDA/fix_viscous_cuda.h b/src/USER-CUDA/fix_viscous_cuda.h
deleted file mode 100644
index e0cb6ba4b0..0000000000
--- a/src/USER-CUDA/fix_viscous_cuda.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef FIX_CLASS
-
-FixStyle(viscous/cuda,FixViscousCuda)
-
-#else
-
-#ifndef LMP_FIX_VISCOUS_CUDA_H
-#define LMP_FIX_VISCOUS_CUDA_H
-
-#include "fix_viscous.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class FixViscousCuda : public FixViscous {
- public:
-  FixViscousCuda(class LAMMPS *, int, char **);
-  ~FixViscousCuda();
-  int setmask();
-  void setup(int);
-  void min_setup(int);
-  void post_force(int);
-  cCudaData<double, F_CFLOAT, x>* cu_gamma;
-
-  private:
-  class Cuda *cuda;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/modify_cuda.cpp b/src/USER-CUDA/modify_cuda.cpp
deleted file mode 100644
index 82d6d92036..0000000000
--- a/src/USER-CUDA/modify_cuda.cpp
+++ /dev/null
@@ -1,437 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <cstdio>
-#include <cstring>
-#include "modify_cuda.h"
-#include "style_compute.h"
-#include "style_fix.h"
-#include "atom.h"
-#include "comm.h"
-#include "fix.h"
-#include "compute.h"
-#include "group.h"
-#include "update.h"
-#include "domain.h"
-#include "user_cuda.h"
-#include "memory.h"
-#include "error.h"
-
-#include "cuda_modify_flags.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-using namespace FixConstCuda;
-
-#define DELTA 4
-
-#define BIG 1.0e20
-
-
-/* ---------------------------------------------------------------------- */
-
-ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  n_initial_integrate_cuda = 0;
-  n_post_integrate_cuda = 0;
-  n_pre_exchange = 0;
-  n_pre_neighbor_cuda = 0;
-  n_pre_force_cuda = 0;
-  n_post_force_cuda = 0;
-  n_final_integrate_cuda = 0;
-  n_end_of_step_cuda = 0;
-  n_thermo_energy_cuda = 0;
-
-  n_initial_integrate_host = 0;
-  n_post_integrate_host = 0;
-  n_pre_exchange = 0;
-  n_pre_neighbor_host = 0;
-  n_pre_force_host = 0;
-  n_post_force_host = 0;
-  n_final_integrate_host = 0;
-  n_end_of_step_host = 0;
-  n_thermo_energy_host = 0;
-
-  list_initial_integrate_cuda = NULL;
-  list_post_integrate_cuda = NULL;
-  list_pre_exchange_cuda = NULL;
-  list_pre_neighbor_cuda = NULL;
-  list_pre_force_cuda = NULL;
-  list_post_force_cuda = NULL;
-  list_final_integrate_cuda = NULL;
-  list_end_of_step_cuda = NULL;
-  list_thermo_energy_cuda = NULL;
-  end_of_step_every_cuda = NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-ModifyCuda::~ModifyCuda()
-{
-  delete [] list_initial_integrate_cuda;
-  delete [] list_post_integrate_cuda;
-  delete [] list_pre_exchange_cuda;
-  delete [] list_pre_neighbor_cuda;
-  delete [] list_pre_force_cuda;
-  delete [] list_post_force_cuda;
-  delete [] list_final_integrate_cuda;
-  delete [] list_end_of_step_cuda;
-  delete [] list_thermo_energy_cuda;
-  delete [] end_of_step_every_cuda;
-}
-
-/* ----------------------------------------------------------------------
-   initialize all fixes and computes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::init()
-{
-  int i,j;
-
-  // delete storage of restart info since it is not valid after 1st run
-
-  restart_deallocate();
-
-  // create lists of fixes to call at each stage of run
-
-  list_init(INITIAL_INTEGRATE,n_initial_integrate,list_initial_integrate);
-  list_init(POST_INTEGRATE,n_post_integrate,list_post_integrate);
-  list_init(PRE_EXCHANGE,n_pre_exchange,list_pre_exchange);
-  list_init(PRE_NEIGHBOR,n_pre_neighbor,list_pre_neighbor);
-  list_init(PRE_FORCE,n_pre_force,list_pre_force);
-  list_init(POST_FORCE,n_post_force,list_post_force);
-  list_init(FINAL_INTEGRATE,n_final_integrate,list_final_integrate);
-  list_init_end_of_step(END_OF_STEP,n_end_of_step,list_end_of_step);
-  list_init_thermo_energy(THERMO_ENERGY,n_thermo_energy,list_thermo_energy);
-
-  list_init(INITIAL_INTEGRATE_CUDA, n_initial_integrate_cuda, list_initial_integrate_cuda);
-  list_init(POST_INTEGRATE_CUDA, n_post_integrate_cuda, list_post_integrate_cuda);
-  list_init(PRE_EXCHANGE_CUDA, n_pre_exchange_cuda, list_pre_exchange_cuda);
-  list_init(PRE_NEIGHBOR_CUDA, n_pre_neighbor_cuda, list_pre_neighbor_cuda);
-  list_init(PRE_FORCE_CUDA, n_pre_force_cuda, list_pre_force_cuda);
-  list_init(POST_FORCE_CUDA, n_post_force_cuda, list_post_force_cuda);
-  list_init(FINAL_INTEGRATE_CUDA, n_final_integrate_cuda, list_final_integrate_cuda);
-  list_init_end_of_step_cuda(END_OF_STEP_CUDA, n_end_of_step_cuda, list_end_of_step_cuda);
-  list_init_thermo_energy(THERMO_ENERGY_CUDA, n_thermo_energy_cuda, list_thermo_energy_cuda);
-
-  n_initial_integrate_host = n_initial_integrate;
-  n_post_integrate_host = n_post_integrate;
-  n_pre_exchange_host = n_pre_exchange;
-  n_pre_neighbor_host = n_pre_neighbor;
-  n_pre_force_host = n_pre_force;
-  n_post_force_host = n_post_force;
-  n_final_integrate_host = n_final_integrate;
-  n_end_of_step_host = n_end_of_step;
-  n_thermo_energy_host = n_thermo_energy;
-
-  n_initial_integrate = n_initial_integrate_cuda+n_initial_integrate_host;
-  n_post_integrate = n_post_integrate_cuda+n_post_integrate_host;
-  n_pre_exchange = n_pre_exchange_cuda+n_pre_exchange_host;
-  n_pre_neighbor = n_pre_neighbor_cuda+n_pre_neighbor_host;
-  n_pre_force = n_pre_force_cuda+n_pre_force_host;
-  n_post_force = n_post_force_cuda+n_post_force_host;
-  n_final_integrate = n_final_integrate_cuda+n_final_integrate_host;
-  n_end_of_step = n_end_of_step_cuda+n_end_of_step_host;
-  n_thermo_energy = n_thermo_energy_cuda+n_thermo_energy_host;
-
-  list_init(INITIAL_INTEGRATE_RESPA,
-            n_initial_integrate_respa,list_initial_integrate_respa);
-  list_init(POST_INTEGRATE_RESPA,
-            n_post_integrate_respa,list_post_integrate_respa);
-  list_init(POST_FORCE_RESPA,
-            n_post_force_respa,list_post_force_respa);
-  list_init(PRE_FORCE_RESPA,
-            n_pre_force_respa,list_pre_force_respa);
-  list_init(FINAL_INTEGRATE_RESPA,
-            n_final_integrate_respa,list_final_integrate_respa);
-
-  list_init(MIN_PRE_EXCHANGE,n_min_pre_exchange,list_min_pre_exchange);
-  list_init(MIN_POST_FORCE,n_min_post_force,list_min_post_force);
-  list_init(MIN_ENERGY,n_min_energy,list_min_energy);
-
-  // init each fix
-  // needs to come before compute init
-  // this is b/c some computes call fix->dof()
-  // FixRigid::dof() depends on its own init having been called
-
-  for (i = 0; i < nfix; i++) fix[i]->init();
-
-  // set global flag if any fix has its restart_pbc flag set
-
-  restart_pbc_any = 0;
-  for (i = 0; i < nfix; i++)
-    if (fix[i]->restart_pbc) restart_pbc_any = 1;
-
-  // create list of computes that store invocation times
-
-  list_init_compute();
-
-  // init each compute
-  // set invoked_scalar,vector,etc to -1 to force new run to re-compute them
-  // add initial timestep to all computes that store invocation times
-  //   since any of them may be invoked by initial thermo
-  // do not clear out invocation times stored within a compute,
-  //   b/c some may be holdovers from previous run, like for ave fixes
-
-  for (i = 0; i < ncompute; i++) {
-    compute[i]->init();
-    compute[i]->invoked_scalar = -1;
-    compute[i]->invoked_vector = -1;
-    compute[i]->invoked_array = -1;
-    compute[i]->invoked_peratom = -1;
-    compute[i]->invoked_local = -1;
-  }
-  addstep_compute_all(update->ntimestep);
-
-  // warn if any particle is time integrated more than once
-
-  int nlocal = atom->nlocal;
-  int *mask = atom->mask;
-
-  int *flag = new int[nlocal];
-  for (i = 0; i < nlocal; i++) flag[i] = 0;
-
-  int groupbit;
-  for (i = 0; i < nfix; i++) {
-    if (fix[i]->time_integrate == 0) continue;
-    groupbit = fix[i]->groupbit;
-    for (j = 0; j < nlocal; j++)
-      if (mask[j] & groupbit) flag[j]++;
-  }
-
-  int check = 0;
-  for (i = 0; i < nlocal; i++)
-    if (flag[i] > 1) check = 1;
-
-  delete [] flag;
-
-  int checkall;
-  MPI_Allreduce(&check,&checkall,1,MPI_INT,MPI_SUM,world);
-  if (comm->me == 0 && checkall)
-    error->warning(FLERR,"One or more atoms are time integrated more than once");
-}
-
-/* ----------------------------------------------------------------------
-   1st half of integrate call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::initial_integrate(int vflag)
-{
-        for(int i = 0; i < n_initial_integrate_cuda; i++)
-                fix[list_initial_integrate_cuda[i]]->initial_integrate(vflag);
-
-        if(n_initial_integrate_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_initial_integrate_host; i++)
-                        fix[list_initial_integrate[i]]->initial_integrate(vflag);
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   post_integrate call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::post_integrate()
-{
-        for(int i = 0; i < n_post_integrate_cuda; i++)
-                fix[list_post_integrate_cuda[i]]->post_integrate();
-
-        if(n_post_integrate_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_post_integrate_host; i++)
-                        fix[list_post_integrate[i]]->post_integrate();
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   pre_exchange call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::pre_exchange()
-{
-        for(int i = 0; i < n_pre_exchange_cuda; i++)
-                fix[list_pre_exchange_cuda[i]]->pre_exchange();
-
-        if(n_pre_exchange_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_pre_exchange_host; i++)
-                        fix[list_pre_exchange[i]]->pre_exchange();
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   pre_neighbor call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::pre_neighbor()
-{
-        for(int i = 0; i < n_pre_neighbor_cuda; i++)
-                fix[list_pre_neighbor_cuda[i]]->pre_neighbor();
-
-        if(n_pre_neighbor_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_pre_neighbor_host; i++)
-                        fix[list_pre_neighbor[i]]->pre_neighbor();
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   pre_force call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::setup_pre_force(int vflag)
-{
-        for(int i = 0; i < n_pre_force_cuda; i++)
-                fix[list_pre_force_cuda[i]]->pre_force(vflag);
-
-        if(n_pre_force_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_pre_force_host; i++)
-                        fix[list_pre_force[i]]->pre_force(vflag);
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-void ModifyCuda::pre_force(int vflag)
-{
-        for(int i = 0; i < n_pre_force_cuda; i++)
-                fix[list_pre_force_cuda[i]]->pre_force(vflag);
-
-        if(n_pre_force_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_pre_force_host; i++)
-                        fix[list_pre_force[i]]->pre_force(vflag);
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   post_force call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::post_force(int vflag)
-{
-        for(int i = 0; i < n_post_force_cuda; i++)
-                        fix[list_post_force_cuda[i]]->post_force(vflag);
-
-        if(n_post_force_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_post_force_host; i++)
-                        fix[list_post_force[i]]->post_force(vflag);
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   2nd half of integrate call, only for relevant fixes
-------------------------------------------------------------------------- */
-
-void ModifyCuda::final_integrate()
-{
-        for (int i = 0; i < n_final_integrate_cuda; i++)
-                fix[list_final_integrate_cuda[i]]->final_integrate();
-
-        if(n_final_integrate_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_final_integrate_host; i++)
-                        fix[list_final_integrate[i]]->final_integrate();
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-}
-
-/* ----------------------------------------------------------------------
-   end-of-timestep call, only for relevant fixes
-   only call fix->end_of_step() on timesteps that are multiples of nevery
-------------------------------------------------------------------------- */
-
-void ModifyCuda::end_of_step()
-{
-        for (int i = 0; i < n_end_of_step_cuda; i++)
-                if (update->ntimestep % end_of_step_every_cuda[i] == 0)
-                        fix[list_end_of_step_cuda[i]]->end_of_step();
-
-        if(n_end_of_step_host != 0)
-        {
-                int do_thisstep=0;
-                for (int i = 0; i < n_end_of_step_host; i++)
-                        if (update->ntimestep % end_of_step_every[i] == 0) do_thisstep=1;
-                if(do_thisstep)
-                {
-                  cuda->downloadAll(); cuda->oncpu = true;
-                  for (int i = 0; i < n_end_of_step_host; i++)
-                         if (update->ntimestep % end_of_step_every[i] == 0)
-                                fix[list_end_of_step[i]]->end_of_step();
-                  cuda->uploadAll(); cuda->oncpu = false;
-                }
-        }
-}
-
-/* ----------------------------------------------------------------------
-   thermo energy call, only for relevant fixes
-   called by Thermo class
-   compute_scalar() is fix call to return energy
-------------------------------------------------------------------------- */
-
-double ModifyCuda::thermo_energy()
-{
-        double energy = 0.0;
-
-        for (int i = 0; i < n_thermo_energy_cuda; i++)
-                energy += fix[list_thermo_energy_cuda[i]]->compute_scalar();
-
-        if(n_thermo_energy_host != 0)
-        {
-                cuda->downloadAll(); cuda->oncpu = true;
-                for (int i = 0; i < n_thermo_energy_host; i++)
-                        energy += fix[list_thermo_energy[i]]->compute_scalar();
-                cuda->uploadAll(); cuda->oncpu = false;
-        }
-
-        return energy;
-}
-
-
-
-void ModifyCuda::list_init_end_of_step_cuda(int mask, int &n, int *&list)
-{
-  delete [] list;
-  delete [] end_of_step_every_cuda;
-
-  n = 0;
-  for (int i = 0; i < nfix; i++) if (fmask[i] & mask) n++;
-  list = new int[n];
-  end_of_step_every_cuda = new int[n];
-
-  n = 0;
-  for (int i = 0; i < nfix; i++)
-    if (fmask[i] & mask) {
-      list[n] = i;
-      end_of_step_every_cuda[n++] = fix[i]->nevery;
-    }
-}
diff --git a/src/USER-CUDA/modify_cuda.h b/src/USER-CUDA/modify_cuda.h
deleted file mode 100644
index bfea217046..0000000000
--- a/src/USER-CUDA/modify_cuda.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_MODIFY_CUDA_H
-#define LMP_MODIFY_CUDA_H
-
-#include <cstdio>
-#include "modify.h"
-
-namespace LAMMPS_NS {
-
-class ModifyCuda : public Modify {
- public:
-
-  int n_initial_integrate_cuda;
-  int n_post_integrate_cuda;
-  int n_pre_exchange_cuda;
-  int n_pre_neighbor_cuda;
-  int n_pre_force_cuda;
-  int n_post_force_cuda;
-  int n_final_integrate_cuda;
-  int n_end_of_step_cuda;
-  int n_thermo_energy_cuda;
-
-  int n_initial_integrate_host;
-  int n_post_integrate_host;
-  int n_pre_exchange_host;
-  int n_pre_neighbor_host;
-  int n_pre_force_host;
-  int n_post_force_host;
-  int n_final_integrate_host;
-  int n_end_of_step_host;
-  int n_thermo_energy_host;
-
-  ModifyCuda(class LAMMPS *);
-  ~ModifyCuda();
-  void init();
-  void initial_integrate(int);
-  void post_integrate();
-  //void pre_decide();
-  void pre_exchange();
-  void pre_neighbor();
-  void setup_pre_force(int);
-  void pre_force(int);
-  void post_force(int);
-  void final_integrate();
-  void end_of_step();
-  double thermo_energy();
-
-
- protected:
-  class Cuda *cuda;
-
-  // lists of fixes to apply at different stages of timestep
-
-  // list of cuda fixes
-  int *list_initial_integrate_cuda;
-  int *list_post_integrate_cuda;
-  int *list_pre_exchange_cuda;
-  int *list_pre_neighbor_cuda;
-  int *list_pre_force_cuda;
-  int *list_post_force_cuda;
-  int *list_final_integrate_cuda;
-  int *list_end_of_step_cuda;
-  int *list_thermo_energy_cuda;
-  int *end_of_step_every_cuda;
-
-  void list_init_end_of_step_cuda(int, int &, int *&);
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/neigh_full_cuda.cpp b/src/USER-CUDA/neigh_full_cuda.cpp
deleted file mode 100644
index 5fd69f1105..0000000000
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include "neighbor_cuda.h"
-#include "neigh_list.h"
-#include "atom.h"
-#include "domain.h"
-#include "group.h"
-#include "error.h"
-#include "cuda_neigh_list.h"
-#include "user_cuda.h"
-#include "neighbor_cu.h"
-#include <cmath>
-using namespace LAMMPS_NS;
-
-/* ----------------------------------------------------------------------
-   N^2 search for all neighbors
-   every neighbor pair appears in list of both atoms i and j
-------------------------------------------------------------------------- */
-void NeighborCuda::full_bin_cuda(NeighList *list)
-{
-  MYDBG(printf(" # CUDA::NeighFullBinCuda ... start\n");)
-  if(includegroup) error->warning(FLERR,"Warning using inlcudegroup neighborbuild. This is not yet supported by CUDA neighborbuild styles.\n");
-  int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
-
-  if(nlocal==0) return;
-  CudaNeighList* clist=list->cuda_list;
-  cuda_shared_neighlist* slist=&clist->sneighlist;
-
-  if(not clist) cuda->registerNeighborList(list);
-
-  clist->build_cuda=true;
-
-  if(slist->bin_extraspace<0.09)
-  {
-    for(int i=1;i<=atom->ntypes;i++)
-    for(int j=1;j<=atom->ntypes;j++)
-    {
-            if(slist->maxcut<cutneighsq[i][j]) slist->maxcut=cutneighsq[i][j];
-    }
-    slist->maxcut=sqrt(slist->maxcut);
-  }
-  int bin_dim_tmp[3];
-  int bin_nmax_tmp;
-  do
-  {
-    do
-    {
-      bin_dim_tmp[0]=static_cast <int> ((domain->subhi[0]-domain->sublo[0])/slist->maxcut);
-      bin_dim_tmp[1]=static_cast <int> ((domain->subhi[1]-domain->sublo[1])/slist->maxcut);
-      bin_dim_tmp[2]=static_cast <int> ((domain->subhi[2]-domain->sublo[2])/slist->maxcut);
-      if(bin_dim_tmp[0]==0) bin_dim_tmp[0]+=1;
-      if(bin_dim_tmp[1]==0) bin_dim_tmp[1]+=1;
-      if(bin_dim_tmp[2]==0) bin_dim_tmp[2]+=1;
-      bin_nmax_tmp=static_cast <int> ((1.0+slist->bin_extraspace)*nlocal/(bin_dim_tmp[0]*bin_dim_tmp[1]*bin_dim_tmp[2]));
-      bin_dim_tmp[0]+=4;
-      bin_dim_tmp[1]+=4;
-      bin_dim_tmp[2]+=4;
-           if(bin_nmax_tmp<32) slist->maxcut*=1.2;
-          // printf("slist->maxcut: %lf\n", slist->maxcut);
-    } while(bin_nmax_tmp<32);
-    if((slist->bin_dim[0]!=bin_dim_tmp[0])||(slist->bin_dim[1]!=bin_dim_tmp[1])||(slist->bin_dim[2]!=bin_dim_tmp[2])||(slist->bin_nmax!=bin_nmax_tmp))
-    {
-            if(slist->binned_id!=NULL)
-            CudaWrapper_FreeCudaData(slist->binned_id,slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
-            slist->bin_dim[0] = bin_dim_tmp[0];
-            slist->bin_dim[1] = bin_dim_tmp[1];
-            slist->bin_dim[2] = bin_dim_tmp[2];
-            slist->bin_nmax = bin_nmax_tmp;
-            slist->binned_id=(int*) CudaWrapper_AllocCudaData(slist->bin_dim[0]*slist->bin_dim[1]*slist->bin_dim[2]*slist->bin_nmax*sizeof(int));
-           //printf("slist->bin: %i %i %i %i \n", bin_dim_tmp[0],bin_dim_tmp[1],bin_dim_tmp[2],bin_nmax_tmp);
-    }
-    //if(list->cuda_list->sneighlist.bin_nmax>512) error->all(FLERR,"To many atoms per bin. Likely cause is very long pair cutoff. This needs major rewrite of code and is not yet scheduled to be done.\n");
-  }while(Cuda_BinAtoms(&cuda->shared_data, &list->cuda_list->sneighlist));
-
- // cuda->cu_debugdata->memset_device(0);
-  int maxneighbors=slist->maxneighbors;
-
-  if((nex_type!=slist->nex_type)||
-  (nex_group!=slist->nex_group)||
-  (nex_mol!=slist->nex_mol))
-  {
-          slist->nex_type=nex_type;
-          slist->nex_group=nex_group;
-          slist->nex_mol=nex_mol;
-          //printf("%i %i %i\n",nex_type,nex_group,nex_mol);
-          if(nex_type)
-          {
-          delete clist->cu_ex_type;
-          clist->cu_ex_type=new cCudaData<int , int , x> (&ex_type[0][0]   , & slist->ex_type     , (atom->ntypes+1)*(atom->ntypes+1) );
-          clist->cu_ex_type->upload();
-          }
-         //printf("AA %i %i %i\n",nex_type,nex_group,nex_mol);
-          if(nex_group)
-          {
-           delete clist->cu_ex1_bit;
-          clist->cu_ex1_bit=new cCudaData<int , int , x> (ex1_bit   , & slist->ex1_bit     , nex_group );
-          clist->cu_ex1_bit->upload();
-          //printf("A %i %i %i\n",nex_type,nex_group,nex_mol);
-          delete clist->cu_ex2_bit;
-          clist->cu_ex2_bit=new cCudaData<int , int , x> (ex2_bit   , & slist->ex2_bit     , nex_group );
-          clist->cu_ex2_bit->upload();
-          }
-          //printf("B %i %i %i\n",nex_type,nex_group,nex_mol);
-          if(nex_mol)
-          {
-          delete clist->cu_ex_mol_bit;
-          clist->cu_ex_mol_bit=new cCudaData<int , int , x> (ex_mol_bit   , & slist->ex_mol_bit     , nex_mol );
-          clist->cu_ex_mol_bit->upload();
-          }
-          //printf("C %i %i %i\n",nex_type,nex_group,nex_mol);
-  }
-  int overflow = 0;
-  do
-  {
-    overflow=0;
-    clist->grow_device();
-    slist->cutneighsq=cutneighsq;
-    slist->maxneighbors=maxneighbors;
-    slist->inum = list->inum = nlocal;
-    //list->cuda_list->grow_device();
-    if(cuda->shared_data.overlap_comm)
-    {
-          list->cuda_list->inum_border=0;
-          list->cuda_list->cu_inum_border->upload();
-    }
-
-    cuda->shared_data.atom.nall=nall;
-    //Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
-    overflow= Cuda_NeighborBuildFullBin(&cuda->shared_data, &list->cuda_list->sneighlist);
-
-        /*cuda->cu_debugdata->download();
-        printf("Debugdata: %i ",cuda->debugdata[0]);
-        for(int i=0;i<cuda->debugdata[0];i+=3) printf("// %i %i %i",cuda->debugdata[i+1],cuda->debugdata[i+2],cuda->debugdata[i+3]);
-        printf("\n");*/
-        //printf("maxneighborsA: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
-
-    if(overflow<0)
-    {
-            maxneighbors+=32;
-            if(-overflow>maxneighbors) maxneighbors=((-overflow+37)/32)*32;
-            delete list->cuda_list->cu_neighbors;
-            delete [] list->cuda_list->neighbors;
-            list->cuda_list->neighbors= new int[slist->maxlocal*maxneighbors];
-            list->cuda_list->sneighlist.maxneighbors=maxneighbors;
-        //printf("maxneighborsA1: %i %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax,slist->maxlocal);
-            list->cuda_list->cu_neighbors= new cCudaData<int, int, x> (list->cuda_list->neighbors                          , & list->cuda_list->sneighlist.neighbors, slist->maxlocal*maxneighbors );
-        //printf("maxneighborsA2: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
-
-            if(cuda->shared_data.overlap_comm)
-            {
-              list->cuda_list->sneighlist.maxneighbors=maxneighbors;
-              list->cuda_list->dev_free();
-              list->cuda_list->dev_alloc();
-            }
-        //printf("maxneighborsA3: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
-    }
-        //printf("maxneighborsB: %i %i %i %i\n",maxneighbors,pgsize,oneatom,atom->nmax);
-    if(cuda->shared_data.overlap_comm)
-    {
-                  list->cuda_list->cu_inum_border->download();
-                  list->cuda_list->sneighlist.inum_border2=list->cuda_list->inum_border;
-    }
-  }
-  while(overflow<0);
-
-  //cuda->cu_debugdata->download();
- // printf("Differences in: %i\n",cuda->debugdata[0]);
- // for(int i=0;i<20;i++) printf("%i %i %i %i// ",cuda->debugdata[4*i+1],cuda->debugdata[4*i+2],cuda->debugdata[4*i+3],cuda->debugdata[4*i+4]);
-//  printf("\n");
-/*for(int i=0;i<10;i++)
-{
-        printf("%i %i // ",i,numneigh[i]);
-        for(int j=0;j<numneigh[i];j++)
-         printf("%i ",list->cuda_list->neighbors[i+j*nlocal]);
-        printf("\n");
-}*/
-/*  int count=0;
-  if(cuda->shared_data.overlap_comm)
-  {
-  list->cuda_list->cu_inum_border->download();
-  list->cuda_list->cu_ilist_border->download();
-  list->cuda_list->cu_numneigh_border->download();
-  list->cuda_list->cu_numneigh_inner->download();
-  list->cuda_list->cu_neighbors->download();
-  list->cuda_list->cu_neighbors_inner->download();
-  list->cuda_list->cu_neighbors_border->download();
-
-  //list->cuda_list->cu_firstneigh->download();
- // list->cuda_list->nl_download();
-  list->cuda_list->cu_numneigh->download();
-  int diff=0;
-  //for(int i=0;i<nlocal;i++)*/
- /* int i=123;
-  {
-          int k=-1;
-          //printf("inum_border: %i\n",list->cuda_list->inum_border);
-          //for(int j=0;j<list->numneigh[i];j++) printf("%i ",list->firstneigh[i][j]);printf("\n");
-          for(int j=0;j<list->cuda_list->inum_border;j++)
-          if(list->cuda_list->ilist_border[j]==i) k=j;
-          int d=numneigh[i]-list->cuda_list->numneigh_inner[i];
-          if(k>-1) d-=list->cuda_list->numneigh_border[k];
-          if(d!=0) {printf("Error at %i %i %i %i %i\n",i,k,d,numneigh[i],list->cuda_list->numneigh_inner[i]); diff++;}
-          if(k>-1 && count<10)
-          {
-                  printf("Numneighs: %i %i %i  Border_i: %i %i\n",numneigh[i],list->cuda_list->numneigh_inner[i],list->cuda_list->numneigh_border[k],k,(int)list->cuda_list->cu_ilist_border->dev_data());
-        cuda->shared_data.me=k;
-        for(int j=0;j<numneigh[i];j++)
-         printf("%i ",list->cuda_list->neighbors[i+j*nlocal]);
-           printf("\n");
-        for(int j=0;j<list->cuda_list->numneigh_inner[i];j++)
-         printf("%i ",list->cuda_list->neighbors_inner[i+j*nlocal]);
-         printf(" // ");
-        for(int j=0;j<list->cuda_list->numneigh_border[k];j++)
-         printf("%i ",list->cuda_list->neighbors_border[k+j*nlocal]);
-           printf("\n");
-           count++;
-          }
-  }
-  printf("%i\n",diff);
-  }*/
-  list->cuda_list->cu_numneigh->download();
-  list->cuda_list->cu_ilist->download();
-  cuda->shared_data.atom.update_neigh=2;
-        //printf("Done\n");
-
-  MYDBG(printf(" # CUDA::NeighFullBinCuda ... end\n");)
-
-}
-
-
-void NeighborCuda::full_nsq_cuda(NeighList *list)
-{
-        printf("Full_Nsq cuda neighbor list build is not implemented anymore.\n");
-return;
-/*
-  MYDBG(printf(" # CUDA::NeighFullNSQCuda ... start\n");)
-  int nlocal = atom->nlocal;
-  int nall = nlocal + atom->nghost;
-
-  if(cuda->cu_xhold) cuda->cu_xhold->upload();
-
-
-  if(not list->cuda_list) cuda->registerNeighborList(list);
-  list->cuda_list->build_cuda=true;
-  int maxneighbors=list->cuda_list->sneighlist.maxneighbors;
-  int neigh_lists_per_page=pgsize/maxneighbors;
-  int *ilist = list->ilist;
-  int *numneigh = list->numneigh;
-  int **firstneigh = list->firstneigh;
-  int **pages = list->pages;
-
-  int overflow = 0;
-  int inum = 0;
-  int npage = 0;
-  int npnt = 0;
-  do
-  {
-          npage=0;
-          npnt=0;
-          inum=0;
-    overflow=0;
-          neigh_lists_per_page=pgsize/maxneighbors;
-    npage=(2*nlocal*maxneighbors-1)/pgsize;
-    while(npage>list->maxpage) list->add_pages();
-    pages = list->pages;
-    npage=0;
-          list->cuda_list->sneighlist.neigh_lists_per_page=pgsize/maxneighbors;
-    list->cuda_list->grow_device();
-    list->cuda_list->sneighlist.cutneighsq=cutneighsq;
-    list->cuda_list->sneighlist.maxneighbors=maxneighbors;
-    list->cuda_list->sneighlist.inum = list->inum = nlocal;
-
-    cuda->shared_data.atom.nall=nall;
-    Cuda_NeighborReBuildFirstneigh(&cuda->shared_data, &list->cuda_list->sneighlist);
-    overflow= not Cuda_NeighborBuildFullNsq(&cuda->shared_data, &list->cuda_list->sneighlist);
-
-
-
-     if(overflow) maxneighbors+=32;
-  }
-  while(overflow);
-   if(not cudable) list->cuda_list->nl_download();
-  MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
-  */
-}
diff --git a/src/USER-CUDA/neighbor_cuda.cpp b/src/USER-CUDA/neighbor_cuda.cpp
deleted file mode 100644
index 015e85ff07..0000000000
--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ /dev/null
@@ -1,240 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include <math.h>
-#include "neighbor_cuda.h"
-#include "user_cuda.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "domain.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "force.h"
-#include "group.h"
-#include "memory.h"
-#include "error.h"
-#include "update.h"
-
-using namespace LAMMPS_NS;
-
-
-
-
-enum {NSQ, BIN, MULTI};  // also in neigh_list.cpp
-
-/* ---------------------------------------------------------------------- */
-
-NeighborCuda::NeighborCuda(LAMMPS* lmp) : Neighbor(lmp)
-{
-  cuda = lmp->cuda;
-
-  if(cuda == NULL)
-    error->all(FLERR, "You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-}
-
-/* ---------------------------------------------------------------------- */
-
-void NeighborCuda::init()
-{
-  cuda->set_neighinit(dist_check, 0.25 * skin * skin);
-  cudable = 1;
-
-  Neighbor::init();
-}
-
-/* ----------------------------------------------------------------------
-   overwrite either full_nsq or full_bin with CUDA-equivalent methods
-   any other neighbor build method is unchanged
-------------------------------------------------------------------------- */
-
-void NeighborCuda::choose_build(int index, NeighRequest* rq)
-{
-  Neighbor::choose_build(index, rq);
-
-  if(rq->full && style == NSQ && rq->cudable)
-    pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_nsq_cuda;
-  else if(rq->full && style == BIN && rq->cudable)
-    pair_build[index] = (Neighbor::PairPtr) &NeighborCuda::full_bin_cuda;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int NeighborCuda::check_distance()
-{
-  double delx, dely, delz, rsq;
-  double delta, deltasq, delta1, delta2;
-
-  if(boxcheck) {
-    if(triclinic == 0) {
-      delx = bboxlo[0] - boxlo_hold[0];
-      dely = bboxlo[1] - boxlo_hold[1];
-      delz = bboxlo[2] - boxlo_hold[2];
-      delta1 = sqrt(delx * delx + dely * dely + delz * delz);
-      delx = bboxhi[0] - boxhi_hold[0];
-      dely = bboxhi[1] - boxhi_hold[1];
-      delz = bboxhi[2] - boxhi_hold[2];
-      delta2 = sqrt(delx * delx + dely * dely + delz * delz);
-      delta = 0.5 * (skin - (delta1 + delta2));
-      deltasq = delta * delta;
-    } else {
-      domain->box_corners();
-      delta1 = delta2 = 0.0;
-
-      for(int i = 0; i < 8; i++) {
-        delx = corners[i][0] - corners_hold[i][0];
-        dely = corners[i][1] - corners_hold[i][1];
-        delz = corners[i][2] - corners_hold[i][2];
-        delta = sqrt(delx * delx + dely * dely + delz * delz);
-
-        if(delta > delta1) delta1 = delta;
-        else if(delta > delta2) delta2 = delta;
-      }
-
-      delta = 0.5 * (skin - (delta1 + delta2));
-      deltasq = delta * delta;
-    }
-  } else deltasq = triggersq;
-
-  double** x = atom->x;
-  int nlocal = atom->nlocal;
-
-  if(includegroup) nlocal = atom->nfirst;
-
-  int flag = 0;
-
-  if(not cuda->neighbor_decide_by_integrator) {
-    cuda->cu_x_download();
-
-    for(int i = 0; i < nlocal; i++) {
-      delx = x[i][0] - xhold[i][0];
-      dely = x[i][1] - xhold[i][1];
-      delz = x[i][2] - xhold[i][2];
-      rsq = delx * delx + dely * dely + delz * delz;
-
-      if(rsq > deltasq) flag = 1;
-    }
-  } else flag = cuda->shared_data.atom.reneigh_flag;
-
-  int flagall;
-  MPI_Allreduce(&flag, &flagall, 1, MPI_INT, MPI_MAX, world);
-
-  if(flagall && ago == MAX(every, delay)) ndanger++;
-
-  return flagall;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void NeighborCuda::build(int topoflag)
-{
-  int i;
-
-  ago = 0;
-  ncalls++;
-  lastcall = update->ntimestep;
-  // store current atom positions and box size if needed
-
-  if(dist_check) {
-    if(cuda->decide_by_integrator())
-      cuda->update_xhold(maxhold, &xhold[0][0]);
-    else {
-      if(cuda->finished_setup) cuda->cu_x_download();
-
-      double** x = atom->x;
-      int nlocal = atom->nlocal;
-
-      if(includegroup) nlocal = atom->nfirst;
-
-      if(nlocal > maxhold) {
-        maxhold = atom->nmax;
-        memory->destroy(xhold);
-        memory->create(xhold, maxhold, 3, "neigh:xhold");
-      }
-
-      for(i = 0; i < nlocal; i++) {
-        xhold[i][0] = x[i][0];
-        xhold[i][1] = x[i][1];
-        xhold[i][2] = x[i][2];
-      }
-
-      if(boxcheck) {
-        if(triclinic == 0) {
-          boxlo_hold[0] = bboxlo[0];
-          boxlo_hold[1] = bboxlo[1];
-          boxlo_hold[2] = bboxlo[2];
-          boxhi_hold[0] = bboxhi[0];
-          boxhi_hold[1] = bboxhi[1];
-          boxhi_hold[2] = bboxhi[2];
-        } else {
-          domain->box_corners();
-          corners = domain->corners;
-
-          for(i = 0; i < 8; i++) {
-            corners_hold[i][0] = corners[i][0];
-            corners_hold[i][1] = corners[i][1];
-            corners_hold[i][2] = corners[i][2];
-          }
-        }
-      }
-    }
-  }
-
-  if(not cudable && cuda->finished_setup && atom->avec->cudable)
-    cuda->downloadAll();
-
-  if(cudable && (not cuda->finished_setup)) {
-    cuda->checkResize();
-    cuda->uploadAll();
-  }
-
-  // if any lists store neighbors of ghosts:
-  // invoke grow() if nlocal+nghost exceeds previous list size
-  // else only invoke grow() if nlocal exceeds previous list size
-  // only done for lists with growflag set and which are perpetual
-
-  if(anyghostlist && atom->nlocal + atom->nghost > maxatom) {
-    maxatom = atom->nmax;
-
-    for(i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
-  } else if(atom->nlocal > maxatom) {
-    maxatom = atom->nmax;
-
-    for(i = 0; i < nglist; i++) lists[glist[i]]->grow(maxatom);
-  }
-
-  // extend atom bin list if necessary
-
-  if(style != NSQ && atom->nmax > maxbin) {
-    maxbin = atom->nmax;
-    memory->destroy(bins);
-    memory->create(bins, maxbin, "bins");
-  }
-
-  // check that neighbor list with special bond flags will not overflow
-
-  if(atom->nlocal + atom->nghost > NEIGHMASK)
-    error->one(FLERR, "Too many local+ghost atoms for neighbor list");
-
-  // invoke building of pair and molecular neighbor lists
-  // only for pairwise lists with buildflag set
-
-  for(i = 0; i < nblist; i++)
-    (this->*pair_build[blist[i]])(lists[blist[i]]);
-
-  if(atom->molecular && topoflag) {
-    if(force->bond)(this->*bond_build)();
-    if(force->angle)(this->*angle_build)();
-    if(force->dihedral)(this->*dihedral_build)();
-    if(force->improper)(this->*improper_build)();
-  }
-}
diff --git a/src/USER-CUDA/neighbor_cuda.h b/src/USER-CUDA/neighbor_cuda.h
deleted file mode 100644
index 708f45fde2..0000000000
--- a/src/USER-CUDA/neighbor_cuda.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifndef LMP_NEIGHBOR_CUDA_H
-#define LMP_NEIGHBOR_CUDA_H
-
-#include "neighbor.h"
-
-namespace LAMMPS_NS {
-
-class NeighborCuda : public Neighbor {
- public:
-  NeighborCuda(class LAMMPS *);
-  void init();
-  int check_distance();
-  void build(int do_build_bonded=1);
-
- private:
-  class Cuda *cuda;
-
-  void choose_build(int, class NeighRequest *);
-  typedef void (NeighborCuda::*PairPtr)(class NeighList *);
-  void full_nsq_cuda(class NeighList *);
-  void full_bin_cuda(class NeighList *);
-};
-
-}
-
-#endif
diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.cpp b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
deleted file mode 100644
index d01a5bf47f..0000000000
--- a/src/USER-CUDA/pair_born_coul_long_cuda.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_born_coul_long_cuda.h"
-#include "pair_born_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-/* ---------------------------------------------------------------------- */
-
-PairBornCoulLongCuda::PairBornCoulLongCuda(LAMMPS *lmp) : PairBornCoulLong(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairBornCoulLongCuda::allocate()
-{
-        if(! allocated) PairBornCoulLong::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.coeff1  = rhoinv;
-                cuda->shared_data.pair.coeff2  = sigma;
-                cuda->shared_data.pair.coeff3  = a;
-                cuda->shared_data.pair.coeff4  = c;
-                cuda->shared_data.pair.coeff5  = d;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBornCoulLongCuda::compute(int eflag, int vflag)
-{
-        MYDBG( printf("PairBornCoulLongCuda compute start\n"); fflush(stdout);)
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-        #ifdef CUDA_USE_BINNING
-        Cuda_PairBornCoulLongCuda(& cuda->shared_data, eflag, vflag);
-        #else
-        Cuda_PairBornCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-        #endif
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-        MYDBG( printf("PairBornCoulLongCuda compute end\n"); fflush(stdout);)
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBornCoulLongCuda::settings(int narg, char **arg)
-{
-        PairBornCoulLong::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBornCoulLongCuda::coeff(int narg, char **arg)
-{
-        PairBornCoulLong::coeff(narg, arg);
-        allocate();
-}
-
-void PairBornCoulLongCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style born/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (strstr(update->integrate_style,"respa")) error->all(FLERR,"Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-
-  cut_coulsq = cut_coul * cut_coul;
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-
-  if (force->kspace == NULL)
-    error->all(FLERR,"Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald=g_ewald;
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-
-void PairBornCoulLongCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list\n");)
-        PairBornCoulLong::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list end\n");)
-}
-
-void PairBornCoulLongCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairBornCoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_born_coul_long_cuda.h b/src/USER-CUDA/pair_born_coul_long_cuda.h
deleted file mode 100644
index 6e4f42cf3b..0000000000
--- a/src/USER-CUDA/pair_born_coul_long_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(born/coul/long/cuda,PairBornCoulLongCuda)
-
-#else
-
-#ifndef LMP_PAIR_BORN_COUL_LONG_CUDA_H
-#define LMP_PAIR_BORN_COUL_LONG_CUDA_H
-
-#include "pair_born_coul_long.h"
-
-namespace LAMMPS_NS {
-
-class PairBornCoulLongCuda : public PairBornCoulLong
-{
-        public:
-                PairBornCoulLongCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
deleted file mode 100644
index 4291b82752..0000000000
--- a/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_buck_coul_cut_cuda.h"
-#include "pair_buck_coul_cut_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairBuckCoulCutCuda::PairBuckCoulCutCuda(LAMMPS *lmp) : PairBuckCoulCut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairBuckCoulCutCuda::allocate()
-{
-        if(! allocated) PairBuckCoulCut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut_coul     = cut_coul;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.coeff1  = rhoinv;
-                cuda->shared_data.pair.coeff2  = buck1;
-                cuda->shared_data.pair.coeff3  = buck2;
-                cuda->shared_data.pair.coeff4  = a;
-                cuda->shared_data.pair.coeff5  = c;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBuckCoulCutCuda::compute(int eflag, int vflag)
-{
-        MYDBG( printf("PairBuckCoulCutCuda compute start\n"); fflush(stdout);)
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairBuckCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-        MYDBG( printf("PairBuckCoulCutCuda compute end\n"); fflush(stdout);)
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBuckCoulCutCuda::settings(int narg, char **arg)
-{
-        PairBuckCoulCut::settings(narg, arg);
-        cuda->shared_data.pair.cut_coul_global = (F_CFLOAT) cut_coul_global;
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairBuckCoulCutCuda::coeff(int narg, char **arg)
-{
-        PairBuckCoulCut::coeff(narg, arg);
-        allocate();
-}
-
-void PairBuckCoulCutCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style buck/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (strstr(update->integrate_style,"respa")) error->all(FLERR,"Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-   cuda->shared_data.pair.cut_coulsq_global=cut_coul_global * cut_coul_global;
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-
-void PairBuckCoulCutCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list\n");)
-        PairBuckCoulCut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list end\n");)
-}
-
-void PairBuckCoulCutCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairBuckCoulCut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_buck_coul_cut_cuda.h b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
deleted file mode 100644
index f66b70fb00..0000000000
--- a/src/USER-CUDA/pair_buck_coul_cut_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(buck/coul/cut/cuda,PairBuckCoulCutCuda)
-
-#else
-
-#ifndef LMP_PAIR_BUCK_COUL_CUT_CUDA_H
-#define LMP_PAIR_BUCK_COUL_CUT_CUDA_H
-
-#include "pair_buck_coul_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairBuckCoulCutCuda : public PairBuckCoulCut
-{
-        public:
-                PairBuckCoulCutCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.cpp b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
deleted file mode 100644
index 8c8d667165..0000000000
--- a/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_buck_coul_long_cuda.h"
-#include "pair_buck_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-/* ---------------------------------------------------------------------- */
-// Build on PairBuckCoulLong, attach to the Cuda object of this LAMMPS instance and mark the pair force as cudable.
-PairBuckCoulLongCuda::PairBuckCoulLongCuda(LAMMPS *lmp) : PairBuckCoulLong(lmp)
-{
-  allocated2 = false;   // allocate() has not yet recorded anything in the cuda shared data
-  cuda = lmp->cuda;
-  if (!cuda)
-    error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  cuda->shared_data.pair.use_block_per_atom = 0;
-  cuda->shared_data.pair.cudable_force = 1;
-  cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   record the base style's coefficient arrays in the cuda shared pair data
-------------------------------------------------------------------------- */
-
-void PairBuckCoulLongCuda::allocate()
-{
-  if (!allocated) PairBuckCoulLong::allocate();
-  if (allocated2) return;   // shared data was already filled in by an earlier call
-  allocated2 = true;
-  // special_lj / special_coul are taken from the Force object
-  cuda->shared_data.pair.special_lj  = force->special_lj;
-  cuda->shared_data.pair.special_coul  = force->special_coul;
-  // the remaining entries are members this class does not declare itself (inherited)
-  cuda->shared_data.pair.cut     = cut_lj;
-  cuda->shared_data.pair.offset  = offset;
-  cuda->shared_data.pair.coeff1  = rhoinv;
-  cuda->shared_data.pair.coeff2  = buck1;
-  cuda->shared_data.pair.coeff3  = buck2;
-  cuda->shared_data.pair.coeff4  = a;
-  cuda->shared_data.pair.coeff5  = c;
-}
-
-/* ---------------------------------------------------------------------- */
-// upload the requested accumulators, call Cuda_PairBuckCoulLongCuda, then download them again unless collect_forces_later is set
-void PairBuckCoulLongCuda::compute(int eflag, int vflag)
-{
-  MYDBG( printf("PairBuckCoulLongCuda compute start\n"); fflush(stdout);)
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  if (eflag) {
-    cuda->cu_eng_vdwl->upload();
-    cuda->cu_eng_coul->upload();
-  }
-  if (vflag) cuda->cu_virial->upload();
-  Cuda_PairBuckCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-  const bool collect_now = !cuda->shared_data.pair.collect_forces_later;
-  if (collect_now && eflag) {
-    cuda->cu_eng_vdwl->download();
-    cuda->cu_eng_coul->download();
-  }
-  if (collect_now && vflag) cuda->cu_virial->download();
-  MYDBG( printf("PairBuckCoulLongCuda compute end\n"); fflush(stdout);)
-}
-
-/* ---------------------------------------------------------------------- */
-// let the base style parse the arguments, then copy cut_lj_global (converted to F_CFLOAT) into the cuda shared pair data
-void PairBuckCoulLongCuda::settings(int narg, char **arg)
-{
-  PairBuckCoulLong::settings(narg, arg);
-  cuda->shared_data.pair.cut_global = static_cast<F_CFLOAT>(cut_lj_global);
-}
-
-/* ---------------------------------------------------------------------- */
-// delegate coefficient parsing to the base style, then run this class's allocate()
-void PairBuckCoulLongCuda::coeff(int narg, char **arg)
-{
-  PairBuckCoulLong::coeff(narg, arg);
-  this->allocate();
-}
-// check prerequisites, request a full neighbor list with the cudable flag, and publish the Coulomb/Ewald settings to the cuda shared data
-void PairBuckCoulLongCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style buck/coul/long requires atom attribute q");
-
-  // an integrate style containing "respa" is rejected outright
-  const bool respa_requested = strstr(update->integrate_style,"respa") != NULL;
-  if (respa_requested)
-    error->all(FLERR,"Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
-
-  // one request: full list instead of half, cudable flag set
-  const int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->cudable = 1;
-
-  // squared Coulomb cutoff, stored locally and in the shared pair data
-  cut_coulsq = cut_coul * cut_coul;
-  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
-
-  if (!force->kspace)
-    error->all(FLERR,"Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald = g_ewald;
-  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
-
-  if (ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-// hand the list to the base style and, unless CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
-void PairBuckCoulLongCuda::init_list(int id, NeighList *ptr)
-{
-  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list\n");)
-  PairBuckCoulLong::init_list(id, ptr);
-#ifndef CUDA_USE_BINNING
-  // only list id 0 (verlet) can be handled right now, not respa;
-  // see Neighbor::init() for details on lammps lists' logic
-  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
-#endif
-  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list end\n");)
-}
-// Run the base ev_setup(), then recreate the cuda wrappers around eatom/vatom whenever atom->nmax exceeds the capacity recorded before that call.
-void PairBuckCoulLongCuda::ev_setup(int eflag, int vflag)
-{
-  const int maxeatomold = maxeatom;   // per-atom energy capacity before the base call
-  const int maxvatomold = maxvatom;   // per-atom virial capacity before the base call (tracked separately from maxeatom)
-  PairBuckCoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-  // gate on the vatom capacity: testing the eatom capacity here could leave cu_vatom wrapping a stale vatom, or rebuild it on every call
-  if (vflag_atom && atom->nmax > maxvatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-}
diff --git a/src/USER-CUDA/pair_buck_coul_long_cuda.h b/src/USER-CUDA/pair_buck_coul_long_cuda.h
deleted file mode 100644
index 41d4637d9a..0000000000
--- a/src/USER-CUDA/pair_buck_coul_long_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(buck/coul/long/cuda,PairBuckCoulLongCuda)
-
-#else
-
-#ifndef LMP_PAIR_BUCK_COUL_LONG_CUDA_H
-#define LMP_PAIR_BUCK_COUL_LONG_CUDA_H
-
-#include "pair_buck_coul_long.h"
-
-namespace LAMMPS_NS {
-// Pair style registered above as buck/coul/long/cuda: PairBuckCoulLong with its entry points routed through the Cuda object.
-class PairBuckCoulLongCuda : public PairBuckCoulLong
-{
-        public:
-                PairBuckCoulLongCuda(class LAMMPS *);     // errors out if the LAMMPS instance has no Cuda object
-                void compute(int, int);                   // (eflag, vflag); calls Cuda_PairBuckCoulLongCuda
-                void settings(int, char **);              // (narg, arg); base settings() plus cut_global in the shared data
-                void coeff(int, char **);                 // (narg, arg); base coeff() followed by allocate()
-                void init_list(int, class NeighList *);   // (id, ptr); registers list id 0 unless CUDA_USE_BINNING is defined
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;                         // taken from lmp->cuda in the constructor
-                void allocate();
-                bool allocated2;                          // false after construction; set once allocate() has filled the shared pair data
-                class CudaNeighList* cuda_neigh_list;     // assigned in init_list(); its sneighlist is passed to the compute call
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_buck_cuda.cpp b/src/USER-CUDA/pair_buck_cuda.cpp
deleted file mode 100644
index bcb9314c5f..0000000000
--- a/src/USER-CUDA/pair_buck_cuda.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_buck_cuda.h"
-#include "pair_buck_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-// Build on PairBuck, attach to the Cuda object of this LAMMPS instance and mark the pair force as cudable.
-PairBuckCuda::PairBuckCuda(LAMMPS *lmp) : PairBuck(lmp)
-{
-  allocated2 = false;   // allocate() has not yet recorded anything in the cuda shared data
-  cuda = lmp->cuda;
-  if (!cuda)
-    error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  cuda->shared_data.pair.use_block_per_atom = 0;
-  cuda->shared_data.pair.cudable_force = 1;
-  cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   record the base style's coefficient arrays in the cuda shared pair data
-------------------------------------------------------------------------- */
-
-void PairBuckCuda::allocate()
-{
-  if (!allocated) PairBuck::allocate();
-  if (allocated2) return;   // shared data was already filled in by an earlier call
-  allocated2 = true;
-  // special_lj is taken from the Force object
-  cuda->shared_data.pair.special_lj  = force->special_lj;
-  // the remaining entries are members this class does not declare itself (inherited)
-  cuda->shared_data.pair.cut     = cut;
-  cuda->shared_data.pair.offset  = offset;
-  cuda->shared_data.pair.coeff1  = rhoinv;
-  cuda->shared_data.pair.coeff2  = buck1;
-  cuda->shared_data.pair.coeff3  = buck2;
-  cuda->shared_data.pair.coeff4  = a;
-  cuda->shared_data.pair.coeff5  = c;
-}
-
-/* ---------------------------------------------------------------------- */
-// upload the requested accumulators, call Cuda_PairBuckCuda, then download them again unless collect_forces_later is set
-void PairBuckCuda::compute(int eflag, int vflag)
-{
-  MYDBG( printf("PairBuckCuda compute start\n"); fflush(stdout);)
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  if (eflag) {
-    cuda->cu_eng_vdwl->upload();
-    cuda->cu_eng_coul->upload();
-  }
-  if (vflag) cuda->cu_virial->upload();
-  Cuda_PairBuckCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-  const bool collect_now = !cuda->shared_data.pair.collect_forces_later;
-  if (collect_now && eflag) {
-    cuda->cu_eng_vdwl->download();
-    cuda->cu_eng_coul->download();
-  }
-  if (collect_now && vflag) cuda->cu_virial->download();
-  MYDBG( printf("PairBuckCuda compute end\n"); fflush(stdout);)
-}
-
-/* ---------------------------------------------------------------------- */
-// let the base style parse the arguments, then copy cut_global (converted to F_CFLOAT) into the cuda shared pair data
-void PairBuckCuda::settings(int narg, char **arg)
-{
-  PairBuck::settings(narg, arg);
-  cuda->shared_data.pair.cut_global = static_cast<F_CFLOAT>(cut_global);
-}
-
-/* ---------------------------------------------------------------------- */
-// delegate coefficient parsing to the base style, then run this class's allocate()
-void PairBuckCuda::coeff(int narg, char **arg)
-{
-  PairBuck::coeff(narg, arg);
-  this->allocate();
-}
-// Check prerequisites for buck/cuda, request a full neighbor list with the cudable flag, and publish qqrd2e to the cuda shared data.
-void PairBuckCuda::init_style()
-{
-  // NOTE(review): this charge requirement looks carried over from the buck/coul/long/cuda variant;
-  // it is kept so behavior does not change -- confirm whether buck/cuda really needs atom attribute q
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style buck/cuda requires atom attribute q");
-
-  // rRESPA is rejected, so only a single regular list is requested below
-  if (strstr(update->integrate_style,"respa"))
-    error->all(FLERR,"Integrate Style Respa is not supported by pair style buck/cuda");
-
-  // one request: full list instead of half, cudable flag set
-  int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-// hand the list to the base style and, unless CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
-void PairBuckCuda::init_list(int id, NeighList *ptr)
-{
-  MYDBG(printf("# CUDA PairBuckCuda::init_list\n");)
-  PairBuck::init_list(id, ptr);
-#ifndef CUDA_USE_BINNING
-  // only list id 0 (verlet) can be handled right now, not respa;
-  // see Neighbor::init() for details on lammps lists' logic
-  if (0 == id) cuda_neigh_list = cuda->registerNeighborList(ptr);
-#endif
-  MYDBG(printf("# CUDA PairBuckCuda::init_list end\n");)
-}
-// Run the base ev_setup(), then recreate the cuda wrappers around eatom/vatom whenever atom->nmax exceeds the capacity recorded before that call.
-void PairBuckCuda::ev_setup(int eflag, int vflag)
-{
-  const int maxeatomold = maxeatom;   // per-atom energy capacity before the base call
-  const int maxvatomold = maxvatom;   // per-atom virial capacity before the base call (tracked separately from maxeatom)
-  PairBuck::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-  // the virial wrapper is governed by vflag_atom and the vatom capacity, not by the per-atom energy flag/capacity
-  if (vflag_atom && atom->nmax > maxvatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-}
diff --git a/src/USER-CUDA/pair_buck_cuda.h b/src/USER-CUDA/pair_buck_cuda.h
deleted file mode 100644
index 9dfb742ed0..0000000000
--- a/src/USER-CUDA/pair_buck_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(buck/cuda,PairBuckCuda)
-
-#else
-
-#ifndef LMP_PAIR_BUCK_CUDA_H
-#define LMP_PAIR_BUCK_CUDA_H
-
-#include "pair_buck.h"
-
-namespace LAMMPS_NS {
-// Pair style registered above as buck/cuda: PairBuck with its entry points routed through the Cuda object.
-class PairBuckCuda : public PairBuck
-{
-        public:
-                PairBuckCuda(class LAMMPS *);             // errors out if the LAMMPS instance has no Cuda object
-                void compute(int, int);                   // (eflag, vflag); calls Cuda_PairBuckCuda
-                void settings(int, char **);              // (narg, arg); base settings() plus cut_global in the shared data
-                void coeff(int, char **);                 // (narg, arg); base coeff() followed by allocate()
-                void init_list(int, class NeighList *);   // (id, ptr); registers list id 0 unless CUDA_USE_BINNING is defined
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;                         // taken from lmp->cuda in the constructor
-                void allocate();
-                bool allocated2;                          // false after construction; set once allocate() has filled the shared pair data
-                class CudaNeighList* cuda_neigh_list;     // assigned in init_list(); its sneighlist is passed to the compute call
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.cpp b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
deleted file mode 100644
index c0b76c7e11..0000000000
--- a/src/USER-CUDA/pair_eam_alloy_cuda.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
-------------------------------------------------------------------------- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_eam_alloy_cuda.h"
-#include "atom.h"
-#include "comm.h"
-#include "memory.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-#define MAXLINE 1024
-
-/* ---------------------------------------------------------------------- */
-// construct via PairEAMCuda, set one_coeff, and refuse to continue when the LAMMPS instance has no Cuda object
-PairEAMAlloyCuda::PairEAMAlloyCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
-{
-  one_coeff = 1;
-
-  cuda = lmp->cuda;
-  if (!cuda)
-    error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs by reading a DYNAMO setfl file
-   arg = "*" "*" <setfl file> followed by one element name (or NULL) per atom type
-------------------------------------------------------------------------- */
-
-void PairEAMAlloyCuda::coeff(int narg, char **arg)
-{
-  int i,j;
-
-  if (!allocated) allocate();
-
-  if (narg != 3 + atom->ntypes)
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  // ensure I,J args are * *
-
-  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  // read EAM setfl file
-  // any setfl left from an earlier call is torn down first, then a fresh one is read
-  if (setfl) {
-    for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
-    delete [] setfl->elements;
-    delete [] setfl->mass;
-    memory->destroy(setfl->frho);
-    memory->destroy(setfl->rhor);
-    memory->destroy(setfl->z2r);
-    delete setfl;
-  }
-  setfl = new Setfl();
-  read_file(arg[2]);
-
-  // read args that map atom types to elements in potential file
-  // map[i] = which element the Ith atom type is, -1 if NULL
-
-  for (i = 3; i < narg; i++) {
-    if (strcmp(arg[i],"NULL") == 0) {
-      map[i-2] = -1;   // arg[3] describes atom type 1, hence the offset of 2
-      continue;
-    }
-    for (j = 0; j < setfl->nelements; j++)
-      if (strcmp(arg[i],setfl->elements[j]) == 0) break;
-    if (j < setfl->nelements) map[i-2] = j;
-    else error->all(FLERR,"No matching element in EAM potential file");
-  }
-
-  // clear setflag since coeff() called once with I,J = * *
-
-  int n = atom->ntypes;
-  for (i = 1; i <= n; i++)
-    for (j = i; j <= n; j++)
-      setflag[i][j] = 0;
-
-  // set setflag i,j for type pairs where both are mapped to elements
-  // set mass of atom type if i = j
-
-  int count = 0;   // number of type pairs that were flagged as set
-  for (i = 1; i <= n; i++) {
-    for (j = i; j <= n; j++) {
-      if (map[i] >= 0 && map[j] >= 0) {
-        setflag[i][j] = 1;
-        if (i == j) atom->set_mass(i,setfl->mass[map[i]]);
-        count++;
-      }
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
-}
-
-/* ----------------------------------------------------------------------
-   read a multi-element DYNAMO setfl file into the Setfl struct at setfl
-------------------------------------------------------------------------- */
-// filename = potential file; only rank 0 (comm->me == 0) opens and reads it, every parsed value is MPI_Bcast to all ranks
-void PairEAMAlloyCuda::read_file(char *filename)
-{
-  Setfl *file = setfl;
-
-  // open potential file
-
-  int me = comm->me;
-  FILE *fptr = NULL;   // stays NULL on ranks other than 0, which never touch the file
-  char line[MAXLINE];
-
-  if (me == 0) {
-    fptr = fopen(filename,"r");
-    if (fptr == NULL) {
-      char str[128];
-      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);   // bounded: a long path is truncated instead of overflowing str
-      error->one(FLERR,str);
-    }
-  }
-
-  // read and broadcast header
-  // extract element names from nelements line
-
-  int n;
-  if (me == 0) {
-    fgets(line,MAXLINE,fptr);
-    fgets(line,MAXLINE,fptr);
-    fgets(line,MAXLINE,fptr);
-    if (fgets(line,MAXLINE,fptr) == NULL) error->one(FLERR,"Unexpected end of EAM potential file");   // fewer than four lines would leave stale or uninitialized data in line
-    n = strlen(line) + 1;
-  }
-  MPI_Bcast(&n,1,MPI_INT,0,world);
-  MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-  sscanf(line,"%d",&file->nelements);
-  int nwords = atom->count_words(line);
-  if (nwords != file->nelements + 1)
-    error->all(FLERR,"Incorrect element names in EAM potential file");
-
-  char **words = new char*[file->nelements+1];   // one extra slot receives the terminating NULL from strtok
-  nwords = 0;
-  strtok(line," \t\n\r\f");   // skip the leading element count
-  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
-
-  file->elements = new char*[file->nelements];
-  for (int i = 0; i < file->nelements; i++) {
-    n = strlen(words[i]) + 1;
-    file->elements[i] = new char[n];
-    strcpy(file->elements[i],words[i]);
-  }
-  delete [] words;
-
-  if (me == 0) {
-    fgets(line,MAXLINE,fptr);
-    sscanf(line,"%d %lg %d %lg %lg",
-           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
-  }
-
-  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
-  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
-  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
-
-  file->mass = new double[file->nelements];
-  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
-  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
-  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
-                 "pair:z2r");
-  int i,j,tmp;
-  for (i = 0; i < file->nelements; i++) {
-    if (me == 0) {
-      fgets(line,MAXLINE,fptr);
-      sscanf(line,"%d %lg",&tmp,&file->mass[i]);   // the leading integer is parsed into tmp and not used further
-    }
-    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
-
-    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);   // tables are filled starting at index 1
-    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
-    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
-    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
-  }
-
-  for (i = 0; i < file->nelements; i++)
-    for (j = 0; j <= i; j++) {   // only entries with j <= i are read
-      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
-      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
-    }
-
-  // close the potential file
-
-  if (me == 0) fclose(fptr);
-}
-
-/* ----------------------------------------------------------------------
-   copy read-in setfl potential to standard array format
-------------------------------------------------------------------------- */
-// fills frho/rhor/z2r and the type2frho/type2rhor/type2z2r lookup tables from the contents of setfl and map
-void PairEAMAlloyCuda::file2array()
-{
-  int i,j,m,n;
-  int ntypes = atom->ntypes;
-
-  // set function params directly from setfl file
-
-  nrho = setfl->nrho;
-  nr = setfl->nr;
-  drho = setfl->drho;
-  dr = setfl->dr;
-
-  // ------------------------------------------------------------------
-  // setup frho arrays
-  // ------------------------------------------------------------------
-
-  // allocate frho arrays
-  // nfrho = # of setfl elements + 1 for zero array
-
-  nfrho = setfl->nelements + 1;
-  memory->destroy(frho);
-  memory->create(frho,nfrho,nrho+1,"pair:frho");
-
-  // copy each element's frho to global frho
-
-  for (i = 0; i < setfl->nelements; i++)
-    for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];   // entries 1..nrho are copied, index 0 is left untouched
-
-  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
-  // this is necessary b/c fp is still computed for non-EAM atoms
-
-  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
-
-  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
-  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
-  // then map it to last frho array of zeroes
-
-  for (i = 1; i <= ntypes; i++)
-    if (map[i] >= 0) type2frho[i] = map[i];
-    else type2frho[i] = nfrho-1;
-
-  // ------------------------------------------------------------------
-  // setup rhor arrays
-  // ------------------------------------------------------------------
-
-  // allocate rhor arrays
-  // nrhor = # of setfl elements
-
-  nrhor = setfl->nelements;
-  memory->destroy(rhor);
-  memory->create(rhor,nrhor,nr+1,"pair:rhor");
-
-  // copy each element's rhor to global rhor
-
-  for (i = 0; i < setfl->nelements; i++)
-    for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];   // entries 1..nr are copied, index 0 is left untouched
-
-  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
-  // for setfl files, I,J mapping only depends on I
-  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
-
-  for (i = 1; i <= ntypes; i++)
-    for (j = 1; j <= ntypes; j++)
-      type2rhor[i][j] = map[i];
-
-  // ------------------------------------------------------------------
-  // setup z2r arrays
-  // ------------------------------------------------------------------
-
-  // allocate z2r arrays
-  // nz2r = N*(N+1)/2 where N = # of setfl elements
-
-  nz2r = setfl->nelements * (setfl->nelements+1) / 2;
-  memory->destroy(z2r);
-  memory->create(z2r,nz2r,nr+1,"pair:z2r");
-
-  // copy each element pair z2r to global z2r, only for I >= J
-
-  n = 0;   // running index into the packed z2r array
-  for (i = 0; i < setfl->nelements; i++)
-    for (j = 0; j <= i; j++) {
-      for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
-      n++;
-    }
-
-  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
-  // set of z2r arrays only fill lower triangular Nelement matrix
-  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
-  // swap indices when irow < icol to stay lower triangular
-  // if map = -1 (non-EAM atom in pair hybrid):
-  //   type2z2r is not used by non-opt
-  //   but set type2z2r to 0 since accessed by opt
-
-  int irow,icol;
-  for (i = 1; i <= ntypes; i++) {
-    for (j = 1; j <= ntypes; j++) {
-      irow = map[i];
-      icol = map[j];
-      if (irow == -1 || icol == -1) {
-        type2z2r[i][j] = 0;
-        continue;
-      }
-      if (irow < icol) {
-        irow = map[j];
-        icol = map[i];
-      }
-      n = 0;
-      for (m = 0; m < irow; m++) n += m + 1;   // 1 + 2 + ... + irow = number of packed entries in the rows above irow
-      n += icol;
-      type2z2r[i][j] = n;
-    }
-  }
-}
diff --git a/src/USER-CUDA/pair_eam_alloy_cuda.h b/src/USER-CUDA/pair_eam_alloy_cuda.h
deleted file mode 100644
index c46755d0f8..0000000000
--- a/src/USER-CUDA/pair_eam_alloy_cuda.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(eam/alloy/cuda,PairEAMAlloyCuda)
-
-#else
-
-#ifndef LMP_PAIR_EAM_CUDA_ALLOY_H
-#define LMP_PAIR_EAM_CUDA_ALLOY_H
-
-#include "pair_eam_cuda.h"
-
-namespace LAMMPS_NS {
-
-// use virtual public since this class is parent in multiple inheritance
-// pair style registered above as eam/alloy/cuda: PairEAMCuda with coefficients read from a DYNAMO setfl file
-class PairEAMAlloyCuda : virtual public PairEAMCuda {
- public:
-  PairEAMAlloyCuda(class LAMMPS *);
-  virtual ~PairEAMAlloyCuda() {}
-  void coeff(int, char **);   // (narg, arg)
-
- protected:
-  class Cuda *cuda;           // taken from lmp->cuda in the constructor
-  void read_file(char *);     // (filename) reads the setfl file into setfl
-  void file2array();          // copies the setfl contents into the frho/rhor/z2r arrays
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_eam_cuda.cpp b/src/USER-CUDA/pair_eam_cuda.cpp
deleted file mode 100644
index 3db0c66cd6..0000000000
--- a/src/USER-CUDA/pair_eam_cuda.cpp
+++ /dev/null
@@ -1,265 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_eam_cuda.h"
-#include "pair_eam_cuda_cu.h"
-#include "pair_virial_compute_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairEAMCuda::PairEAMCuda(LAMMPS* lmp) : PairEAM(lmp)
-{
-  cuda = lmp->cuda;
-
-  if(cuda == NULL)
-    error->all(FLERR, "You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  allocated2 = false;
-  cuda->shared_data.pair.cudable_force = 1;
-  cuda->shared_data.pair.override_block_per_atom = 0;
-
-  cuda->setSystemParams();
-  cu_rho = NULL;
-  cu_fp = NULL;
-  cu_frho_spline = NULL;
-  cu_z2r_spline = NULL;
-  cu_rhor_spline = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairEAMCuda::allocate()
-{
-  if(! allocated) PairEAM::allocate();
-
-  cuda->shared_data.pair.cutsq     = cutsq;
-  cuda->shared_data.pair.cut_global = (F_CFLOAT) cutforcesq;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairEAMCuda::compute(int eflag, int vflag)
-{
-  cuda->shared_data.pair.cut_global = (F_CFLOAT) cutforcesq;
-  cuda->shared_data.pair.use_block_per_atom = 0;
-  cuda->shared_data.pair.collect_forces_later = 0;
-
-  if(atom->nmax > nmax || cuda->finished_setup == false) {
-    memory->destroy(rho);
-    memory->destroy(fp);
-    nmax = atom->nmax;
-    memory->create(rho, nmax, "pair:rho");
-    memory->create(fp, nmax, "pair:fp");
-    delete cu_rho;
-    delete cu_fp;
-    cu_rho = new cCudaData<double, F_CFLOAT, x> (rho, atom->nmax);
-    cu_fp  = new cCudaData<double, F_CFLOAT, x> (fp, atom->nmax);
-    Cuda_PairEAMCuda_Init(&cuda->shared_data, rdr, rdrho, nfrho, nrhor, nr, nrho, nz2r,
-                          cu_frho_spline->dev_data(), cu_rhor_spline->dev_data(), cu_z2r_spline->dev_data(),
-                          cu_rho->dev_data(), cu_fp->dev_data(), type2frho, type2z2r, type2rhor);
-  }
-
-
-
-  if(eflag || vflag) ev_setup(eflag, vflag);
-
-  if(eflag) cuda->cu_eng_vdwl->upload();
-
-  if(vflag) cuda->cu_virial->upload();
-
-  Cuda_PairEAM1Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-  comm->forward_comm_pair(this);
-
-  Cuda_PairEAM2Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-  if(eflag) cuda->cu_eng_vdwl->download();
-
-  if(vflag) cuda->cu_virial->download();
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairEAMCuda::settings(int narg, char** arg)
-{
-  PairEAM::settings(narg, arg);
-  cuda->shared_data.pair.cut_global = (F_CFLOAT) cutforcesq;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairEAMCuda::coeff(int narg, char** arg)
-{
-  PairEAM::coeff(narg, arg);
-  allocate();
-}
-
-void PairEAMCuda::init_style()
-{
-  MYDBG(printf("# CUDA PairEAMCuda::init_style start\n");)
-  // request regular or rRESPA neighbor lists
-  file2array();
-  array2spline();
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-  delete cu_rhor_spline;
-  delete cu_z2r_spline;
-  delete cu_frho_spline;
-
-  cu_rhor_spline = new cCudaData<double, F_CFLOAT, xyz>((double*)rhor_spline, nrhor, nr + 1, EAM_COEFF_LENGTH);
-  cu_z2r_spline = new cCudaData<double, F_CFLOAT, xyz>((double*)z2r_spline, nz2r, nr + 1, EAM_COEFF_LENGTH);
-  cu_frho_spline = new cCudaData<double, F_CFLOAT, xyz>((double*)frho_spline, nfrho, nrho + 1, EAM_COEFF_LENGTH);
-
-  cu_rhor_spline->upload();
-  cu_z2r_spline->upload();
-  cu_frho_spline->upload();
-
-  MYDBG(printf("# CUDA PairEAMCuda::init_style end\n");)
-}
-
-void PairEAMCuda::init_list(int id, NeighList* ptr)
-{
-  MYDBG(printf("# CUDA PairEAMCuda::init_list\n");)
-  PairEAM::init_list(id, ptr);
-
-  // right now we can only handle verlet (id 0), not respa
-  if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-
-  // see Neighbor::init() for details on lammps lists' logic
-  MYDBG(printf("# CUDA PairEAMCuda::init_list end\n");)
-}
-
-void PairEAMCuda::array2spline()
-{
-  rdr = 1.0 / dr;
-  rdrho = 1.0 / drho;
-
-  memory->destroy(frho_spline);
-  memory->destroy(rhor_spline);
-  memory->destroy(z2r_spline);
-
-  memory->create(frho_spline, nfrho, nrho + 1, 8, "pair:frho");
-  memory->create(rhor_spline, nrhor, nr + 1, 8, "pair:rhor");
-  memory->create(z2r_spline, nz2r, nr + 1, 8, "pair:z2r");
-
-  for(int i = 0; i < nfrho; i++) {
-    interpolate(nrho, drho, frho[i], frho_spline[i]);
-
-    for(int j = 0; j < nrho + 1; j++)
-      frho_spline[i][j][7] = frho_spline[i][j][3];
-  }
-
-  for(int i = 0; i < nrhor; i++) {
-    interpolate(nr, dr, rhor[i], rhor_spline[i]);
-
-    for(int j = 0; j < nr + 1; j++)
-      rhor_spline[i][j][7] = rhor_spline[i][j][3];
-  }
-
-  for(int i = 0; i < nz2r; i++) {
-    interpolate(nr, dr, z2r[i], z2r_spline[i]);
-
-    for(int j = 0; j < nr + 1; j++)
-      z2r_spline[i][j][7] = z2r_spline[i][j][3];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int PairEAMCuda::pack_forward_comm(int n, int* iswap, double* buf,
-                                   int pbc_flag, int* pbc)
-{
-  Cuda_PairEAMCuda_PackComm(&cuda->shared_data, n, *iswap, buf);
-
-  if(sizeof(F_CFLOAT) < sizeof(double)) return n;
-  else return n;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairEAMCuda::unpack_forward_comm(int n, int first, double* buf)
-{
-  Cuda_PairEAMCuda_UnpackComm(&cuda->shared_data, n, first, buf, cu_fp->dev_data());
-}
-
-void PairEAMCuda::ev_setup(int eflag, int vflag)
-{
-  int maxeatomold = maxeatom;
-  PairEAM::ev_setup(eflag, vflag);
-
-  if(eflag_atom && atom->nmax > maxeatomold) {
-    delete cuda->cu_eatom;
-    cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax);
-  }
-
-  if(vflag_atom && atom->nmax > maxeatomold) {
-    delete cuda->cu_vatom;
-    cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6);
-  }
-
-}
diff --git a/src/USER-CUDA/pair_eam_cuda.h b/src/USER-CUDA/pair_eam_cuda.h
deleted file mode 100644
index 973fc20a45..0000000000
--- a/src/USER-CUDA/pair_eam_cuda.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-#ifdef PAIR_CLASS
-
-PairStyle(eam/cuda,PairEAMCuda)
-
-#else
-
-#ifndef PAIR_EAM_CUDA_H
-#define PAIR_EAM_CUDA_H
-
-#include "cuda_data.h"
-#include "pair_eam.h"
-
-namespace LAMMPS_NS {
-
-class PairEAMCuda : public PairEAM
-{
-        public:
-                PairEAMCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void array2spline();
-                int pack_forward_comm(int n, int *iswap, double *buf,
-                                      int pbc_flag, int *pbc);
-                void unpack_forward_comm(int n, int first, double *buf);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                virtual void ev_setup(int eflag, int vflag);
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double, F_CFLOAT, x>* cu_rho;
-                cCudaData<double, F_CFLOAT, x>* cu_fp;
-            cCudaData<double, F_CFLOAT, xyz>* cu_rhor_spline;
-            cCudaData<double, F_CFLOAT, xyz>* cu_z2r_spline;
-            cCudaData<double, F_CFLOAT, xyz>* cu_frho_spline;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_eam_fs_cuda.cpp b/src/USER-CUDA/pair_eam_fs_cuda.cpp
deleted file mode 100644
index 6190213402..0000000000
--- a/src/USER-CUDA/pair_eam_fs_cuda.cpp
+++ /dev/null
@@ -1,335 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Tim Lau (MIT)
-------------------------------------------------------------------------- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_eam_fs_cuda.h"
-#include "atom.h"
-#include "comm.h"
-#include "memory.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-#define MAXLINE 1024
-
-/* ---------------------------------------------------------------------- */
-
-PairEAMFSCuda::PairEAMFSCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  one_coeff = 1;
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-   read EAM Finnis-Sinclair file
-------------------------------------------------------------------------- */
-
-void PairEAMFSCuda::coeff(int narg, char **arg)
-{
-  int i,j;
-
-  if (!allocated) allocate();
-
-  if (narg != 3 + atom->ntypes)
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  // insure I,J args are * *
-
-  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  // read EAM Finnis-Sinclair file
-
-  if (fs) {
-    for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
-    delete [] fs->elements;
-    delete [] fs->mass;
-    memory->destroy(fs->frho);
-    memory->destroy(fs->rhor);
-    memory->destroy(fs->z2r);
-    delete fs;
-  }
-  fs = new Fs();
-  read_file(arg[2]);
-
-  // read args that map atom types to elements in potential file
-  // map[i] = which element the Ith atom type is, -1 if NULL
-
-  for (i = 3; i < narg; i++) {
-    if (strcmp(arg[i],"NULL") == 0) {
-      map[i-2] = -1;
-      continue;
-    }
-    for (j = 0; j < fs->nelements; j++)
-      if (strcmp(arg[i],fs->elements[j]) == 0) break;
-    if (j < fs->nelements) map[i-2] = j;
-    else error->all(FLERR,"No matching element in EAM potential file");
-  }
-
-  // clear setflag since coeff() called once with I,J = * *
-
-  int n = atom->ntypes;
-  for (i = 1; i <= n; i++)
-    for (j = i; j <= n; j++)
-      setflag[i][j] = 0;
-
-  // set setflag i,j for type pairs where both are mapped to elements
-  // set mass of atom type if i = j
-
-  int count = 0;
-  for (i = 1; i <= n; i++) {
-    for (j = i; j <= n; j++) {
-      if (map[i] >= 0 && map[j] >= 0) {
-        setflag[i][j] = 1;
-        if (i == j) atom->set_mass(i,fs->mass[map[i]]);
-        count++;
-      }
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
-}
-
-/* ----------------------------------------------------------------------
-   read a multi-element DYNAMO setfl file
-------------------------------------------------------------------------- */
-
-void PairEAMFSCuda::read_file(char *filename)
-{
-  Fs *file = fs;
-
-  // open potential file
-
-  int me = comm->me;
-  FILE *fptr;
-  char line[MAXLINE];
-
-  if (me == 0) {
-    fptr = fopen(filename,"r");
-    if (fptr == NULL) {
-      char str[128];
-      sprintf(str,"Cannot open EAM potential file %s",filename);
-      error->one(FLERR,str);
-    }
-  }
-
-  // read and broadcast header
-  // extract element names from nelements line
-
-  int n;
-  if (me == 0) {
-    fgets(line,MAXLINE,fptr);
-    fgets(line,MAXLINE,fptr);
-    fgets(line,MAXLINE,fptr);
-    fgets(line,MAXLINE,fptr);
-    n = strlen(line) + 1;
-  }
-  MPI_Bcast(&n,1,MPI_INT,0,world);
-  MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-  sscanf(line,"%d",&file->nelements);
-  int nwords = atom->count_words(line);
-  if (nwords != file->nelements + 1)
-    error->all(FLERR,"Incorrect element names in EAM potential file");
-
-  char **words = new char*[file->nelements+1];
-  nwords = 0;
-  strtok(line," \t\n\r\f");
-  while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
-
-  file->elements = new char*[file->nelements];
-  for (int i = 0; i < file->nelements; i++) {
-    n = strlen(words[i]) + 1;
-    file->elements[i] = new char[n];
-    strcpy(file->elements[i],words[i]);
-  }
-  delete [] words;
-
-  if (me == 0) {
-    fgets(line,MAXLINE,fptr);
-    sscanf(line,"%d %lg %d %lg %lg",
-           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
-  }
-
-  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
-  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
-  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
-
-  file->mass = new double[file->nelements];
-  memory->create(file->frho,file->nelements,file->nrho+1,
-                                              "pair:frho");
-  memory->create(file->rhor,file->nelements,file->nelements,
-                 file->nr+1,"pair:rhor");
-  memory->create(file->z2r,file->nelements,file->nelements,
-                 file->nr+1,"pair:z2r");
-  int i,j,tmp;
-  for (i = 0; i < file->nelements; i++) {
-    if (me == 0) {
-      fgets(line,MAXLINE,fptr);
-      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
-    }
-    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
-
-    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
-    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
-
-    for (j = 0; j < file->nelements; j++) {
-      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
-      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
-    }
-  }
-
-  for (i = 0; i < file->nelements; i++)
-    for (j = 0; j <= i; j++) {
-      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
-      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
-    }
-
-  // close the potential file
-
-  if (me == 0) fclose(fptr);
-}
-
-/* ----------------------------------------------------------------------
-   copy read-in setfl potential to standard array format
-------------------------------------------------------------------------- */
-
-void PairEAMFSCuda::file2array()
-{
-  int i,j,m,n;
-  int ntypes = atom->ntypes;
-
-  // set function params directly from fs file
-
-  nrho = fs->nrho;
-  nr = fs->nr;
-  drho = fs->drho;
-  dr = fs->dr;
-
-  // ------------------------------------------------------------------
-  // setup frho arrays
-  // ------------------------------------------------------------------
-
-  // allocate frho arrays
-  // nfrho = # of fs elements + 1 for zero array
-
-  nfrho = fs->nelements + 1;
-  memory->destroy(frho);
-  memory->create(frho,nfrho,nrho+1,"pair:frho");
-
-  // copy each element's frho to global frho
-
-  for (i = 0; i < fs->nelements; i++)
-    for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m];
-
-  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
-  // this is necessary b/c fp is still computed for non-EAM atoms
-
-  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
-
-  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
-  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
-  // then map it to last frho array of zeroes
-
-  for (i = 1; i <= ntypes; i++)
-    if (map[i] >= 0) type2frho[i] = map[i];
-    else type2frho[i] = nfrho-1;
-
-  // ------------------------------------------------------------------
-  // setup rhor arrays
-  // ------------------------------------------------------------------
-
-  // allocate rhor arrays
-  // nrhor = square of # of fs elements
-
-  nrhor = fs->nelements * fs->nelements;
-  memory->destroy(rhor);
-  memory->create(rhor,nrhor,nr+1,"pair:rhor");
-
-  // copy each element pair rhor to global rhor
-
-  n = 0;
-  for (i = 0; i < fs->nelements; i++)
-    for (j = 0; j < fs->nelements; j++) {
-      for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m];
-      n++;
-    }
-
-  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
-  // for fs files, there is a full NxN set of rhor arrays
-  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
-
-  for (i = 1; i <= ntypes; i++)
-    for (j = 1; j <= ntypes; j++)
-      type2rhor[i][j] = map[i] * fs->nelements + map[j];
-
-  // ------------------------------------------------------------------
-  // setup z2r arrays
-  // ------------------------------------------------------------------
-
-  // allocate z2r arrays
-  // nz2r = N*(N+1)/2 where N = # of fs elements
-
-  nz2r = fs->nelements * (fs->nelements+1) / 2;
-  memory->destroy(z2r);
-  memory->create(z2r,nz2r,nr+1,"pair:z2r");
-
-  // copy each element pair z2r to global z2r, only for I >= J
-
-  n = 0;
-  for (i = 0; i < fs->nelements; i++)
-    for (j = 0; j <= i; j++) {
-      for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m];
-      n++;
-    }
-
-  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
-  // set of z2r arrays only fill lower triangular Nelement matrix
-  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
-  // swap indices when irow < icol to stay lower triangular
-  // if map = -1 (non-EAM atom in pair hybrid):
-  //   type2z2r is not used by non-opt
-  //   but set type2z2r to 0 since accessed by opt
-
-  int irow,icol;
-  for (i = 1; i <= ntypes; i++) {
-    for (j = 1; j <= ntypes; j++) {
-      irow = map[i];
-      icol = map[j];
-      if (irow == -1 || icol == -1) {
-        type2z2r[i][j] = 0;
-        continue;
-      }
-      if (irow < icol) {
-        irow = map[j];
-        icol = map[i];
-      }
-      n = 0;
-      for (m = 0; m < irow; m++) n += m + 1;
-      n += icol;
-      type2z2r[i][j] = n;
-    }
-  }
-}
diff --git a/src/USER-CUDA/pair_eam_fs_cuda.h b/src/USER-CUDA/pair_eam_fs_cuda.h
deleted file mode 100644
index 698b485875..0000000000
--- a/src/USER-CUDA/pair_eam_fs_cuda.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(eam/fs/cuda,PairEAMFSCuda)
-
-#else
-
-#ifndef LMP_PAIR_EAM_FS_CUDA_H
-#define LMP_PAIR_EAM_FS_CUDA_H
-
-#include "pair_eam_cuda.h"
-
-namespace LAMMPS_NS {
-
-// use virtual public since this class is parent in multiple inheritance
-
-class PairEAMFSCuda : virtual public PairEAMCuda {
- public:
-  PairEAMFSCuda(class LAMMPS *);
-  virtual ~PairEAMFSCuda() {}
-  void coeff(int, char **);
-
- protected:
-  class Cuda *cuda;
-  void read_file(char *);
-  void file2array();
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.cpp b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
deleted file mode 100644
index 3f60475ad5..0000000000
--- a/src/USER-CUDA/pair_gran_hooke_cuda.cpp
+++ /dev/null
@@ -1,250 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_gran_hooke_cuda.h"
-#include "pair_gran_hooke_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "modify.h"
-#include "fix_pour.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairGranHookeCuda::PairGranHookeCuda(LAMMPS *lmp) : PairGranHooke(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairGranHookeCuda::allocate()
-{
-        if(! allocated) PairGranHooke::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                 int n = atom->ntypes;
-                cuda->shared_data.pair.cutsq     = cutsq;
-                memory->create(cuda->shared_data.pair.coeff1,n+1,n+1,
-                               "pair:cuda_coeff1");
-                memory->create(cuda->shared_data.pair.coeff2,
-                               n+1,n+1,"pair:cuda_coeff2");
-                cuda->shared_data.pair.coeff1[0][0]=kn;
-                cuda->shared_data.pair.coeff1[0][1]=kt;
-                cuda->shared_data.pair.coeff1[1][0]=gamman;
-                cuda->shared_data.pair.coeff1[1][1]=gammat;
-                cuda->shared_data.pair.coeff2[0][0]=xmu;
-                cuda->shared_data.pair.coeff2[0][1]=dampflag;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairGranHookeCuda::compute(int eflag, int vflag)
-{
-             cuda->shared_data.pair.use_block_per_atom = 0;
-        //cuda->cu_debugdata->memset_device(0);
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairGranHookeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-        //cuda->cu_debugdata->download();
-        //printf("%lf %lf %lf %lf %lf %lf\n",1.0e-6*cuda->debugdata[0],1.0e-6*cuda->debugdata[1],1.0e-6*cuda->debugdata[2],1.0e-6*cuda->debugdata[3],1.0e-6*cuda->debugdata[4],1.0e-6*cuda->debugdata[5]);
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairGranHookeCuda::settings(int narg, char **arg)
-{
-        PairGranHooke::settings(narg, arg);
- }
-
-/* ---------------------------------------------------------------------- */
-
-void PairGranHookeCuda::coeff(int narg, char **arg)
-{
-        PairGranHooke::coeff(narg, arg);
-        allocate();
-}
-
-void PairGranHookeCuda::init_style()
-{
-        int i;
-        MYDBG(printf("# CUDA PairGranHookeCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->gran = 1;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-  if (!atom->radius_flag || !atom->omega_flag || !atom->torque_flag)
-    error->all(FLERR,"Pair granular requires atom attributes radius, omega, torque");
-  if (comm->ghost_velocity == 0)
-    error->all(FLERR,"Pair granular requires ghost atoms store velocity");
-
-  // need a half neigh list and optionally a granular history neigh list
-
-  dt = update->dt;
-
-  // check for Fix freeze and set freeze_group_bit
-
-  for (i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"freeze") == 0) break;
-  if (i < modify->nfix) freeze_group_bit = modify->fix[i]->groupbit;
-  else freeze_group_bit = 0;
-
-  cuda->shared_data.pair.freeze_group_bit=freeze_group_bit;
-
-  // check for FixPour and FixDeposit so can extract particle radii
-
-  int ipour;
-  for (ipour = 0; ipour < modify->nfix; ipour++)
-    if (strcmp(modify->fix[ipour]->style,"pour") == 0) break;
-  if (ipour == modify->nfix) ipour = -1;
-
-  int idep;
-  for (idep = 0; idep < modify->nfix; idep++)
-    if (strcmp(modify->fix[idep]->style,"deposit") == 0) break;
-  if (idep == modify->nfix) idep = -1;
-
-  // set maxrad_dynamic and maxrad_frozen for each type
-  // include future FixPour and FixDeposit particles as dynamic
-
-  int itype;
-  for (i = 1; i <= atom->ntypes; i++) {
-    onerad_dynamic[i] = onerad_frozen[i] = 0.0;
-    if (ipour >= 0) {
-      itype = i;
-      onerad_dynamic[i] =
-        *((double *) modify->fix[ipour]->extract("radius",itype));
-    }
-    if (idep >= 0) {
-      itype = i;
-      onerad_dynamic[i] =
-        *((double *) modify->fix[idep]->extract("radius",itype));
-    }
-  }
-
-  double *radius = atom->radius;
-  int *mask = atom->mask;
-  int *type = atom->type;
-  int nlocal = atom->nlocal;
-
-  for (i = 0; i < nlocal; i++)
-    if (mask[i] & freeze_group_bit)
-      onerad_frozen[type[i]] = MAX(onerad_frozen[type[i]],radius[i]);
-    else
-      onerad_dynamic[type[i]] = MAX(onerad_dynamic[type[i]],radius[i]);
-
-  MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],atom->ntypes,
-                MPI_DOUBLE,MPI_MAX,world);
-  MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],atom->ntypes,
-                MPI_DOUBLE,MPI_MAX,world);
-}
-
-void PairGranHookeCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairGranHookeCuda::init_list\n");)
-        PairGranHooke::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairGranHookeCuda::init_list end\n");)
-}
-
-void PairGranHookeCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairGranHooke::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.eatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_gran_hooke_cuda.h b/src/USER-CUDA/pair_gran_hooke_cuda.h
deleted file mode 100644
index 6fa622ab40..0000000000
--- a/src/USER-CUDA/pair_gran_hooke_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(gran/hooke/cuda,PairGranHookeCuda)
-
-#else
-
-#ifndef PAIR_GRAN_HOOKE_CUDA_H
-#define PAIR_GRAN_HOOKE_CUDA_H
-
-#include "pair_gran_hooke.h"
-
-namespace LAMMPS_NS {
-
-class PairGranHookeCuda : public PairGranHooke
-{
-        public:
-                PairGranHookeCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.cpp b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
deleted file mode 100644
index 7edb722d36..0000000000
--- a/src/USER-CUDA/pair_lj96_cut_cuda.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj96_cut_cuda.h"
-#include "pair_lj96_cut_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJ96CutCuda::PairLJ96CutCuda(LAMMPS *lmp) : PairLJ96Cut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJ96CutCuda::allocate()
-{
-        if(! allocated) PairLJ96Cut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJ96CutCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJ96CutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJ96CutCuda::settings(int narg, char **arg)
-{
-        PairLJ96Cut::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJ96CutCuda::coeff(int narg, char **arg)
-{
-        PairLJ96Cut::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJ96CutCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJ96CutCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-
-  cut_respa = NULL;
-  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style end\n"); )
-}
-
-void PairLJ96CutCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJ96CutCuda::init_list\n");)
-        PairLJ96Cut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJ96CutCuda::init_list end\n");)
-}
-
-void PairLJ96CutCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJ96Cut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj96_cut_cuda.h b/src/USER-CUDA/pair_lj96_cut_cuda.h
deleted file mode 100644
index 8a8f36e504..0000000000
--- a/src/USER-CUDA/pair_lj96_cut_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj96/cut/cuda,PairLJ96CutCuda)
-
-#else
-
-#ifndef PAIR_LJ96_CUT_CUDA_H
-#define PAIR_LJ96_CUT_CUDA_H
-
-#include "pair_lj96_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairLJ96CutCuda : public PairLJ96Cut
-{
-        public:
-                PairLJ96CutCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
deleted file mode 100644
index 3a0ad0e288..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_charmm_coul_charmm_cuda.h"
-#include "pair_lj_charmm_coul_charmm_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCharmmCoulCharmmCuda::PairLJCharmmCoulCharmmCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmm(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmCuda::allocate()
-{
-        if(! allocated) PairLJCharmmCoulCharmm::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmCuda::compute(int eflag, int vflag)
-{
-          if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(eflag) cuda->cu_eng_coul->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJCharmmCoulCharmmCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj,cut_coul_innersq,denom_coul);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmCuda::settings(int narg, char **arg)
-{
-        PairLJCharmmCoulCharmm::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (X_CFLOAT) cut_lj;
-        cuda->shared_data.pair.cut_coulsq_global = (X_CFLOAT) cut_coulsq;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_lj_inner;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmCuda::coeff(int narg, char **arg)
-{
-        PairLJCharmmCoulCharmm::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCharmmCoulCharmmCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/charmm/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-        if(atom->molecular)
-        {
-          cuda->shared_data.pair.collect_forces_later = 1;
-        }
-
-  int irequest;
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-   if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
-    error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
-
-  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
-  cut_ljsq = cut_lj * cut_lj;
-  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
-  cut_coulsq = cut_coul * cut_coul;
-  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
-
-  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
-    (cut_ljsq-cut_lj_innersq);
-  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
-    (cut_coulsq-cut_coul_innersq);
-
-  cut_coulsq = cut_coul * cut_coul;
-
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-}
-
-void PairLJCharmmCoulCharmmCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list\n");)
-        PairLJCharmmCoulCharmm::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list end\n");)
-}
-
-void PairLJCharmmCoulCharmmCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCharmmCoulCharmm::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
deleted file mode 100644
index c19411f03c..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/charmm/coul/charmm/cuda,PairLJCharmmCoulCharmmCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
-#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
-
-#include "pair_lj_charmm_coul_charmm.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCharmmCoulCharmmCuda : public PairLJCharmmCoulCharmm
-{
-        public:
-                PairLJCharmmCoulCharmmCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
deleted file mode 100644
index c2f2ca871f..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_charmm_coul_charmm_implicit_cuda.h"
-#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCharmmCoulCharmmImplicitCuda::PairLJCharmmCoulCharmmImplicitCuda(LAMMPS *lmp) : PairLJCharmmCoulCharmmImplicit(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.collect_forces_later = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmImplicitCuda::allocate()
-{
-        if(! allocated) PairLJCharmmCoulCharmmImplicit::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmImplicitCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(eflag) cuda->cu_eng_coul->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJCharmmCoulCharmmImplicitCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj,cut_coul_innersq,denom_coul);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmImplicitCuda::settings(int narg, char **arg)
-{
-        PairLJCharmmCoulCharmmImplicit::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (X_CFLOAT) cut_lj;
-        cuda->shared_data.pair.cut_coulsq_global = (X_CFLOAT) cut_coulsq;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_lj_inner;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulCharmmImplicitCuda::coeff(int narg, char **arg)
-{
-        PairLJCharmmCoulCharmmImplicit::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCharmmCoulCharmmImplicitCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/charmm/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-   if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
-    error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
-
-  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
-  cut_ljsq = cut_lj * cut_lj;
-  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
-  cut_coulsq = cut_coul * cut_coul;
-  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
-
-  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
-    (cut_ljsq-cut_lj_innersq);
-  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
-    (cut_coulsq-cut_coul_innersq);
-
-  cut_coulsq = cut_coul * cut_coul;
-
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-}
-
-void PairLJCharmmCoulCharmmImplicitCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list\n");)
-        PairLJCharmmCoulCharmmImplicit::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list end\n");)
-}
-
-void PairLJCharmmCoulCharmmImplicitCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCharmmCoulCharmmImplicit::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
deleted file mode 100644
index b3cc8c9336..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/charmm/coul/charmm/implicit/cuda,PairLJCharmmCoulCharmmImplicitCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
-#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
-
-#include "pair_lj_charmm_coul_charmm_implicit.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCharmmCoulCharmmImplicitCuda : public PairLJCharmmCoulCharmmImplicit
-{
-        public:
-                PairLJCharmmCoulCharmmImplicitCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
deleted file mode 100644
index b228bd6f41..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_charmm_coul_long_cuda.h"
-#include "pair_lj_charmm_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-/* ---------------------------------------------------------------------- */
-
-PairLJCharmmCoulLongCuda::PairLJCharmmCoulLongCuda(LAMMPS *lmp) : PairLJCharmmCoulLong(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.collect_forces_later = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongCuda::allocate()
-{
-        if(! allocated) PairLJCharmmCoulLong::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                //cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(eflag) cuda->cu_eng_coul->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJCharmmCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,denom_lj);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongCuda::settings(int narg, char **arg)
-{
-        PairLJCharmmCoulLong::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (X_CFLOAT) cut_lj;
-        cuda->shared_data.pair.cut_coulsq_global = (X_CFLOAT) cut_coulsq;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_lj_inner;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCharmmCoulLongCuda::coeff(int narg, char **arg)
-{
-        PairLJCharmmCoulLong::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCharmmCoulLongCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/charmm/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-  if (cut_lj_inner >= cut_lj)
-    error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
-
-  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
-  cut_ljsq = cut_lj * cut_lj;
-  cut_coulsq = cut_coul * cut_coul;
-  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
-
-  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
-    (cut_ljsq-cut_lj_innersq);
-
-  cut_coulsq = cut_coul * cut_coul;
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-
-  if (force->kspace == NULL)
-    error->all(FLERR,"Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald=g_ewald;
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-
-void PairLJCharmmCoulLongCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list\n");)
-        PairLJCharmmCoulLong::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list end\n");)
-}
-
-void PairLJCharmmCoulLongCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCharmmCoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
deleted file mode 100644
index 8d9048a341..0000000000
--- a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/charmm/coul/long/cuda,PairLJCharmmCoulLongCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
-#define LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
-
-#include "pair_lj_charmm_coul_long.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCharmmCoulLongCuda : public PairLJCharmmCoulLong
-{
-        public:
-                PairLJCharmmCoulLongCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
deleted file mode 100644
index 01b6dc071f..0000000000
--- a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_class2_coul_cut_cuda.h"
-#include "pair_lj_class2_coul_cut_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJClass2CoulCutCuda::PairLJClass2CoulCutCuda(LAMMPS *lmp) : PairLJClass2CoulCut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJClass2CoulCutCuda::allocate()
-{
-        if(! allocated) PairLJClass2CoulCut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.cut_coul= cut_coul;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulCutCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJClass2CoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulCutCuda::settings(int narg, char **arg)
-{
-        PairLJClass2CoulCut::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-        cuda->shared_data.pair.cut_coul_global = (F_CFLOAT) cut_coul_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulCutCuda::coeff(int narg, char **arg)
-{
-        PairLJClass2CoulCut::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJClass2CoulCutCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/cut/coul/cut/cuda requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-}
-
-void PairLJClass2CoulCutCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list\n");)
-        PairLJClass2CoulCut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list end\n");)
-}
-
-void PairLJClass2CoulCutCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJClass2CoulCut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
deleted file mode 100644
index 6601e2797c..0000000000
--- a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/class2/coul/cut/cuda,PairLJClass2CoulCutCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
-#define LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
-
-#include "pair_lj_class2_coul_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairLJClass2CoulCutCuda : public PairLJClass2CoulCut
-{
-        public:
-                PairLJClass2CoulCutCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
deleted file mode 100644
index 20f257ffea..0000000000
--- a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
+++ /dev/null
@@ -1,175 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_class2_coul_long_cuda.h"
-#include "pair_lj_class2_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-/* ---------------------------------------------------------------------- */
-
-PairLJClass2CoulLongCuda::PairLJClass2CoulLongCuda(LAMMPS *lmp) : PairLJClass2CoulLong(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJClass2CoulLongCuda::allocate()
-{
-        if(! allocated) PairLJClass2CoulLong::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulLongCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJClass2CoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulLongCuda::settings(int narg, char **arg)
-{
-        PairLJClass2CoulLong::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2CoulLongCuda::coeff(int narg, char **arg)
-{
-        PairLJClass2CoulLong::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJClass2CoulLongCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/cut/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-  cut_coulsq = cut_coul * cut_coul;
-  cuda->shared_data.pair.cut_coul_global=cut_coul;
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-  // set rRESPA cutoffs
-
-  if (force->newton) error->warning(FLERR,"Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
-  if (force->kspace == NULL)
-    error->all(FLERR,"Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald=g_ewald;
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-
-void PairLJClass2CoulLongCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list\n");)
-        PairLJClass2CoulLong::init_list(id, ptr);
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list end\n");)
-}
-
-void PairLJClass2CoulLongCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJClass2CoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
deleted file mode 100644
index 43af51b4ed..0000000000
--- a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/class2/coul/long/cuda,PairLJClass2CoulLongCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
-#define LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
-
-#include "pair_lj_class2_coul_long.h"
-
-namespace LAMMPS_NS {
-
-class PairLJClass2CoulLongCuda : public PairLJClass2CoulLong
-{
-        public:
-                PairLJClass2CoulLongCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_class2_cuda.cpp b/src/USER-CUDA/pair_lj_class2_cuda.cpp
deleted file mode 100644
index 6b9f686c13..0000000000
--- a/src/USER-CUDA/pair_lj_class2_cuda.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_class2_cuda.h"
-#include "pair_lj_class2_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJClass2Cuda::PairLJClass2Cuda(LAMMPS *lmp) : PairLJClass2(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJClass2Cuda::allocate()
-{
-        if(! allocated) PairLJClass2::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2Cuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJClass2Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2Cuda::settings(int narg, char **arg)
-{
-        PairLJClass2::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJClass2Cuda::coeff(int narg, char **arg)
-{
-        PairLJClass2::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJClass2Cuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJClass2Cuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-        irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  MYDBG(printf("# CUDA PairLJClass2Cuda::init_style end\n"); )
-}
-
-void PairLJClass2Cuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJClass2Cuda::init_list\n");)
-        PairLJClass2::init_list(id, ptr);
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        MYDBG(printf("# CUDA PairLJClass2Cuda::init_list end\n");)
-}
-
-void PairLJClass2Cuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJClass2::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_class2_cuda.h b/src/USER-CUDA/pair_lj_class2_cuda.h
deleted file mode 100644
index 6f2673c4a3..0000000000
--- a/src/USER-CUDA/pair_lj_class2_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/class2/cuda,PairLJClass2Cuda)
-
-#else
-
-#ifndef PAIR_LJ_CLASS2_CUDA_H
-#define PAIR_LJ_CLASS2_CUDA_H
-
-#include "pair_lj_class2.h"
-
-namespace LAMMPS_NS {
-
-class PairLJClass2Cuda : public PairLJClass2
-{
-        public:
-                PairLJClass2Cuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
deleted file mode 100644
index 3872be0d0e..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_cut_coul_cut_cuda.h"
-#include "pair_lj_cut_coul_cut_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCutCoulCutCuda::PairLJCutCoulCutCuda(LAMMPS *lmp) : PairLJCutCoulCut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCutCoulCutCuda::allocate()
-{
-        if(! allocated) PairLJCutCoulCut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.cut_coul= cut_coul;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulCutCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJCutCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulCutCuda::settings(int narg, char **arg)
-{
-        PairLJCutCoulCut::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-        cuda->shared_data.pair.cut_coul_global = (F_CFLOAT) cut_coul_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulCutCuda::coeff(int narg, char **arg)
-{
-        PairLJCutCoulCut::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCutCoulCutCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/cut/coul/cut/cuda requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-}
-
-void PairLJCutCoulCutCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list\n");)
-        PairLJCutCoulCut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list end\n");)
-}
-
-void PairLJCutCoulCutCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCutCoulCut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
deleted file mode 100644
index 10f44c76da..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/cut/coul/cut/cuda,PairLJCutCoulCutCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
-#define LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
-
-#include "pair_lj_cut_coul_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCutCoulCutCuda : public PairLJCutCoulCut
-{
-        public:
-                PairLJCutCoulCutCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
deleted file mode 100644
index 43bcce68c9..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_cut_coul_debye_cuda.h"
-#include "pair_lj_cut_coul_debye_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCutCoulDebyeCuda::PairLJCutCoulDebyeCuda(LAMMPS *lmp) : PairLJCutCoulDebye(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCutCoulDebyeCuda::allocate()
-{
-        if(! allocated) PairLJCutCoulDebye::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.cut_coul= cut_coul;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulDebyeCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJCutCoulDebyeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulDebyeCuda::settings(int narg, char **arg)
-{
-        PairLJCutCoulDebye::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-        cuda->shared_data.pair.cut_coul_global = (F_CFLOAT) cut_coul_global;
-        cuda->shared_data.pair.kappa = (F_CFLOAT) kappa;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulDebyeCuda::coeff(int narg, char **arg)
-{
-        PairLJCutCoulDebye::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCutCoulDebyeCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/cut/coul/debye/cuda requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-
-  irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-}
-
-void PairLJCutCoulDebyeCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list\n");)
-        PairLJCutCoulDebye::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list end\n");)
-}
-
-void PairLJCutCoulDebyeCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCutCoulDebye::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h
deleted file mode 100644
index aea3a42f66..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/cut/coul/debye/cuda,PairLJCutCoulDebyeCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H
-#define LMP_PAIR_LJ_CUT_COUL_DEBYE_CUDA_H
-
-#include "pair_lj_cut_coul_debye.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCutCoulDebyeCuda : public PairLJCutCoulDebye
-{
-        public:
-                PairLJCutCoulDebyeCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp
deleted file mode 100644
index 52397f9429..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_cut_coul_long_cuda.h"
-#include "pair_lj_cut_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define A1        0.254829592
-#define A2       -0.284496736
-#define A3        1.421413741
-#define A4       -1.453152027
-#define A5        1.061405429
-/* ---------------------------------------------------------------------- */
-
-PairLJCutCoulLongCuda::PairLJCutCoulLongCuda(LAMMPS *lmp) : PairLJCutCoulLong(lmp)
-{
-  cuda = lmp->cuda;
-  if(cuda == NULL)
-      error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCutCoulLongCuda::allocate()
-{
-        if(! allocated) PairLJCutCoulLong::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulLongCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJCutCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulLongCuda::settings(int narg, char **arg)
-{
-        PairLJCutCoulLong::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCoulLongCuda::coeff(int narg, char **arg)
-{
-        PairLJCutCoulLong::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCutCoulLongCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/cut/coul/long requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-    int respa = 0;
-    if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
-    if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
-
-    if (respa == 0) irequest = neighbor->request(this,instance_me);
-    else if (respa == 1) {
-      irequest = neighbor->request(this,instance_me);
-      neighbor->requests[irequest]->id = 1;
-      neighbor->requests[irequest]->half = 0;
-      neighbor->requests[irequest]->respainner = 1;
-      irequest = neighbor->request(this,instance_me);
-      neighbor->requests[irequest]->id = 3;
-      neighbor->requests[irequest]->half = 0;
-      neighbor->requests[irequest]->respaouter = 1;
-    } else {
-      irequest = neighbor->request(this,instance_me);
-      neighbor->requests[irequest]->id = 1;
-      neighbor->requests[irequest]->half = 0;
-      neighbor->requests[irequest]->respainner = 1;
-      irequest = neighbor->request(this,instance_me);
-      neighbor->requests[irequest]->id = 2;
-      neighbor->requests[irequest]->half = 0;
-      neighbor->requests[irequest]->respamiddle = 1;
-      irequest = neighbor->request(this,instance_me);
-      neighbor->requests[irequest]->id = 3;
-      neighbor->requests[irequest]->half = 0;
-      neighbor->requests[irequest]->respaouter = 1;
-    }
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-  }
-
-  cut_coulsq = cut_coul * cut_coul;
-  cuda->shared_data.pair.cut_coul_global=cut_coul;
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-  // set rRESPA cutoffs
-
-  if (strstr(update->integrate_style,"respa") &&
-      ((Respa *) update->integrate)->level_inner >= 0)
-    cut_respa = ((Respa *) update->integrate)->cutoff;
-  else cut_respa = NULL;
-
-  if (force->newton) error->warning(FLERR,"Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
-  if (force->kspace == NULL)
-    error->all(FLERR,"Pair style is incompatible with KSpace style");
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald=g_ewald;
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-
-
-  if(ncoultablebits) error->warning(FLERR,"# CUDA: You asked for the usage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
-}
-
-void PairLJCutCoulLongCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list\n");)
-        PairLJCutCoulLong::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCutCoulLongCuda::init_list end\n");)
-}
-
-void PairLJCutCoulLongCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCutCoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h b/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h
deleted file mode 100644
index 2c7e55eb1e..0000000000
--- a/src/USER-CUDA/pair_lj_cut_coul_long_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/cut/coul/long/cuda,PairLJCutCoulLongCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H
-#define LMP_PAIR_LJ_CUT_COUL_LONG_CUDA_H
-
-#include "pair_lj_cut_coul_long.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCutCoulLongCuda : public PairLJCutCoulLong
-{
-        public:
-                PairLJCutCoulLongCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_cut_cuda.cpp b/src/USER-CUDA/pair_lj_cut_cuda.cpp
deleted file mode 100644
index a5d4f47a51..0000000000
--- a/src/USER-CUDA/pair_lj_cut_cuda.cpp
+++ /dev/null
@@ -1,179 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_cut_cuda.h"
-#include "pair_lj_cut_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCutCuda::PairLJCutCuda(LAMMPS *lmp) : PairLJCut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCutCuda::allocate()
-{
-        if(! allocated) PairLJCut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCuda::settings(int narg, char **arg)
-{
-        PairLJCut::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutCuda::coeff(int narg, char **arg)
-{
-        PairLJCut::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCutCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJCutCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-
-  cut_respa = NULL;
-  MYDBG(printf("# CUDA PairLJCutCuda::init_style end\n"); )
-}
-
-void PairLJCutCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCutCuda::init_list\n");)
-        PairLJCut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCutCuda::init_list end\n");)
-}
-
-void PairLJCutCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_cut_cuda.h b/src/USER-CUDA/pair_lj_cut_cuda.h
deleted file mode 100644
index f42c7d04c0..0000000000
--- a/src/USER-CUDA/pair_lj_cut_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/cut/cuda,PairLJCutCuda)
-
-#else
-
-#ifndef PAIR_LJ_CUT_CUDA_H
-#define PAIR_LJ_CUT_CUDA_H
-
-#include "pair_lj_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCutCuda : public PairLJCut
-{
-        public:
-                PairLJCutCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp b/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp
deleted file mode 100644
index f60aaa6f38..0000000000
--- a/src/USER-CUDA/pair_lj_cut_experimental_cuda.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_cut_experimental_cuda.h"
-#include "pair_lj_cut_experimental_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJCutExperimentalCuda::PairLJCutExperimentalCuda(LAMMPS *lmp) : PairLJCut(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJCutExperimentalCuda::allocate()
-{
-        if(! allocated) PairLJCut::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutExperimentalCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-        Cuda_PairLJCutExperimentalCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          CudaWrapper_Sync();
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
- }
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutExperimentalCuda::settings(int narg, char **arg)
-{
-        PairLJCut::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJCutExperimentalCuda::coeff(int narg, char **arg)
-{
-        PairLJCut::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJCutExperimentalCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-
-  cut_respa = NULL;
-  MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_style end\n"); )
-}
-
-void PairLJCutExperimentalCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list\n");)
-        PairLJCut::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJCutExperimentalCuda::init_list end\n");)
-}
-
-void PairLJCutExperimentalCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJCut::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_cut_experimental_cuda.h b/src/USER-CUDA/pair_lj_cut_experimental_cuda.h
deleted file mode 100644
index fafb2d63d4..0000000000
--- a/src/USER-CUDA/pair_lj_cut_experimental_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/cut/experimental/cuda,PairLJCutExperimentalCuda)
-
-#else
-
-#ifndef PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H
-#define PAIR_LJ_CUT_EXPERIMENTAL_CUDA_H
-
-#include "pair_lj_cut.h"
-
-namespace LAMMPS_NS {
-
-class PairLJCutExperimentalCuda : public PairLJCut
-{
-        public:
-                PairLJCutExperimentalCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_expand_cuda.cpp b/src/USER-CUDA/pair_lj_expand_cuda.cpp
deleted file mode 100644
index a102dea75a..0000000000
--- a/src/USER-CUDA/pair_lj_expand_cuda.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_expand_cuda.h"
-#include "pair_lj_expand_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJExpandCuda::PairLJExpandCuda(LAMMPS *lmp) : PairLJExpand(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJExpandCuda::allocate()
-{
-        if(! allocated) PairLJExpand::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.cutsq   = cutsq;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = shift;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJExpandCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJExpandCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJExpandCuda::settings(int narg, char **arg)
-{
-        PairLJExpand::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJExpandCuda::coeff(int narg, char **arg)
-{
-        PairLJExpand::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJExpandCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJExpandCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-
-  MYDBG(printf("# CUDA PairLJExpandCuda::init_style end\n"); )
-}
-
-void PairLJExpandCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJExpandCuda::init_list\n");)
-        PairLJExpand::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJExpandCuda::init_list end\n");)
-}
-
-void PairLJExpandCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJExpand::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_expand_cuda.h b/src/USER-CUDA/pair_lj_expand_cuda.h
deleted file mode 100644
index b61578c295..0000000000
--- a/src/USER-CUDA/pair_lj_expand_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/expand/cuda,PairLJExpandCuda)
-
-#else
-
-#ifndef PAIR_LJ_EXPAND_CUDA_H
-#define PAIR_LJ_EXPAND_CUDA_H
-
-#include "pair_lj_expand.h"
-
-namespace LAMMPS_NS {
-
-class PairLJExpandCuda : public PairLJExpand
-{
-        public:
-                PairLJExpandCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
deleted file mode 100644
index 73df6a66cb..0000000000
--- a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_gromacs_coul_gromacs_cuda.h"
-#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJGromacsCoulGromacsCuda::PairLJGromacsCoulGromacsCuda(LAMMPS *lmp) : PairLJGromacsCoulGromacs(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJGromacsCoulGromacsCuda::allocate()
-{
-        if(! allocated) PairLJGromacsCoulGromacs::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = ljsw1;
-                cuda->shared_data.pair.coeff6  = ljsw2;
-                cuda->shared_data.pair.coeff7  = ljsw3;
-                cuda->shared_data.pair.coeff8  = ljsw4;
-                cuda->shared_data.pair.coeff9  = ljsw5;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw5_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCoulGromacsCuda::compute(int eflag, int vflag)
-{
-          if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(eflag) cuda->cu_eng_coul->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJGromacsCoulGromacsCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom,cut_coul_inner,coulsw1,coulsw2,coulsw5);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCoulGromacsCuda::settings(int narg, char **arg)
-{
-        PairLJGromacsCoulGromacs::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (X_CFLOAT) cut_lj;
-        cuda->shared_data.pair.cut_coulsq_global = (X_CFLOAT) cut_coulsq;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_lj_inner;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCoulGromacsCuda::coeff(int narg, char **arg)
-{
-        PairLJGromacsCoulGromacs::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJGromacsCoulGromacsCuda::init_style()
-{
-  if (!atom->q_flag)
-    error->all(FLERR,"Pair style lj/gromacs/coul/gromacs requires atom attribute q");
-  // request regular or rRESPA neighbor lists
-
-        if(atom->molecular)
-        {
-          cuda->shared_data.pair.collect_forces_later = 1;
-        }
-
-  int irequest;
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-   if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
-    error->all(FLERR,"Pair inner cutoff >= Pair outer cutoff");
-
-  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
-  cut_ljsq = cut_lj * cut_lj;
-  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
-  cut_coulsq = cut_coul * cut_coul;
-  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
-
-
-  cut_coulsq = cut_coul * cut_coul;
-
-  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
-
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-}
-
-void PairLJGromacsCoulGromacsCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list\n");)
-        PairLJGromacsCoulGromacs::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJGromacsCoulGromacsCuda::init_list end\n");)
-}
-
-void PairLJGromacsCoulGromacsCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJGromacsCoulGromacs::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
deleted file mode 100644
index 6e48df1931..0000000000
--- a/src/USER-CUDA/pair_lj_gromacs_coul_gromacs_cuda.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/gromacs/coul/gromacs/cuda,PairLJGromacsCoulGromacsCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H
-#define LMP_PAIR_LJ_GROMACS_COUL_GROMACS_CUDA_H
-
-#include "pair_lj_gromacs_coul_gromacs.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJGromacsCoulGromacsCuda : public PairLJGromacsCoulGromacs
-{
-        public:
-                PairLJGromacsCoulGromacsCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw5_gm;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp b/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
deleted file mode 100644
index b2786d81ab..0000000000
--- a/src/USER-CUDA/pair_lj_gromacs_cuda.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_gromacs_cuda.h"
-#include "pair_lj_gromacs_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJGromacsCuda::PairLJGromacsCuda(LAMMPS *lmp) : PairLJGromacs(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJGromacsCuda::allocate()
-{
-        if(! allocated) PairLJGromacs::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut = cut;
-                cuda->shared_data.pair.cut_inner = cut_inner;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = ljsw1;
-                cuda->shared_data.pair.coeff6  = ljsw2;
-                cuda->shared_data.pair.coeff7  = ljsw3;
-                cuda->shared_data.pair.coeff8  = ljsw4;
-                cuda->shared_data.pair.coeff9  = ljsw5;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw5_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw5, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCuda::compute(int eflag, int vflag)
-{
-          if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJGromacsCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCuda::settings(int narg, char **arg)
-{
-        PairLJGromacs::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_inner_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJGromacsCuda::coeff(int narg, char **arg)
-{
-        PairLJGromacs::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJGromacsCuda::init_style()
-{
-  // request regular or rRESPA neighbor lists
-
-        if(atom->molecular)
-        {
-          cuda->shared_data.pair.collect_forces_later = 1;
-        }
-
-  int irequest;
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-
-
-}
-
-void PairLJGromacsCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJGromacsCuda::init_list\n");)
-        PairLJGromacs::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJGromacsCuda::init_list end\n");)
-}
-
-void PairLJGromacsCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJGromacs::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_gromacs_cuda.h b/src/USER-CUDA/pair_lj_gromacs_cuda.h
deleted file mode 100644
index b4bbc15c6d..0000000000
--- a/src/USER-CUDA/pair_lj_gromacs_cuda.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/gromacs/cuda,PairLJGromacsCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_GROMACS_CUDA_H
-#define LMP_PAIR_LJ_GROMACS_CUDA_H
-
-#include "pair_lj_gromacs.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJGromacsCuda : public PairLJGromacs
-{
-        public:
-                PairLJGromacsCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw5_gm;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.cpp b/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.cpp
deleted file mode 100644
index aca0f6d013..0000000000
--- a/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.cpp
+++ /dev/null
@@ -1,193 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_sdk_coul_long_cuda.h"
-#include "pair_lj_sdk_coul_long_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJSDKCoulLongCuda::PairLJSDKCoulLongCuda(LAMMPS *lmp) : PairLJSDKCoulLong(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        lj_type_double = NULL;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJSDKCoulLongCuda::allocate()
-{
-        if(! allocated) PairLJSDKCoulLong::allocate();
-        int n = atom->ntypes;
-        if(! allocated2)
-        {
-                allocated2 = true;
-
-
-                  memory->create(lj_type_double,n+1,n+1,"pairlj:ljtypedouble");
-
-                cuda->shared_data.pair.cut     = cut_lj;
-                cuda->shared_data.pair.cut_coul= NULL;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = lj_type_double;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-
-        }
-          for (int i = 1; i <= n; i++) {
-      for (int j = i; j <= n; j++) {
-        lj_type_double[i][j] = lj_type[i][j];
-        lj_type_double[j][i] = lj_type[i][j];
-      }
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCoulLongCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(eflag) cuda->cu_eng_coul->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJSDKCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(eflag) cuda->cu_eng_coul->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCoulLongCuda::settings(int narg, char **arg)
-{
-        PairLJSDKCoulLong::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_lj_global;
-        cuda->shared_data.pair.cut_coul_global = (F_CFLOAT) cut_coul;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCoulLongCuda::coeff(int narg, char **arg)
-{
-        PairLJSDKCoulLong::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJSDKCoulLongCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairLJSDKCoulLongCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-        irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-  g_ewald = force->kspace->g_ewald;
-  cuda->shared_data.pair.g_ewald=g_ewald;
-  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
-  if (force->newton) error->warning(FLERR,"Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
-  MYDBG(printf("# CUDA PairLJSDKCoulLongCuda::init_style end\n"); )
-}
-
-void PairLJSDKCoulLongCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJSDKCoulLongCuda::init_list\n");)
-        PairLJSDKCoulLong::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJSDKCoulLongCuda::init_list end\n");)
-}
-
-void PairLJSDKCoulLongCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJSDKCoulLong::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.h b/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.h
deleted file mode 100644
index 4b5c07c79d..0000000000
--- a/src/USER-CUDA/pair_lj_sdk_coul_long_cuda.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(cg/cmm/coul/long/cuda,PairLJSDKCoulLongCuda)
-PairStyle(lj/sdk/coul/long/cuda,PairLJSDKCoulLongCuda)
-
-#else
-
-#ifndef PAIR_LJ_SDK_COUL_LONG_CUDA_H
-#define PAIR_LJ_SDK_COUL_LONG_CUDA_H
-
-#include "pair_lj_sdk_coul_long.h"
-
-namespace LAMMPS_NS {
-
-class PairLJSDKCoulLongCuda : public PairLJSDKCoulLong
-{
-        public:
-                PairLJSDKCoulLongCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                double** lj_type_double;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_sdk_cuda.cpp b/src/USER-CUDA/pair_lj_sdk_cuda.cpp
deleted file mode 100644
index f6eba1ba7b..0000000000
--- a/src/USER-CUDA/pair_lj_sdk_cuda.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_sdk_cuda.h"
-#include "pair_lj_sdk_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJSDKCuda::PairLJSDKCuda(LAMMPS *lmp) : PairLJSDK(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        lj_type_double = NULL;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJSDKCuda::allocate()
-{
-        if(! allocated) PairLJSDK::allocate();
-        int n = atom->ntypes;
-        if(! allocated2)
-        {
-                allocated2 = true;
-
-
-                  memory->create(lj_type_double,n+1,n+1,"pairlj:ljtypedouble");
-
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = lj_type_double;
-            /*cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj_type_double_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj_type_double, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));*/
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-        }
-          for (int i = 1; i <= n; i++) {
-      for (int j = i; j <= n; j++) {
-        lj_type_double[i][j] = lj_type[i][j];
-        lj_type_double[j][i] = lj_type[i][j];
-      }
-    }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairLJSDKCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCuda::settings(int narg, char **arg)
-{
-        PairLJSDK::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSDKCuda::coeff(int narg, char **arg)
-{
-        PairLJSDK::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJSDKCuda::init_style()
-{
-  MYDBG(printf("# CUDA PairLJSDKCuda::init_style start\n"); )
-
-    int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-
-  MYDBG(printf("# CUDA PairLJSDKCuda::init_style end\n"); )
-}
-
-void PairLJSDKCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJSDKCuda::init_list\n");)
-        PairLJSDK::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJSDKCuda::init_list end\n");)
-}
-
-void PairLJSDKCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJSDK::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_sdk_cuda.h b/src/USER-CUDA/pair_lj_sdk_cuda.h
deleted file mode 100644
index 5e7807cbd7..0000000000
--- a/src/USER-CUDA/pair_lj_sdk_cuda.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/sdk/cuda,PairLJSDKCuda)
-PairStyle(cg/cmm/cuda,PairLJSDKCuda)
-
-#else
-
-#ifndef PAIR_LJ_SDK_CUDA_H
-#define PAIR_LJ_SDK_CUDA_H
-
-#include "pair_lj_sdk.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJSDKCuda : public PairLJSDK
-{
-        public:
-                PairLJSDKCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                double** lj_type_double;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj_type_double_gm;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.cpp b/src/USER-CUDA/pair_lj_smooth_cuda.cpp
deleted file mode 100644
index 3a51e94fef..0000000000
--- a/src/USER-CUDA/pair_lj_smooth_cuda.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Paul Crozier (SNL)
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_lj_smooth_cuda.h"
-#include "pair_lj_smooth_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "kspace.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairLJSmoothCuda::PairLJSmoothCuda(LAMMPS *lmp) : PairLJSmooth(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->shared_data.pair.use_block_per_atom = 0;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairLJSmoothCuda::allocate()
-{
-        if(! allocated) PairLJSmooth::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut = cut;
-                cuda->shared_data.pair.cut_inner = cut_inner;
-                cuda->shared_data.pair.coeff1  = lj1;
-                cuda->shared_data.pair.coeff2  = lj2;
-                cuda->shared_data.pair.coeff3  = lj3;
-                cuda->shared_data.pair.coeff4  = lj4;
-                cuda->shared_data.pair.coeff5  = ljsw1;
-                cuda->shared_data.pair.coeff6  = ljsw2;
-                cuda->shared_data.pair.coeff7  = ljsw3;
-                cuda->shared_data.pair.coeff8  = ljsw4;
-                cuda->shared_data.pair.coeff9  = ljsw0;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-            cu_lj1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_lj4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw0_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw0, &cuda->shared_data.pair.coeff9_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw1_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw1, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw2_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw2, &cuda->shared_data.pair.coeff6_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw3_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw3, &cuda->shared_data.pair.coeff7_gm, (atom->ntypes+1)*(atom->ntypes+1));
-            cu_ljsw4_gm = new cCudaData<double, F_CFLOAT, x> ((double*)ljsw4, &cuda->shared_data.pair.coeff8_gm, (atom->ntypes+1)*(atom->ntypes+1));
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSmoothCuda::compute(int eflag, int vflag)
-{
-          if (eflag || vflag) ev_setup(eflag,vflag);
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->upload();
-          if(vflag) cuda->cu_virial->upload();
-        }
-
-        Cuda_PairLJSmoothCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-        if(not cuda->shared_data.pair.collect_forces_later)
-        {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSmoothCuda::settings(int narg, char **arg)
-{
-        PairLJSmooth::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-        cuda->shared_data.pair.cut_inner_global = (F_CFLOAT) cut_inner_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairLJSmoothCuda::coeff(int narg, char **arg)
-{
-        PairLJSmooth::coeff(narg, arg);
-        allocate();
-}
-
-void PairLJSmoothCuda::init_style()
-{
-  // request regular or rRESPA neighbor lists
-
-        if(atom->molecular)
-        {
-          cuda->shared_data.pair.collect_forces_later = 1;
-        }
-
-  int irequest;
-
-  irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-
-
-
-}
-
-void PairLJSmoothCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairLJSmoothCuda::init_list\n");)
-        PairLJSmooth::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairLJSmoothCuda::init_list end\n");)
-}
-
-void PairLJSmoothCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairLJSmooth::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_lj_smooth_cuda.h b/src/USER-CUDA/pair_lj_smooth_cuda.h
deleted file mode 100644
index 0a57e6f663..0000000000
--- a/src/USER-CUDA/pair_lj_smooth_cuda.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(lj/smooth/cuda,PairLJSmoothCuda)
-
-#else
-
-#ifndef LMP_PAIR_LJ_SMOOTH_CUDA_H
-#define LMP_PAIR_LJ_SMOOTH_CUDA_H
-
-#include "pair_lj_smooth.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairLJSmoothCuda : public PairLJSmooth
-{
-        public:
-                PairLJSmoothCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_lj4_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw0_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw1_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw2_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw3_gm;
-                cCudaData<double  , F_CFLOAT , x >* cu_ljsw4_gm;
-
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_morse_cuda.cpp b/src/USER-CUDA/pair_morse_cuda.cpp
deleted file mode 100644
index a38712aabe..0000000000
--- a/src/USER-CUDA/pair_morse_cuda.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_morse_cuda.h"
-#include "pair_morse_cuda_cu.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-/* ---------------------------------------------------------------------- */
-
-PairMorseCuda::PairMorseCuda(LAMMPS *lmp) : PairMorse(lmp)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        cuda->shared_data.pair.cudable_force = 1;
-        cuda->setSystemParams();
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairMorseCuda::allocate()
-{
-        if(! allocated) PairMorse::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cut     = cut;
-                cuda->shared_data.pair.coeff1  = r0;
-                cuda->shared_data.pair.coeff2  = alpha;
-                cuda->shared_data.pair.coeff3  = morse1;
-                cuda->shared_data.pair.coeff4  = d0;
-                cuda->shared_data.pair.offset  = offset;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMorseCuda::compute(int eflag, int vflag)
-{
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairMorseCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
-
-    if(not cuda->shared_data.pair.collect_forces_later)
-    {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-    }
-
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMorseCuda::settings(int narg, char **arg)
-{
-        PairMorse::settings(narg, arg);
-        cuda->shared_data.pair.cut_global = (F_CFLOAT) cut_global;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMorseCuda::coeff(int narg, char **arg)
-{
-        PairMorse::coeff(narg, arg);
-        allocate();
-}
-
-void PairMorseCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairMorseCuda::init_style start\n"); )
-  // request regular or rRESPA neighbor lists
-
-  int irequest;
-
-  if (update->whichflag == 0 && strstr(update->integrate_style,"respa")) {
-
-  }
-  else
-  {
-    irequest = neighbor->request(this,instance_me);
-    neighbor->requests[irequest]->full = 1;
-    neighbor->requests[irequest]->half = 0;
-    neighbor->requests[irequest]->cudable = 1;
-    //neighbor->style=0; //0=NSQ neighboring
-  }
-
-
-  MYDBG(printf("# CUDA PairMorseCuda::init_style end\n"); )
-}
-
-void PairMorseCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairMorseCuda::init_list\n");)
-        PairMorse::init_list(id, ptr);
-        #ifndef CUDA_USE_BINNING
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        #endif
-        MYDBG(printf("# CUDA PairMorseCuda::init_list end\n");)
-}
-
-void PairMorseCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairMorse::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-
-}
diff --git a/src/USER-CUDA/pair_morse_cuda.h b/src/USER-CUDA/pair_morse_cuda.h
deleted file mode 100644
index f76e687527..0000000000
--- a/src/USER-CUDA/pair_morse_cuda.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(morse/cuda,PairMorseCuda)
-
-#else
-
-#ifndef PAIR_MORSE_CUDA_H
-#define PAIR_MORSE_CUDA_H
-
-#include "pair_morse.h"
-
-namespace LAMMPS_NS {
-
-class PairMorseCuda : public PairMorse
-{
-        public:
-                PairMorseCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-                class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_sw_cuda.cpp b/src/USER-CUDA/pair_sw_cuda.cpp
deleted file mode 100644
index 1dfccc4e7b..0000000000
--- a/src/USER-CUDA/pair_sw_cuda.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_sw_cuda.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-
-
-
-/* ---------------------------------------------------------------------- */
-
-PairSWCuda::PairSWCuda(LAMMPS *lmp) : PairSW(lmp)
-{
-  cuda = lmp->cuda;
-  if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        params_f = NULL;
-        cuda->setSystemParams();
-  cuda->shared_data.pair.cudable_force = 1;
-  cuda->shared_data.pair.override_block_per_atom = 0;
-  cuda->shared_data.pair.neighall = true;
-  init = false;
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairSWCuda::allocate()
-{
-        if(! allocated) PairSW::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cutsq   = cutsq;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairSWCuda::compute(int eflag, int vflag)
-{
-  if(!init) {Cuda_PairSWCuda_Init(&cuda->shared_data,params_f,map, &elem2param[0][0][0],nelements); init=true;}
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairSWCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);//,&elem2param[0][0][0],map
-  if(not cuda->shared_data.pair.collect_forces_later)
-  {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairSWCuda::settings(int narg, char **arg)
-{
-        PairSW::settings(narg, arg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairSWCuda::coeff(int narg, char **arg)
-{
-        PairSW::coeff(narg, arg);
-        allocate();
-  params_f = (ParamSW_Float *) memory->srealloc(params_f,maxparam*sizeof(ParamSW_Float),
-        "pair:params_f");
-  for(int i=0;i<maxparam;i++)
-  {
-    printf("%e %e\n",params[i].cut,params[i].cutsq);
-    params_f[i].cut = params[i].cut;
-    params_f[i].cutsq = params[i].cutsq;
-    params_f[i].c1 = params[i].c1;
-    params_f[i].c2 = params[i].c2;
-    params_f[i].c3 = params[i].c3;
-    params_f[i].c4 = params[i].c4;
-    params_f[i].c5 = params[i].c5;
-    params_f[i].c6 = params[i].c6;
-    params_f[i].ielement = params[i].ielement;
-    params_f[i].jelement = params[i].jelement;
-    params_f[i].kelement = params[i].kelement;
-    params_f[i].epsilon = params[i].epsilon;
-    params_f[i].sigma = params[i].sigma;
-    params_f[i].littlea = params[i].littlea;
-    params_f[i].lambda = params[i].lambda;
-    params_f[i].costheta = params[i].costheta;
-    params_f[i].tol = params[i].tol;
-    params_f[i].sigma_gamma = params[i].sigma_gamma;
-    params_f[i].lambda_epsilon = params[i].lambda_epsilon;
-    params_f[i].lambda_epsilon2 = params[i].lambda_epsilon2;
-    params_f[i].gamma = params[i].gamma;
-
-    params_f[i].biga = params[i].biga;
-    params_f[i].bigb = params[i].bigb;
-    params_f[i].gamma = params[i].gamma;
-    params_f[i].powerp = params[i].powerp;
-    params_f[i].powerq = params[i].powerq;
-  }
-  cuda->shared_data.pair.cut_global = cutmax;
-}
-
-void PairSWCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairSWCuda::init_style start\n"); )
-
-  int irequest;
-
-        irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-  neighbor->requests[irequest]->ghost = 1;
-
-
-  MYDBG(printf("# CUDA PairSWCuda::init_style end\n"); )
-}
-
-void PairSWCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairSWCuda::init_list\n");)
-        PairSW::init_list(id, ptr);
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        MYDBG(printf("# CUDA PairSWCuda::init_list end\n");)
-  cu_params_f = (ParamSW_Float*) CudaWrapper_AllocCudaData(sizeof(ParamSW_Float)*maxparam);
-  CudaWrapper_UploadCudaData((void*) params_f,(void*) cu_params_f,sizeof(ParamSW_Float)*maxparam);
-  cu_elem2param = new cCudaData<int, int, xyz > ((int*) elem2param, nelements,nelements,nelements);
-  cu_elem2param->upload();
-  cu_map = new cCudaData<int, int, x > ( map,atom->ntypes+1 );
-  cu_map->upload();
-}
-
-void PairSWCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairSW::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-}
diff --git a/src/USER-CUDA/pair_sw_cuda.h b/src/USER-CUDA/pair_sw_cuda.h
deleted file mode 100644
index c61f849a1f..0000000000
--- a/src/USER-CUDA/pair_sw_cuda.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(sw/cuda,PairSWCuda)
-
-#else
-
-#ifndef PAIR_SW_CUDA_H
-#define PAIR_SW_CUDA_H
-
-#include "pair_sw_cuda_cu.h"
-#include "pair_sw.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairSWCuda : public PairSW
-{
-        public:
-                PairSWCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-
-          class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                ParamSW_Float* params_f;
-                ParamSW_Float* cu_params_f;
-                cCudaData<int, int, xyz >* cu_elem2param;
-    cCudaData<int, int, x >* cu_map;
-    bool init;
-    bool iszbl;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_tersoff_cuda.cpp b/src/USER-CUDA/pair_tersoff_cuda.cpp
deleted file mode 100644
index f22b551284..0000000000
--- a/src/USER-CUDA/pair_tersoff_cuda.cpp
+++ /dev/null
@@ -1,204 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include "pair_tersoff_cuda.h"
-#include "cuda_data.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "cuda_neigh_list.h"
-#include "update.h"
-#include "integrate.h"
-#include "respa.h"
-#include "memory.h"
-#include "error.h"
-#include "user_cuda.h"
-
-using namespace LAMMPS_NS;
-
-
-
-
-/* ---------------------------------------------------------------------- */
-
-PairTersoffCuda::PairTersoffCuda(LAMMPS *lmp) : PairTersoff(lmp)
-{
-  cuda = lmp->cuda;
-  if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-        allocated2 = false;
-        params_f = NULL;
-        cuda->setSystemParams();
-  cuda->shared_data.pair.cudable_force = 1;
-  cuda->shared_data.pair.override_block_per_atom = 0;
-  cuda->shared_data.pair.neighall = true;
-  init = false;
-  iszbl = false;
-}
-
-/* ----------------------------------------------------------------------
-   remember pointer to arrays in cuda shared data
-------------------------------------------------------------------------- */
-
-void PairTersoffCuda::allocate()
-{
-        if(! allocated) PairTersoff::allocate();
-        if(! allocated2)
-        {
-                allocated2 = true;
-                cuda->shared_data.pair.cutsq   = cutsq;
-                cuda->shared_data.pair.special_lj  = force->special_lj;
-                cuda->shared_data.pair.special_coul  = force->special_coul;
-        }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairTersoffCuda::compute(int eflag, int vflag)
-{
-  if(!init) {Cuda_PairTersoffCuda_Init(&cuda->shared_data,params_f,map, &elem2param[0][0][0],nelements,iszbl); init=true;}
-        if (eflag || vflag) ev_setup(eflag,vflag);
-        if(eflag) cuda->cu_eng_vdwl->upload();
-        if(vflag) cuda->cu_virial->upload();
-
-        Cuda_PairTersoffCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);//,&elem2param[0][0][0],map
-  if(not cuda->shared_data.pair.collect_forces_later)
-  {
-          if(eflag) cuda->cu_eng_vdwl->download();
-          if(vflag) cuda->cu_virial->download();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairTersoffCuda::settings(int narg, char **arg)
-{
-        PairTersoff::settings(narg, arg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairTersoffCuda::coeff(int narg, char **arg)
-{
-        PairTersoff::coeff(narg, arg);
-        allocate();
-  params_f = (Param_Float *) memory->srealloc(params_f,maxparam*sizeof(Param_Float),
-        "pair:params_f");
-  for(int i=0;i<maxparam;i++)
-  {
-    params_f[i].lam1 = params[i].lam1;
-    params_f[i].lam2 = params[i].lam2;
-    params_f[i].lam3 = params[i].lam3;
-    params_f[i].c = params[i].c;
-    params_f[i].d = params[i].d;
-    params_f[i].h = params[i].h;
-    params_f[i].gamma = params[i].gamma;
-    params_f[i].powerm = params[i].powerm;
-    params_f[i].powern = params[i].powern;
-    params_f[i].beta = params[i].beta;
-    params_f[i].biga = params[i].biga;
-    params_f[i].bigb = params[i].bigb;
-    params_f[i].bigd = params[i].bigd;
-    params_f[i].bigr = params[i].bigr;
-    params_f[i].cut = params[i].cut;
-    params_f[i].cutsq = params[i].cutsq;
-    params_f[i].c1 = params[i].c1;
-    params_f[i].c2 = params[i].c2;
-    params_f[i].c3 = params[i].c3;
-    params_f[i].c4 = params[i].c4;
-    params_f[i].ielement = params[i].ielement;
-    params_f[i].jelement = params[i].jelement;
-    params_f[i].kelement = params[i].kelement;
-    params_f[i].powermint = params[i].powermint;
-  }
-  cuda->shared_data.pair.cut_global = cutmax;
-}
-
-void PairTersoffCuda::init_style()
-{
-        MYDBG(printf("# CUDA PairTersoffCuda::init_style start\n"); )
-
-  int irequest;
-
-        irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->full = 1;
-  neighbor->requests[irequest]->half = 0;
-  neighbor->requests[irequest]->cudable = 1;
-  neighbor->requests[irequest]->ghost = 1;
-
-
-  MYDBG(printf("# CUDA PairTersoffCuda::init_style end\n"); )
-}
-
-void PairTersoffCuda::init_list(int id, NeighList *ptr)
-{
-        MYDBG(printf("# CUDA PairTersoffCuda::init_list\n");)
-        PairTersoff::init_list(id, ptr);
-        // right now we can only handle verlet (id 0), not respa
-        if(id == 0) cuda_neigh_list = cuda->registerNeighborList(ptr);
-        // see Neighbor::init() for details on lammps lists' logic
-        MYDBG(printf("# CUDA PairTersoffCuda::init_list end\n");)
-  cu_params_f = (Param_Float*) CudaWrapper_AllocCudaData(sizeof(Param_Float)*maxparam);
-  CudaWrapper_UploadCudaData((void*) params_f,(void*) cu_params_f,sizeof(Param_Float)*maxparam);
-  cu_elem2param = new cCudaData<int, int, xyz > ((int*) elem2param, nelements,nelements,nelements);
-  cu_elem2param->upload();
-  cu_map = new cCudaData<int, int, x > ( map,atom->ntypes+1 );
-  cu_map->upload();
-}
-
-void PairTersoffCuda::ev_setup(int eflag, int vflag)
-{
-        int maxeatomold=maxeatom;
-        PairTersoff::ev_setup(eflag,vflag);
-
-  if (eflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_CFLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
-
-  if (vflag_atom && atom->nmax > maxeatomold)
-        {delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_CFLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
-}
diff --git a/src/USER-CUDA/pair_tersoff_cuda.h b/src/USER-CUDA/pair_tersoff_cuda.h
deleted file mode 100644
index 5b829114f2..0000000000
--- a/src/USER-CUDA/pair_tersoff_cuda.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(tersoff/cuda,PairTersoffCuda)
-
-#else
-
-#ifndef PAIR_TERSOFF_CUDA_H
-#define PAIR_TERSOFF_CUDA_H
-
-#include "pair_tersoff_cuda_cu.h"
-#include "pair_tersoff.h"
-#include "cuda_data.h"
-
-namespace LAMMPS_NS {
-
-class PairTersoffCuda : public PairTersoff
-{
-        public:
-                PairTersoffCuda(class LAMMPS *);
-                void compute(int, int);
-                void settings(int, char **);
-                void coeff(int, char **);
-                void init_list(int, class NeighList *);
-                void init_style();
-                void ev_setup(int eflag, int vflag);
-        protected:
-
-          class Cuda *cuda;
-                void allocate();
-                bool allocated2;
-                class CudaNeighList* cuda_neigh_list;
-                Param_Float* params_f;
-                Param_Float* cu_params_f;
-                cCudaData<int, int, xyz >* cu_elem2param;
-    cCudaData<int, int, x >* cu_map;
-    bool init;
-    bool iszbl;
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pair_tersoff_zbl_cuda.cpp b/src/USER-CUDA/pair_tersoff_zbl_cuda.cpp
deleted file mode 100644
index 91dcf6189e..0000000000
--- a/src/USER-CUDA/pair_tersoff_zbl_cuda.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing author: Aidan Thompson (SNL) - original Tersoff implementation
-                        David Farrell (NWU) - ZBL addition
-------------------------------------------------------------------------- */
-
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include "pair_tersoff_zbl_cuda.h"
-#include "atom.h"
-#include "update.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-#include "force.h"
-#include "comm.h"
-#include "memory.h"
-#include "error.h"
-#include "math_const.h"
-
-using namespace LAMMPS_NS;
-using namespace MathConst;
-
-#define MAXLINE 1024
-#define DELTA 4
-
-/* ---------------------------------------------------------------------- */
-
-PairTersoffZBLCuda::PairTersoffZBLCuda(LAMMPS *lmp) : PairTersoffCuda(lmp)
-{
-  // hard-wired constants in metal or real units
-  // a0 = Bohr radius
-  // epsilon0 = permittivity of vacuum = q / energy-distance units
-  // e = unit charge
-  // 1 Kcal/mole = 0.043365121 eV
-
-  if (strcmp(update->unit_style,"metal") == 0) {
-    global_a_0 = 0.529;
-    global_epsilon_0 = 0.00552635;
-    global_e = 1.0;
-  } else if (strcmp(update->unit_style,"real") == 0) {
-    global_a_0 = 0.529;
-    global_epsilon_0 = 0.00552635 * 0.043365121;
-    global_e = 1.0;
-  } else error->all(FLERR,"Pair tersoff/zbl requires metal or real units");
-  iszbl = true;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairTersoffZBLCuda::read_file(char *file)
-{
-  int params_per_line = 21;
-  char **words = new char*[params_per_line+1];
-
-  delete [] params;
-  params = NULL;
-  nparams = 0;
-
-  // open file on proc 0
-
-  FILE *fp;
-  if (comm->me == 0) {
-    fp = fopen(file,"r");
-    if (fp == NULL) {
-      char str[128];
-      sprintf(str,"Cannot open Tersoff potential file %s",file);
-      error->one(FLERR,str);
-    }
-  }
-
-  // read each line out of file, skipping blank lines or leading '#'
-  // store line of params if all 3 element tags are in element list
-
-  int n,nwords,ielement,jelement,kelement;
-  char line[MAXLINE],*ptr;
-  int eof = 0;
-
-  while (1) {
-    if (comm->me == 0) {
-      ptr = fgets(line,MAXLINE,fp);
-      if (ptr == NULL) {
-        eof = 1;
-        fclose(fp);
-      } else n = strlen(line) + 1;
-    }
-    MPI_Bcast(&eof,1,MPI_INT,0,world);
-    if (eof) break;
-    MPI_Bcast(&n,1,MPI_INT,0,world);
-    MPI_Bcast(line,n,MPI_CHAR,0,world);
-
-    // strip comment, skip line if blank
-
-    if ((ptr = strchr(line,'#'))) *ptr = '\0';
-    nwords = atom->count_words(line);
-    if (nwords == 0) continue;
-
-    // concatenate additional lines until have params_per_line words
-
-    while (nwords < params_per_line) {
-      n = strlen(line);
-      if (comm->me == 0) {
-        ptr = fgets(&line[n],MAXLINE-n,fp);
-        if (ptr == NULL) {
-          eof = 1;
-          fclose(fp);
-        } else n = strlen(line) + 1;
-      }
-      MPI_Bcast(&eof,1,MPI_INT,0,world);
-      if (eof) break;
-      MPI_Bcast(&n,1,MPI_INT,0,world);
-      MPI_Bcast(line,n,MPI_CHAR,0,world);
-      if ((ptr = strchr(line,'#'))) *ptr = '\0';
-      nwords = atom->count_words(line);
-    }
-
-    if (nwords != params_per_line)
-      error->all(FLERR,"Incorrect format in Tersoff potential file");
-
-    // words = ptrs to all words in line
-
-    nwords = 0;
-    words[nwords++] = strtok(line," \t\n\r\f");
-    while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
-
-    // ielement,jelement,kelement = 1st args
-    // if all 3 args are in element list, then parse this line
-    // else skip to next line
-
-    for (ielement = 0; ielement < nelements; ielement++)
-      if (strcmp(words[0],elements[ielement]) == 0) break;
-    if (ielement == nelements) continue;
-    for (jelement = 0; jelement < nelements; jelement++)
-      if (strcmp(words[1],elements[jelement]) == 0) break;
-    if (jelement == nelements) continue;
-    for (kelement = 0; kelement < nelements; kelement++)
-      if (strcmp(words[2],elements[kelement]) == 0) break;
-    if (kelement == nelements) continue;
-
-    // load up parameter settings and error check their values
-
-    if (nparams == maxparam) {
-      maxparam += DELTA;
-      params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),
-                                          "pair:params");
-    }
-
-    params[nparams].ielement = ielement;
-    params[nparams].jelement = jelement;
-    params[nparams].kelement = kelement;
-    params[nparams].powerm = atof(words[3]);
-    params[nparams].gamma = atof(words[4]);
-    params[nparams].lam3 = atof(words[5]);
-    params[nparams].c = atof(words[6]);
-    params[nparams].d = atof(words[7]);
-    params[nparams].h = atof(words[8]);
-    params[nparams].powern = atof(words[9]);
-    params[nparams].beta = atof(words[10]);
-    params[nparams].lam2 = atof(words[11]);
-    params[nparams].bigb = atof(words[12]);
-    params[nparams].bigr = atof(words[13]);
-    params[nparams].bigd = atof(words[14]);
-    params[nparams].lam1 = atof(words[15]);
-    params[nparams].biga = atof(words[16]);
-    params[nparams].Z_i = atof(words[17]);
-    params[nparams].Z_j = atof(words[18]);
-    params[nparams].ZBLcut = atof(words[19]);
-    params[nparams].ZBLexpscale = atof(words[20]);
-
-    // currently only allow m exponent of 1 or 3
-
-    params[nparams].powermint = int(params[nparams].powerm);
-
-    if (
-        params[nparams].lam3 < 0.0 || params[nparams].c < 0.0 ||
-        params[nparams].d < 0.0 || params[nparams].powern < 0.0 ||
-        params[nparams].beta < 0.0 || params[nparams].lam2 < 0.0 ||
-        params[nparams].bigb < 0.0 || params[nparams].bigr < 0.0 ||
-        params[nparams].bigd < 0.0 ||
-        params[nparams].bigd > params[nparams].bigr ||
-        params[nparams].lam3 < 0.0 || params[nparams].biga < 0.0 ||
-        params[nparams].powerm - params[nparams].powermint != 0.0 ||
-        (params[nparams].powermint != 3 && params[nparams].powermint != 1) ||
-        params[nparams].gamma < 0.0 ||
-        params[nparams].Z_i < 1.0 || params[nparams].Z_j < 1.0 ||
-        params[nparams].ZBLcut < 0.0 || params[nparams].ZBLexpscale < 0.0)
-      error->all(FLERR,"Illegal Tersoff parameter");
-
-    nparams++;
-  }
-
-  delete [] words;
-}
-
-void PairTersoffZBLCuda::coeff(int narg, char **arg)
-{
-  PairTersoffCuda::coeff(narg, arg);
-  for(int i=0;i<maxparam;i++)
-  {
-    params_f[i].a_ij = (0.8854*global_a_0) /
-        (pow(params[i].Z_i,0.23) + pow(params[i].Z_j,0.23));
-    params_f[i].premult = (params[i].Z_i * params[i].Z_j * pow(global_e,2.0))/(4.0*MY_PI*global_epsilon_0);
-    params_f[i].ZBLcut = params[i].ZBLcut;
-    params_f[i].ZBLexpscale = params[i].ZBLexpscale;
-  }
-}
diff --git a/src/USER-CUDA/pair_tersoff_zbl_cuda.h b/src/USER-CUDA/pair_tersoff_zbl_cuda.h
deleted file mode 100644
index 4b935b86b2..0000000000
--- a/src/USER-CUDA/pair_tersoff_zbl_cuda.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(tersoff/zbl/cuda,PairTersoffZBLCuda)
-
-#else
-
-#ifndef PAIR_TERSOFF_ZBL_CUDA_H
-#define PAIR_TERSOFF_ZBL_CUDA_H
-
-#include "pair_tersoff_cuda.h"
-
-namespace LAMMPS_NS {
-
-class PairTersoffZBLCuda : public PairTersoffCuda
-{
-        public:
-                PairTersoffZBLCuda(class LAMMPS *);
-         private:
-          double global_a_0;    // Bohr radius for Coulomb repulsion
-          double global_epsilon_0;  // permittivity of vacuum for Coulomb repulsion
-          double global_e;    // proton charge (negative of electron charge)
-
-          void read_file(char *);
-          void coeff(int narg, char **arg);
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pppm_cuda.cpp b/src/USER-CUDA/pppm_cuda.cpp
deleted file mode 100644
index b322c9dd17..0000000000
--- a/src/USER-CUDA/pppm_cuda.cpp
+++ /dev/null
@@ -1,1420 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <cstring>
-#include <cstdio>
-#include <cstdlib>
-#include <cmath>
-#include "pppm_cuda.h"
-#include "atom.h"
-#include "comm.h"
-#include "neighbor.h"
-#include "force.h"
-#include "fft3d_wrap_cuda.h" // has to come before pair.h to avoid clash with kokkos
-#include "pair.h"
-#include "bond.h"
-#include "angle.h"
-#include "domain.h"
-#include "remap_wrap.h"
-#include "memory.h"
-#include "error.h"
-#include "update.h"
-#include <ctime> //crmadd
-#include "cuda_wrapper_cu.h"
-#include "pppm_cuda_cu.h"
-#include "user_cuda.h"
-#include "math_const.h"
-
-using namespace LAMMPS_NS;
-using namespace MathConst;
-
-#define MAXORDER 7
-#define OFFSET 4096
-#define SMALL 0.00001
-#define LARGE 10000.0
-#define EPS_HOC 1.0e-7
-
-void printArray(double* data,int nx, int ny, int nz)
-{
-  for(int i=0;i<nz;i++)
-  for(int j=0;j<ny;j++)
-  {
-          printf("%i %i\n",i,j);
-          for(int k=0;k<nx;k++)
-          printf("%e ",data[2*(i*ny*nx+j*nx+k)]);
-          printf("\n\n");
-  }
-}
-void printArray(double*** data,int nx, int ny, int nz)
-{
-  for(int i=0;i<nx;i++)
-  for(int j=0;j<ny;j++)
-  {
-          printf("%i %i\n",i,j);
-          for(int k=0;k<nz;k++)
-          printf("%e ",data[i][j][k]);
-          printf("\n\n");
-  }
-}
-/* ---------------------------------------------------------------------- */
-
-PPPMCuda::PPPMCuda(LAMMPS *lmp, int narg, char **arg) :
-  PPPMOld(lmp, (narg==2?1:narg), arg)
-{
-  cuda = lmp->cuda;
-   if(cuda == NULL)
-        error->all(FLERR,"You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-
-  if ((narg > 3)||(narg<1)) error->all(FLERR,"Illegal kspace_style pppm/cuda command");
-  #ifndef FFT_CUFFT
-  error->all(FLERR,"Using kspace_style pppm/cuda without cufft is not possible. Compile with cufft=1 to include cufft. Aborting.");
-  #endif
-
-  triclinic_support = 0;
-  accuracy_relative = fabs(force->numeric(FLERR,arg[0]));
-
-  nfactors = 3;
-  factors = new int[nfactors];
-  factors[0] = 2;
-  factors[1] = 3;
-  factors[2] = 5;
-
-  MPI_Comm_rank(world,&me);
-  MPI_Comm_size(world,&nprocs);
-
-  density_brick = vdx_brick = vdy_brick = vdz_brick = vdx_brick_tmp = NULL;
-  density_fft = NULL;
-  greensfn = NULL;
-  work1 = work2 = NULL;
-  vg = NULL;
-  fkx = fky = fkz = NULL;
-  buf1 = buf2 = NULL;
-
-  gf_b = NULL;
-  rho1d = rho_coeff = NULL;
-
-  fft1c = fft2c = NULL;
-  remap = NULL;
-
-  density_brick_int=NULL;
-  density_intScale=1000000;
-  cu_vdx_brick = cu_vdy_brick = cu_vdz_brick = NULL;
-  cu_density_brick = NULL;
-  cu_density_brick_int = NULL;
-  cu_density_fft = NULL;
-  cu_energy=NULL;
-  cu_greensfn = NULL;
-  cu_work1 = cu_work2 = cu_work3 = NULL;
-  cu_vg = NULL;
-  cu_fkx = cu_fky = cu_fkz = NULL;
-
-  cu_flag = NULL;
-  cu_debugdata = NULL;
-  cu_rho_coeff = NULL;
-  cu_virial = NULL;
-
-  cu_gf_b = NULL;
-
-  cu_slabbuf = NULL;
-  slabbuf = NULL;
-
-  nmax = 0;
-  part2grid = NULL;
-  cu_part2grid = NULL;
-  adev_data_array=NULL;
-  poissontime=0;
-  old_nmax=0;
-  cu_pppm_grid_n=NULL;
-  cu_pppm_grid_ids=NULL;
-
-  pppm_grid_nmax=0;
-  pppm2partgrid=new int[3];
-  pppm_grid=new int[3];
-  firstpass=true;
-  scale = 1.0;
-}
-
-
-/* ----------------------------------------------------------------------
-   free all memory
-------------------------------------------------------------------------- */
-
-PPPMCuda::~PPPMCuda()
-{
-  delete [] slabbuf;
-  delete cu_slabbuf;
-
-  delete [] factors;
-  factors=NULL;
-  deallocate();
-  delete cu_part2grid;
-  cu_part2grid=NULL;
-  memory->destroy(part2grid);
-  part2grid = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   called once before run
-------------------------------------------------------------------------- */
-
-void PPPMCuda::init()
-{
-  cuda->shared_data.pppm.cudable_force=1;
-
-    //if(cuda->finished_run) {PPPM::init(); return;}
-
-  if (me == 0) {
-    if (screen) fprintf(screen,"PPPMCuda initialization ...\n");
-    if (logfile) fprintf(logfile,"PPPMCuda initialization ...\n");
-  }
-
-  // error check
-
-  if (domain->dimension == 2) error->all(FLERR,"Cannot use PPPMCuda with 2d simulation");
-  if (comm->style != 0)
-    error->universe_all(FLERR,"PPPMCuda can only currently be used with "
-                        "comm_style brick");
-
-  if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
-
-  if (slabflag == 0 && domain->nonperiodic > 0)
-    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMCuda");
-  if (slabflag == 1) {
-    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
-        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
-      error->all(FLERR,"Incorrect boundaries with slab PPPMCuda");
-  }
-
-  if (order < 2 || order > MAXORDER) {
-    char str[128];
-    sprintf(str,"PPPMCuda order cannot be smaller than 2 or greater than %d",MAXORDER);
-    error->all(FLERR,str);
-  }
-  // free all arrays previously allocated
-
-  deallocate();
-
-  // extract short-range Coulombic cutoff from pair style
-
-  triclinic_check();
-
-  if (force->pair == NULL)
-    error->all(FLERR,"KSpace style is incompatible with Pair style");
-  int itmp=0;
-  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
-  if (p_cutoff == NULL)
-    error->all(FLERR,"KSpace style is incompatible with Pair style");
-  cutoff = *p_cutoff;
-
-  // if kspace is TIP4P, extract TIP4P params from pair style
-
-  qdist = 0.0;
-
-  if (strcmp(force->kspace_style,"pppm/tip4p") == 0) {
-    if (force->pair == NULL)
-      error->all(FLERR,"KSpace style is incompatible with Pair style");
-    double *p_qdist = (double *) force->pair->extract("qdist",itmp);
-    int *p_typeO = (int *) force->pair->extract("typeO",itmp);
-    int *p_typeH = (int *) force->pair->extract("typeH",itmp);
-    int *p_typeA = (int *) force->pair->extract("typeA",itmp);
-    int *p_typeB = (int *) force->pair->extract("typeB",itmp);
-    if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB)
-      error->all(FLERR,"KSpace style is incompatible with Pair style");
-    qdist = *p_qdist;
-    typeO = *p_typeO;
-    typeH = *p_typeH;
-    int typeA = *p_typeA;
-    int typeB = *p_typeB;
-
-    if (force->angle == NULL || force->bond == NULL)
-      error->all(FLERR,"Bond and angle potentials must be defined for TIP4P");
-    double theta = force->angle->equilibrium_angle(typeA);
-    double blen = force->bond->equilibrium_distance(typeB);
-    alpha = qdist / (2.0 * cos(0.5*theta) * blen);
-  }
-
-  // compute qsum & qsqsum and warn if not charge-neutral
-
-  scale = 1.0;
-  qqrd2e = force->qqrd2e;
-  qsum_qsq();
-  natoms_original = atom->natoms;
-
-  // set accuracy (force units) from accuracy_relative or accuracy_absolute
-
-  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
-  else accuracy = accuracy_relative * two_charge_force;
-
-  // setup FFT grid resolution and g_ewald
-  // normally one iteration thru while loop is all that is required
-  // if grid stencil extends beyond neighbor proc, reduce order and try again
-
-  int iteration = 0;
-
-  while (order > 1) {
-    if (iteration && me == 0)
-      error->warning(FLERR,"Reducing PPPMCuda order b/c stencil extends "
-                     "beyond neighbor processor");
-    iteration++;
-
-    set_grid();
-
-    if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
-      error->all(FLERR,"PPPMCuda grid is too large");
-
-    // global indices of PPPMCuda grid range from 0 to N-1
-    // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
-    //   global PPPMCuda grid that I own without ghost cells
-    // for slab PPPMCuda, assign z grid as if it were not extended
-
-    nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0];
-    nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1;
-    nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1];
-    nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1;
-    nzlo_in = comm->myloc[2] *
-      (static_cast<int> (nz_pppm/slab_volfactor)) / comm->procgrid[2];
-    nzhi_in = (comm->myloc[2]+1) *
-      (static_cast<int> (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1;
-
-    // nlower,nupper = stencil size for mapping particles to PPPMCuda grid
-
-    nlower = -(order-1)/2;
-    nupper = order/2;
-
-    // shift values for particle <-> grid mapping
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    if (order % 2) shift = OFFSET + 0.5;
-    else shift = OFFSET;
-    if (order % 2) shiftone = 0.0;
-    else shiftone = 0.5;
-
-    // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
-    //   global PPPMCuda grid that my particles can contribute charge to
-    // effectively nlo_in,nhi_in + ghost cells
-    // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
-    //           position a particle in my box can be at
-    // dist[3] = particle position bound = subbox + skin/2.0 + qdist
-    //   qdist = offset due to TIP4P fictitious charge
-    //   convert to triclinic if necessary
-    // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
-    // for slab PPPMCuda, assign z grid as if it were not extended
-
-
-    triclinic = domain->triclinic;
-    double *prd,*sublo,*subhi;
-
-    if (triclinic == 0) {
-      prd = domain->prd;
-      boxlo = domain->boxlo;
-      sublo = domain->sublo;
-      subhi = domain->subhi;
-    } else {
-      prd = domain->prd_lamda;
-      boxlo = domain->boxlo_lamda;
-      sublo = domain->sublo_lamda;
-      subhi = domain->subhi_lamda;
-    }
-
-    double xprd = prd[0];
-    double yprd = prd[1];
-    double zprd = prd[2];
-    double zprd_slab = zprd*slab_volfactor;
-
-    double dist[3];
-    double cuthalf = 0.5*neighbor->skin + qdist;
-    if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
-    else {
-      dist[0] = cuthalf/domain->prd[0];
-      dist[1] = cuthalf/domain->prd[1];
-      dist[2] = cuthalf/domain->prd[2];
-    }
-
-    int nlo,nhi;
-
-    nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
-                            nx_pppm/xprd + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
-                            nx_pppm/xprd + shift) - OFFSET;
-    nxlo_out = nlo + nlower;
-    nxhi_out = nhi + nupper;
-
-    nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
-                            ny_pppm/yprd + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
-                            ny_pppm/yprd + shift) - OFFSET;
-    nylo_out = nlo + nlower;
-    nyhi_out = nhi + nupper;
-
-    nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
-                            nz_pppm/zprd_slab + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
-                            nz_pppm/zprd_slab + shift) - OFFSET;
-    nzlo_out = nlo + nlower;
-    nzhi_out = nhi + nupper;
-
-    // for slab PPPMCuda, change the grid boundary for processors at +z end
-    //   to include the empty volume between periodically repeating slabs
-    // for slab PPPMCuda, want charge data communicated from -z proc to +z proc,
-    //   but not vice versa, also want field data communicated from +z proc to
-    //   -z proc, but not vice versa
-    // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
-
-    if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) {
-      nzhi_in =  nz_pppm - 1;
-      nzhi_out = nz_pppm - 1;
-    }
-
-    // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions
-    //   that overlay domain I own
-    // proc in that direction tells me via sendrecv()
-    // if no neighbor proc, value is from self since I have ghosts regardless
-
-    int nplanes;
-
-    nplanes = nxlo_in - nxlo_out;
-    if (comm->procneigh[0][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0,
-                   &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nxhi_ghost = nplanes;
-
-    nplanes = nxhi_out - nxhi_in;
-    if (comm->procneigh[0][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0,
-                   &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0],
-                   0,world,MPI_STATUS_IGNORE);
-    else nxlo_ghost = nplanes;
-
-    nplanes = nylo_in - nylo_out;
-    if (comm->procneigh[1][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0,
-                   &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nyhi_ghost = nplanes;
-
-    nplanes = nyhi_out - nyhi_in;
-    if (comm->procneigh[1][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0,
-                   &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0,
-                   world,MPI_STATUS_IGNORE);
-    else nylo_ghost = nplanes;
-
-    nplanes = nzlo_in - nzlo_out;
-    if (comm->procneigh[2][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0,
-                   &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nzhi_ghost = nplanes;
-
-    nplanes = nzhi_out - nzhi_in;
-    if (comm->procneigh[2][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0,
-                   &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0,
-                   world,MPI_STATUS_IGNORE);
-    else nzlo_ghost = nplanes;
-
-    // test that ghost overlap is not bigger than my sub-domain
-
-    int flag = 0;
-    if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1;
-    if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1;
-    if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1;
-    if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1;
-    if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1;
-    if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1;
-
-    int flag_all;
-    MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
-
-    if (flag_all == 0) break;
-    order--;
-  }
-
-  if (order == 0) error->all(FLERR,"PPPMCuda order has been reduced to 0");
-
-  // decomposition of FFT mesh
-  // global indices range from 0 to N-1
-  // proc owns entire x-dimension, clump of columns in y,z dimensions
-  // npey_fft,npez_fft = # of procs in y,z dims
-  // if nprocs is small enough, proc can own 1 or more entire xy planes,
-  //   else proc owns 2d sub-blocks of yz plane
-  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
-  // nlo_fft,nhi_fft = lower/upper limit of the section
-  //   of the global FFT mesh that I own
-
-  int npey_fft,npez_fft;
-  if (nz_pppm >= nprocs) {
-    npey_fft = 1;
-    npez_fft = nprocs;
-  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
-
-  int me_y = me % npey_fft;
-  int me_z = me / npey_fft;
-
-  nxlo_fft = 0;
-  nxhi_fft = nx_pppm - 1;
-  nylo_fft = me_y*ny_pppm/npey_fft;
-  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
-  nzlo_fft = me_z*nz_pppm/npez_fft;
-  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
-
-  // PPPMCuda grid for this proc, including ghosts
-
-  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
-    (nzhi_out-nzlo_out+1);
-
-  // FFT arrays on this proc, without ghosts
-  // nfft = FFT points in FFT decomposition on this proc
-  // nfft_brick = FFT points in 3d brick-decomposition on this proc
-  // nfft_both = greater of 2 values
-
-  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
-    (nzhi_fft-nzlo_fft+1);
-  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
-    (nzhi_in-nzlo_in+1);
-  nfft_both = MAX(nfft,nfft_brick);
-
-  // buffer space for use in brick2fft and fillbrick
-  // idel = max # of ghost planes to send or recv in +/- dir of each dim
-  // nx,ny,nz = owned planes (including ghosts) in each dim
-  // nxx,nyy,nzz = max # of grid cells to send in each dim
-  // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick
-
-  int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz;
-
-  idelx = MAX(nxlo_ghost,nxhi_ghost);
-  idelx = MAX(idelx,nxhi_out-nxhi_in);
-  idelx = MAX(idelx,nxlo_in-nxlo_out);
-
-  idely = MAX(nylo_ghost,nyhi_ghost);
-  idely = MAX(idely,nyhi_out-nyhi_in);
-  idely = MAX(idely,nylo_in-nylo_out);
-
-  idelz = MAX(nzlo_ghost,nzhi_ghost);
-  idelz = MAX(idelz,nzhi_out-nzhi_in);
-  idelz = MAX(idelz,nzlo_in-nzlo_out);
-
-  nx = nxhi_out - nxlo_out + 1;
-  ny = nyhi_out - nylo_out + 1;
-  nz = nzhi_out - nzlo_out + 1;
-
-  nxx = idelx * ny * nz;
-  nyy = idely * nx * nz;
-  nzz = idelz * nx * ny;
-
-  nbuf = MAX(nxx,nyy);
-  nbuf = MAX(nbuf,nzz);
-  nbuf *= 3;
-
-  // print stats
-
-  int ngrid_max,nfft_both_max,nbuf_max;
-  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
-  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
-  MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world);
-
-  if (me == 0) {
-    if (screen) fprintf(screen,"  brick FFT buffer size/proc = %d %d %d\n",
-                        ngrid_max,nfft_both_max,nbuf_max);
-    if (logfile) fprintf(logfile,"  brick FFT buffer size/proc = %d %d %d\n",
-                         ngrid_max,nfft_both_max,nbuf_max);
-  }
-  cuda_shared_pppm* ap=&(cuda->shared_data.pppm);
-
-   ap->density_intScale=density_intScale;
-   ap->nxlo_in=nxlo_in;
-   ap->nxhi_in=nxhi_in;
-   ap->nxlo_out=nxlo_out;
-   ap->nxhi_out=nxhi_out;
-   ap->nylo_in=nylo_in;
-   ap->nyhi_in=nyhi_in;
-   ap->nylo_out=nylo_out;
-   ap->nyhi_out=nyhi_out;
-   ap->nzlo_in=nzlo_in;
-   ap->nzhi_in=nzhi_in;
-   ap->nzlo_out=nzlo_out;
-   ap->nzhi_out=nzhi_out;
-   ap->nxlo_in=nxlo_fft;
-   ap->nxhi_in=nxhi_fft;
-   ap->nylo_in=nylo_fft;
-   ap->nyhi_in=nyhi_fft;
-   ap->nzlo_in=nzlo_fft;
-   ap->nzhi_in=nzhi_fft;
-   ap->nx_pppm=nx_pppm;
-   ap->ny_pppm=ny_pppm;
-   ap->nz_pppm=nz_pppm;
-   ap->qqrd2e=qqrd2e;
-   ap->order=order;
-   ap->nmax=nmax;
-   ap->nlocal=atom->nlocal;
-   ap->delxinv=delxinv;
-   ap->delyinv=delyinv;
-   ap->delzinv=delzinv;
-   ap->nlower=nlower;
-   ap->nupper=nupper;
-   ap->shiftone=shiftone;
-
-  // allocate K-space dependent memory
-
-
-  allocate();
-
-  // pre-compute Green's function denomiator expansion
-  // pre-compute 1d charge distribution coefficients
-
-  compute_gf_denom();
-  compute_rho_coeff();
-}
-
-/* ----------------------------------------------------------------------
-   refresh all volume-dependent PPPMCuda coefficients
-   called initially and whenever the box volume has changed
-------------------------------------------------------------------------- */
-
-void PPPMCuda::setup()
-{
-  cu_gf_b->upload();
-
-  // box lengths: prd for an orthogonal box, prd_lamda for a triclinic one
-
-  double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
-
-  // volume-dependent factors
-  // z length is scaled by slab_volfactor for 2d slab PPPMCuda
-  // for 3d PPPMCuda slab_volfactor = 1.0, so zprd_slab equals zprd
-
-  double xprd = prd[0];
-  double yprd = prd[1];
-  double zprd_slab = prd[2]*slab_volfactor;
-  volume = xprd * yprd * zprd_slab;
-
-  // inverse grid spacings and their product
-
-  delxinv = nx_pppm/xprd;
-  delyinv = ny_pppm/yprd;
-  delzinv = nz_pppm/zprd_slab;
-  delvolinv = delxinv*delyinv*delzinv;
-
-  // reciprocal-space units along each box direction
-
-  double unitkx = (2.0*MY_PI/xprd);
-  double unitky = (2.0*MY_PI/yprd);
-  double unitkz = (2.0*MY_PI/zprd_slab);
-
-  // fkx,fky,fkz for my FFT grid pts
-
-  Cuda_PPPM_Setup_fkxyz_vg(nx_pppm,ny_pppm,nz_pppm,
-                           unitkx,unitky,unitkz,g_ewald);
-
-  // modified (Hockney-Eastwood) Coulomb Green's function
-  // the EPS_HOC-dependent factor is the same for all three directions
-
-  double hoc_scale = pow(-log(EPS_HOC),0.25);
-  int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) * hoc_scale);
-  int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) * hoc_scale);
-  int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * hoc_scale);
-
-  Cuda_PPPM_setup_greensfn(nx_pppm,ny_pppm,nz_pppm,
-                           unitkx,unitky,unitkz,g_ewald,
-                           nbx,nby,nbz,xprd,yprd,zprd_slab);
-
-  // push remaining host data to the device and clear the density grid
-
-#ifdef FFT_CUFFT
-  cu_vdx_brick->upload();
-  cu_vdy_brick->upload();
-  cu_vdz_brick->upload();
-#endif
-  cu_rho_coeff->upload();
-  cu_density_brick->memset_device(0);
-
-  pppm_device_init_setup(&cuda->shared_data,shiftone,
-                         delxinv,delyinv,delzinv,nlower,nupper);
-}
-
-/* ----------------------------------------------------------------------
-   compute the PPPMCuda long-range force, energy, virial
-   eflag: nonzero => clear the device energy buffer, then reduce and
-          scale the energy across procs
-   vflag: nonzero => clear host and device virial, then reduce and scale
-          the 6 virial components across procs
-   every stage is wall-clock timed into cuda->shared_data.cuda_timings
-------------------------------------------------------------------------- */
-
-void PPPMCuda::compute(int eflag, int vflag)
-{
-  cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom; // update_nmax, update_nlocal and nlocal are read from here
-
-  int i;
-  my_times starttime;  // start of the stage currently being timed
-  my_times endtime;    // end of the stage currently being timed
-  my_times starttotal; // start of the pppm_compute interval
-  my_times endtotal;   // end of the pppm_compute interval
-  // convert atoms from box to lamda coords
-
-  if (triclinic == 0) boxlo = domain->boxlo;
-  else {
-    boxlo = domain->boxlo_lamda;
-    domain->x2lamda(atom->nlocal);
-  }
-
-  // extend size of PPPM per-atom arrays if necessary
-  // force update of device data, if arrays resized
-
-  // old_nmax == 0 also triggers the rebuild while no size has been recorded yet
-  if (cu_atom->update_nmax || old_nmax == 0) {
-    memory->destroy(part2grid);
-    nmax = atom->nmax;
-    memory->create(part2grid,nmax,3,"pppm:part2grid");
-    delete cu_part2grid;
-    delete [] adev_data_array;
-    adev_data_array=new dev_array[1]; // descriptor handed to the new wrapper below
-    cu_part2grid = new cCudaData<int  , int   , yx >
-      ((int*)part2grid,adev_data_array, nmax,3);
-
-    pppm_device_update(&cuda->shared_data,cu_part2grid->dev_data(),
-                       atom->nlocal,atom->nmax);
-    old_nmax=nmax; // remember the size the arrays were built for
-  }
-  if(cu_atom->update_nlocal) {pppm_update_nlocal(cu_atom->nlocal);}
-
-  energy = 0.0;
-  if (vflag)
-  {
-          for (i = 0; i < 6; i++) virial[i] = 0.0;
-          cu_virial->memset_device(0);
-  }
-  if(eflag) cu_energy->memset_device(0);
-  my_gettime(CLOCK_REALTIME,&starttotal);
-
-  // find grid points for all my particles
-  // map my particle charge onto my local 3d density grid
-
-  my_gettime(CLOCK_REALTIME,&starttime);
-  // each timed stage adds (tv_sec difference + tv_nsec difference / 1e9) seconds to its counter
-  particle_map();
-
-  my_gettime(CLOCK_REALTIME,&endtime);
-  cuda->shared_data.cuda_timings.pppm_particle_map+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-
-  //cu_part2grid->download();
-  my_gettime(CLOCK_REALTIME,&starttime);
-  make_rho();
-  my_gettime(CLOCK_REALTIME,&endtime);
-  cuda->shared_data.cuda_timings.pppm_make_rho+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-
-  // all procs communicate density values from their ghost cells
-  //   to fully sum contribution in their 3d bricks
-  // remap from 3d decomposition to FFT decomposition
-
-  int nprocs=comm->nprocs;
-
-  my_gettime(CLOCK_REALTIME,&starttime);
-
-  if(nprocs>1) // more than one proc: download the density brick, then run brick2fft()
-  {
-    cu_density_brick->download();
-    brick2fft();
-  }
-  else // single proc: only pppm_initfftdata() is called, and only when built with FFT_CUFFT
-  {
-     #ifdef FFT_CUFFT
-     pppm_initfftdata(&cuda->shared_data,(PPPM_CFLOAT*)cu_density_brick->dev_data(),(FFT_CFLOAT*)cu_work2->dev_data());
-     #endif
-  }
-
-  my_gettime(CLOCK_REALTIME,&endtime);
-  cuda->shared_data.cuda_timings.pppm_brick2fft+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-
-  // compute potential gradient on my FFT grid and
-  //   portion of e_long on this proc's FFT grid
-  // return gradients (electric fields) in 3d brick decomposition
-
-  my_gettime(CLOCK_REALTIME,&starttime);
-  poisson(eflag,vflag);
-  my_gettime(CLOCK_REALTIME,&endtime);
-  cuda->shared_data.cuda_timings.pppm_poisson+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-
-  // all procs communicate E-field values to fill ghost cells
-  //   surrounding their 3d bricks
-
-  // not necessary since all the calculations are done on one proc
-
-  // calculate the force on my particles
-
-  my_gettime(CLOCK_REALTIME,&starttime);
-  fieldforce();
-  my_gettime(CLOCK_REALTIME,&endtime);
-  cuda->shared_data.cuda_timings.pppm_fieldforce+=(endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000);
-
-  // sum energy across procs and add in volume-dependent term
-  // reset qsum and qsqsum if atom count has changed
-  // (the total timer is closed first; the reductions below are not part of pppm_compute)
-  my_gettime(CLOCK_REALTIME,&endtotal);
-  cuda->shared_data.cuda_timings.pppm_compute+=(endtotal.tv_sec-starttotal.tv_sec+1.0*(endtotal.tv_nsec-starttotal.tv_nsec)/1000000000);
-
-  if (eflag) {
-    double energy_all;
-    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
-    energy = energy_all;
-
-    if (atom->natoms != natoms_original) {
-      qsum_qsq();
-      natoms_original = atom->natoms;
-    }
-
-    energy *= 0.5*volume;
-    energy -= g_ewald*qsqsum/1.772453851 + // 1.772453851 is sqrt(pi) to 10 digits
-      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
-    energy *= qqrd2e;
-  }
-
-  // sum virial across procs
-
-  if (vflag) {
-    double virial_all[6];
-    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
-    for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*volume*virial_all[i];
-  }
-
-  // 2d slab correction
-
-  if (slabflag) slabcorr(eflag);
-
-  // convert atoms back from lamda to box coords
-
-  if (triclinic) domain->lamda2x(atom->nlocal);
-
-  if(firstpass) firstpass=false; // clear the first-call marker
-}
-
-
-/* ----------------------------------------------------------------------
-   allocate memory that depends on # of K-vectors and order
-   creates the host grids and work arrays, one cCudaData wrapper per
-   array, the two FFT3dCuda objects and the Remap, then passes all device
-   pointers and grid bounds to pppm_device_init()
-------------------------------------------------------------------------- */
-
-
-void PPPMCuda::allocate()
-{
-  // one dev_array descriptor slot per wrapper; exactly 20 wrappers are created below
-  struct dev_array* dev_tmp=new struct dev_array[20]; // NOTE(review): never deleted in this function - confirm who owns it
-  int n_cudata=0; // index of the next unused dev_tmp slot
-
-  // charge density brick (including ghost cells) plus an int-valued twin
-  memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_brick");
-  memory->create3d_offset(density_brick_int,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_brick_int");
-
-  // wrappers take the address of the first brick element and the total element count
-  cu_density_brick = new cCudaData<double, PPPM_CFLOAT, x> ((double*) &(density_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
-                                     (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
-
-  cu_density_brick_int = new cCudaData<int, int, x> ((int*) &(density_brick_int[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
-                                     (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
-
-  memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdx_brick");
-  memory->create3d_offset(vdx_brick_tmp,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdx_brick_tmp");
-
-  cu_vdx_brick = new cCudaData<double, PPPM_CFLOAT, x> ((double*) &(vdx_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
-                                     (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
-
-  memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdy_brick");
-  cu_vdy_brick = new cCudaData<double, PPPM_CFLOAT, x> ((double*) &(vdy_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
-                                     (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
-
-  memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdz_brick");
-  cu_vdz_brick = new cCudaData<double, PPPM_CFLOAT, x> ((double*) &(vdz_brick[nzlo_out][nylo_out][nxlo_out]), & (dev_tmp[n_cudata++]),
-                                     (nzhi_out-nzlo_out+1)*(nyhi_out-nylo_out+1)*(nxhi_out-nxlo_out+1));
-
-  memory->create(density_fft,nfft_both,"pppm:density_fft");
-
-  cu_density_fft = new cCudaData<double, PPPM_CFLOAT, x> (density_fft, & (dev_tmp[n_cudata++]),nfft_both);
-  // energy and virial wrappers are created with a NULL host pointer
-  cu_energy = new cCudaData<double, ENERGY_CFLOAT, x> (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm);
-  cu_virial = new cCudaData<double, ENERGY_CFLOAT, x> (NULL, &(dev_tmp[n_cudata++]),ny_pppm*nz_pppm*6);
-  // NOTE(review): greensfn holds nfft_both values but its wrapper is sized nx_pppm*ny_pppm*nz_pppm - confirm these agree
-  memory->create(greensfn,nfft_both,"pppm:greensfn");
-  cu_greensfn = new cCudaData<double, PPPM_CFLOAT, x> (greensfn, & (dev_tmp[n_cudata++]) , nx_pppm*ny_pppm*nz_pppm);
-  // three work arrays of 2*nx_pppm*ny_pppm*nz_pppm values each
-  memory->create(work1,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work1");
-  memory->create(work2,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work2");
-  memory->create(work3,2*nx_pppm*ny_pppm*nz_pppm,"pppm:work3");
-
-  cu_work1 = new cCudaData<double, FFT_CFLOAT, x> (work1, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
-  cu_work2 = new cCudaData<double, FFT_CFLOAT, x> (work2, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
-  cu_work3 = new cCudaData<double, FFT_CFLOAT, x> (work3, & (dev_tmp[n_cudata++]) , 2*nx_pppm*ny_pppm*nz_pppm);
-
-  // fkx, fky, fkz hold one value per global grid index in their direction
-  memory->create(fkx,nx_pppm,"pppmcuda:fkx");
-  cu_fkx = new cCudaData<double, PPPM_CFLOAT, x> (fkx, & (dev_tmp[n_cudata++]) , nx_pppm);
-  memory->create(fky,ny_pppm,"pppmcuda:fky");
-  cu_fky = new cCudaData<double, PPPM_CFLOAT, x> (fky, & (dev_tmp[n_cudata++]) , ny_pppm);
-  memory->create(fkz,nz_pppm,"pppmcuda:fkz");
-  cu_fkz = new cCudaData<double, PPPM_CFLOAT, x> (fkz, & (dev_tmp[n_cudata++]) , nz_pppm);
-
-  memory->create(vg,nfft_both,6,"pppm:vg");
-
-  cu_vg = new cCudaData<double, PPPM_CFLOAT, xy> ((double*)vg, & (dev_tmp[n_cudata++]) , nfft_both,6);
-  // host-only buffers of nbuf values each (no device wrapper is created for them here)
-  memory->create(buf1,nbuf,"pppm:buf1");
-  memory->create(buf2,nbuf,"pppm:buf2");
-
-
-  // summation coeffs
-
-
-  gf_b = new double[order];
-  cu_gf_b = new cCudaData<double,PPPM_CFLOAT,x> (gf_b, &(dev_tmp[n_cudata++]) , order);
-  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
-  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
-  // rho_coeff is wrapped starting at its first stored column, with order rows of (order/2-(1-order)/2+1) values
-  cu_rho_coeff = new cCudaData<double, PPPM_CFLOAT, x> ((double*) &(rho_coeff[0][(1-order)/2]), & (dev_tmp[n_cudata++]) , order*(order/2-(1-order)/2+1));
-
-  debugdata=new PPPM_CFLOAT[100];
-  cu_debugdata = new cCudaData<PPPM_CFLOAT, PPPM_CFLOAT, x> (debugdata,& (dev_tmp[n_cudata++]),100);
-  cu_flag = new cCudaData<int, int, x> (&global_flag,& (dev_tmp[n_cudata++]),3); // NOTE(review): global_flag is one int in the class but is wrapped as 3 elements - confirm no host access past element 0
-
-  // create 2 FFTs and a Remap
-  // 1st FFT keeps data in FFT decomposition
-  // 2nd FFT returns data in 3d brick decomposition
-  // remap takes data from 3d brick to FFT decomposition
-
-  int tmp; // output argument of both FFT3dCuda constructors; not read afterwards in this function
-
-
-
-
-  fft1c = new FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&tmp,true);
-
-  fft2c = new FFT3dCuda(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&tmp,false);
-
-  // with cuFFT: fft1c is given the work2/work1 device buffers, fft2c the work2/work3 ones
-#ifdef FFT_CUFFT
-  fft1c->set_cudata(cu_work2->dev_data(),cu_work1->dev_data());
-  fft2c->set_cudata(cu_work2->dev_data(),cu_work3->dev_data());
-#endif
-
-  remap = new Remap(lmp,world,
-                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,2,0);
-
-  // register every device pointer, the grid bounds and the scalar parameters with the device side
-pppm_device_init(cu_density_brick->dev_data(), cu_vdx_brick->dev_data(), cu_vdy_brick->dev_data(), cu_vdz_brick->dev_data(), cu_density_fft->dev_data(),cu_energy->dev_data(),cu_virial->dev_data()
-            , cu_work1->dev_data(), cu_work2->dev_data(), cu_work3->dev_data(), cu_greensfn->dev_data(), cu_fkx->dev_data(), cu_fky->dev_data(), cu_fkz->dev_data(), cu_vg->dev_data()
-            ,nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,nx_pppm,ny_pppm,nz_pppm
-            ,nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,cu_gf_b->dev_data()
-            ,qqrd2e,order,cu_rho_coeff->dev_data(),cu_debugdata->dev_data(),cu_density_brick_int->dev_data(),slabflag
-         );
-}
-
-
-
-/* ----------------------------------------------------------------------
-   deallocate memory that depends on # of K-vectors and order
-   releases what allocate() created: host grids and work arrays, the
-   cCudaData wrappers, the FFT and remap objects and the comm buffers
-   every pointer released here is reset to NULL
- ---------------------------------------------------------------------- */
-
-void PPPMCuda::deallocate()
-{
-  // host arrays created in allocate() that were previously never freed
-  //   (leaked on every re-allocation)
-  // each free is keyed on a companion pointer from the same allocate() call
-  //   that this routine already releases unconditionally, so it only runs
-  //   after allocate() has run
-  // assumes no other code frees these four arrays - TODO confirm in destructor
-
-  if (density_brick)
-    memory->destroy3d_offset(density_brick_int,nzlo_out,nylo_out,nxlo_out);
-  if (vdx_brick)
-    memory->destroy3d_offset(vdx_brick_tmp,nzlo_out,nylo_out,nxlo_out);
-  if (work1) memory->destroy(work3);
-  if (cu_debugdata) delete [] debugdata;
-
-  density_brick_int = NULL;
-  vdx_brick_tmp = NULL;
-  work3 = NULL;
-  debugdata = NULL;
-
-  // host 3d bricks; offsets must match those passed to create3d_offset()
-
-  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
-
-  density_brick = vdx_brick = vdy_brick = vdz_brick = NULL;
-
-  // host FFT-decomposition arrays
-
-  memory->destroy(density_fft);
-  memory->destroy(greensfn);
-  memory->destroy(work1);
-  memory->destroy(work2);
-  memory->destroy(vg);
-
-  density_fft = NULL;
-  greensfn = NULL;
-  work1 = NULL;
-  work2 = NULL;
-  vg = NULL;
-
-  memory->destroy(fkx);
-  memory->destroy(fky);
-  memory->destroy(fkz);
-
-  fkx = NULL;
-  fky = NULL;
-  fkz = NULL;
-
-  // device wrappers
-  // NOTE(review): allocate() creates the wrappers inside the FFT_CUFFT guard
-  //   unconditionally, so they are not released in builds without FFT_CUFFT;
-  //   guard kept as in the original - confirm whether such builds are supported
-
-  delete cu_density_brick;
-  delete cu_density_brick_int;
-  delete cu_vdx_brick;
-  delete cu_vdy_brick;
-  delete cu_vdz_brick;
-  delete cu_density_fft;
-  delete cu_energy;
-  delete cu_virial;
-#ifdef FFT_CUFFT
-  delete cu_greensfn;
-  delete cu_gf_b;
-  delete cu_vg;
-  delete cu_work1;
-  delete cu_work2;
-  delete cu_work3;
-  delete cu_fkx;
-  delete cu_fky;
-  delete cu_fkz;
-#endif
-
-  delete cu_flag;
-  delete cu_debugdata;
-  delete cu_rho_coeff;
-
-
-  cu_vdx_brick = cu_vdy_brick = cu_vdz_brick = NULL;
-  cu_density_brick = NULL;
-  cu_density_brick_int = NULL;
-  cu_density_fft = NULL;
-  cu_energy=NULL;
-  cu_virial=NULL;
-#ifdef FFT_CUFFT
-  cu_greensfn = NULL;
-  cu_gf_b = NULL;
-  cu_work1 = cu_work2 = cu_work3 = NULL;
-  cu_vg = NULL;
-  cu_fkx = cu_fky = cu_fkz = NULL;
-#endif
-
-  cu_flag = NULL;
-  cu_debugdata = NULL;
-  cu_rho_coeff = NULL;
-
-  // NOTE(review): cu_part2grid is only reset, not deleted, exactly as before;
-  //   compute() is the code that deletes and rebuilds it - confirm ownership
-
-  cu_part2grid = NULL;
-
-  // communication buffers
-
-  memory->destroy(buf1);
-  memory->destroy(buf2);
-
-  // summation and charge-assignment coefficients
-
-  delete [] gf_b;
-  gf_b = NULL;
-  memory->destroy2d_offset(rho1d,-order/2); rho1d = NULL;
-  memory->destroy2d_offset(rho_coeff,(1-order)/2); rho_coeff = NULL;
-
-  // FFT and remap objects
-
-  delete fft1c;
-  fft1c = NULL;
-
-  delete fft2c;
-  fft2c = NULL;
-  delete remap;
-  remap = NULL;
-  buf1 = NULL;
-  buf2 = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   choose the FFT grid size (nx_pppm,ny_pppm,nz_pppm) and g_ewald
-   values given by the user (gridflag / gewaldflag set) are kept
-   the resulting accuracy estimates are printed by proc 0
--------------------------------------------------------------------------*/
-
-void PPPMCuda::set_grid()
-{
-  // coefficient table handed to rms() and diffpr()
-  // see JCP 109, pg 7698 for derivation of coefficients
-  // higher order coefficients may be computed if needed
-
-  double **acons;
-  memory->create(acons,8,7,"pppm:acons");
-
-  acons[1][0] = 2.0 / 3.0;
-
-  acons[2][0] = 1.0 / 50.0;
-  acons[2][1] = 5.0 / 294.0;
-
-  acons[3][0] = 1.0 / 588.0;
-  acons[3][1] = 7.0 / 1440.0;
-  acons[3][2] = 21.0 / 3872.0;
-
-  acons[4][0] = 1.0 / 4320.0;
-  acons[4][1] = 3.0 / 1936.0;
-  acons[4][2] = 7601.0 / 2271360.0;
-  acons[4][3] = 143.0 / 28800.0;
-
-  acons[5][0] = 1.0 / 23232.0;
-  acons[5][1] = 7601.0 / 13628160.0;
-  acons[5][2] = 143.0 / 69120.0;
-  acons[5][3] = 517231.0 / 106536960.0;
-  acons[5][4] = 106640677.0 / 11737571328.0;
-
-  acons[6][0] = 691.0 / 68140800.0;
-  acons[6][1] = 13.0 / 57600.0;
-  acons[6][2] = 47021.0 / 35512320.0;
-  acons[6][3] = 9694607.0 / 2095994880.0;
-  acons[6][4] = 733191589.0 / 59609088000.0;
-  acons[6][5] = 326190917.0 / 11700633600.0;
-
-  acons[7][0] = 1.0 / 345600.0;
-  acons[7][1] = 3617.0 / 35512320.0;
-  acons[7][2] = 745739.0 / 838397952.0;
-  acons[7][3] = 56399353.0 / 12773376000.0;
-  acons[7][4] = 25091609.0 / 1560084480.0;
-  acons[7][5] = 1755948832039.0 / 36229939200000.0;
-  acons[7][6] = 4887769399.0 / 37838389248.0;
-
-  bigint natoms = atom->natoms;
-
-  // use xprd,yprd,zprd even if triclinic so grid size is the same
-  // z length is stretched by slab_volfactor for 2d slab PPPMCuda
-  // 3d PPPMCuda has slab_volfactor = 1.0, i.e. zprd_slab equals zprd
-
-  double xprd = domain->xprd;
-  double yprd = domain->yprd;
-  double zprd = domain->zprd;
-  double zprd_slab = zprd*slab_volfactor;
-
-  double h_x,h_y,h_z;
-
-  // initial g_ewald estimate from desired error and real space cutoff
-  // fluid-occupied volume used to estimate real-space error
-  // zprd used rather than zprd_slab
-
-  if (!gewaldflag) {
-    g_ewald = sqrt(-log(accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) /
-                        (2.0*q2))) / cutoff;
-  }
-
-  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision
-  // nz_pppm uses extended zprd_slab instead of zprd
-  // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1
-  // refine each direction until its precision target is met
-
-  if (!gridflag) {
-    double err;
-    h_x = h_y = h_z = 1/g_ewald;
-
-    nx_pppm = static_cast<int> (xprd/h_x + 1);
-    ny_pppm = static_cast<int> (yprd/h_y + 1);
-    nz_pppm = static_cast<int> (zprd_slab/h_z + 1);
-
-    for (err = rms(h_x,xprd,natoms,q2,acons); err > accuracy;
-         h_x = xprd/nx_pppm) {
-      err = rms(h_x,xprd,natoms,q2,acons);
-      nx_pppm++;
-    }
-
-    for (err = rms(h_y,yprd,natoms,q2,acons); err > accuracy;
-         h_y = yprd/ny_pppm) {
-      err = rms(h_y,yprd,natoms,q2,acons);
-      ny_pppm++;
-    }
-
-    for (err = rms(h_z,zprd_slab,natoms,q2,acons); err > accuracy;
-         h_z = zprd_slab/nz_pppm) {
-      err = rms(h_z,zprd_slab,natoms,q2,acons);
-      nz_pppm++;
-    }
-  }
-
-  // boost grid size until it is factorable
-
-  for ( ; !factorable(nx_pppm); nx_pppm++) ;
-  for ( ; !factorable(ny_pppm); ny_pppm++) ;
-  for ( ; !factorable(nz_pppm); nz_pppm++) ;
-
-  // grid spacings of the final grid
-
-  h_x = xprd/nx_pppm;
-  h_y = yprd/ny_pppm;
-  h_z = zprd_slab/nz_pppm;
-
-  // adjust g_ewald for new grid size
-  // bisection on diffpr(), which is evaluated with the current g_ewald
-
-  if (!gewaldflag) {
-    double gew1,gew2,dgew,f,fmid,hmin,rtb;
-    int ncount = 0;
-
-    gew1 = 0.0;
-    g_ewald = gew1;
-    f = diffpr(h_x,h_y,h_z,q2,acons);
-
-    hmin = MIN(h_x,MIN(h_y,h_z));
-    gew2 = 10/hmin;
-    g_ewald = gew2;
-    fmid = diffpr(h_x,h_y,h_z,q2,acons);
-
-    // the two end points must bracket a sign change
-
-    if (f*fmid >= 0.0) error->all(FLERR,"Cannot compute PPPMCuda G");
-
-    if (f < 0.0) {
-      dgew = gew2-gew1;
-      rtb = gew1;
-    } else {
-      dgew = gew1-gew2;
-      rtb = gew2;
-    }
-
-    while (fabs(dgew) > SMALL && fmid != 0.0) {
-      dgew *= 0.5;
-      g_ewald = rtb + dgew;
-      fmid = diffpr(h_x,h_y,h_z,q2,acons);
-      if (fmid <= 0.0) rtb = g_ewald;
-      ncount++;
-      if (ncount > LARGE) error->all(FLERR,"Cannot compute PPPMCuda G");
-    }
-  }
-
-  // final RMS precision
-
-  double lprx = rms(h_x,xprd,natoms,q2,acons);
-  double lpry = rms(h_y,yprd,natoms,q2,acons);
-  double lprz = rms(h_z,zprd_slab,natoms,q2,acons);
-  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
-  double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) /
-    sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
-
-  // free local memory
-
-  memory->destroy(acons);
-
-  // print info on proc 0, same text to screen and logfile when they exist
-
-  if (me == 0) {
-    FILE *outputs[2];
-    outputs[0] = screen;
-    outputs[1] = logfile;
-
-    for (int iout = 0; iout < 2; iout++) {
-      FILE *fp = outputs[iout];
-      if (!fp) continue;
-      fprintf(fp,"  G vector = %g\n",g_ewald);
-      fprintf(fp,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
-      fprintf(fp,"  stencil order = %d\n",order);
-      fprintf(fp,"  absolute RMS force accuracy = %g\n",MAX(lpr,spr));
-      fprintf(fp,"  relative force accuracy = %g\n",
-              MAX(lpr,spr)/two_charge_force);
-    }
-  }
-}
-
-
-/* ----------------------------------------------------------------------
-   find center grid pt for each of my particles
-   check that full stencil for the particle will fit in my 3d brick
-   store central grid pt indices in part2grid array
-   the mapping is done by cuda_particle_map(); if it returns nonzero,
-   diagnostics are printed and, after summing the flag over all procs,
-   error->all() is called
-------------------------------------------------------------------------- */
-
-
-void PPPMCuda::particle_map()
-{
-  MYDBG(printf("# CUDA PPPMCuda::particle_map() ... start\n");)
-  int flag = 0;
-
-  cu_flag->memset_device(0);
-  flag=cuda_particle_map(&cuda->shared_data,cu_flag->dev_data());
-
-  if(flag)
-  {
-    // values recorded in debugdata for the reported atom
-    // tag is cast so the format matches whatever integer width it has
-
-    cu_debugdata->download();
-    printf("Out of range atom: ");
-    printf("ID: %lld ",static_cast<long long>(atom->tag[int(debugdata[0])]));
-    printf("x: %e ",debugdata[7]);
-    printf("y: %e ",debugdata[8]);
-    printf("z: %e ",debugdata[9]);
-    printf("nx: %e ",debugdata[4]);
-    printf("ny: %e ",debugdata[5]);
-
-    printf("\n");
-
-    // redo the stencil check on the host and list every atom that fails it
-    // (the former hard-coded debug print of atom index 1203 is removed)
-
-    cuda->cu_x->download();
-    int nx,ny,nz;
-
-    double **x = atom->x;
-    int nlocal = atom->nlocal;
-    for (int i = 0; i < nlocal; i++) {
-      nx = static_cast<int> ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET;
-      ny = static_cast<int> ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET;
-      nz = static_cast<int> ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET;
-
-      if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
-          ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-          nz+nlower < nzlo_out || nz+nupper > nzhi_out) {
-        printf("Outside Atom: %i %e %e %e (%i %i %i)\n",i,x[i][0],x[i][1],x[i][2],nx,ny,nz);
-      }
-    }
-  }
-
-  // every proc takes part in the reduction so all of them abort together
-
-  int flag_all;
-  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
-  if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPMCuda!");
-}
-
-/* ----------------------------------------------------------------------
-   build the discretized charge "density" of my particles on my section
-   of the global grid; all work is forwarded to cuda_make_rho()
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
-------------------------------------------------------------------------- */
-
-
-void PPPMCuda::make_rho()
-{
-  cuda_make_rho(&cuda->shared_data,
-                cu_flag->dev_data(),
-                &density_intScale,
-                nxhi_out,nxlo_out,
-                nyhi_out,nylo_out,
-                nzhi_out,nzlo_out,
-                cu_density_brick->dev_data(),
-                cu_density_brick_int->dev_data());
-}
-
-
-/* ----------------------------------------------------------------------
-   FFT-based Poisson solver
-   without FFT_CUFFT the call is forwarded to PPPMOld::poisson()
-   with FFT_CUFFT: one forward FFT, optional energy/virial summation,
-   scaling, then gradient + backward FFT + brick copy per direction
-   time spent in the FFT calls is accumulated in poissontime
-------------------------------------------------------------------------- */
-void PPPMCuda::poisson(int eflag, int vflag)
-{
-#ifndef FFT_CUFFT
-
-  PPPMOld::poisson(eflag,vflag);
-
-#else
-
-  my_times tstart;
-  my_times tstop;
-
-  // forward transform of the density
-
-  my_gettime(CLOCK_REALTIME,&tstart);
-  fft1c->compute(density_fft,work1,1);
-  my_gettime(CLOCK_REALTIME,&tstop);
-  poissontime+=(tstop.tv_sec-tstart.tv_sec+1.0*(tstop.tv_nsec-tstart.tv_nsec)/1000000000);
-
-  // energy and virial contributions, added to the host accumulators
-
-  if (eflag || vflag) {
-    ENERGY_CFLOAT gpuvirial[6];
-    poisson_energy(nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,vflag);
-    energy+=sum_energy(cu_virial->dev_data(),cu_energy->dev_data(),nx_pppm,ny_pppm,nz_pppm,vflag,gpuvirial);
-    if (vflag) {
-      for (int j = 0; j < 6; j++) virial[j]+=gpuvirial[j];
-    }
-  }
-
-  // scale by 1/total-grid-pts to get rho(k)
-  // multiply by Green's function to get V(k)
-
-  poisson_scale(nx_pppm,ny_pppm,nz_pppm);
-
-  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
-  // FFT leaves data in 3d brick decomposition
-  // copy it into inner portion of vdx,vdy,vdz arrays
-
-  // x direction gradient
-
-  poisson_xgrad(nx_pppm,ny_pppm,nz_pppm);
-
-  my_gettime(CLOCK_REALTIME,&tstart);
-  fft2c->compute(work2,work2,-1);
-  my_gettime(CLOCK_REALTIME,&tstop);
-  poissontime+=(tstop.tv_sec-tstart.tv_sec+1.0*(tstop.tv_nsec-tstart.tv_nsec)/1000000000);
-
-  poisson_vdx_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,
-                    nx_pppm,ny_pppm,nz_pppm);
-
-  // y direction gradient
-
-  poisson_ygrad(nx_pppm,ny_pppm,nz_pppm);
-
-  my_gettime(CLOCK_REALTIME,&tstart);
-  fft2c->compute(work2,work2,-1);
-  my_gettime(CLOCK_REALTIME,&tstop);
-  poissontime+=(tstop.tv_sec-tstart.tv_sec+1.0*(tstop.tv_nsec-tstart.tv_nsec)/1000000000);
-
-  poisson_vdy_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,
-                    nx_pppm,ny_pppm,nz_pppm);
-
-  // z direction gradient
-
-  poisson_zgrad(nx_pppm,ny_pppm,nz_pppm);
-
-  my_gettime(CLOCK_REALTIME,&tstart);
-  fft2c->compute(work2,work2,-1);
-  my_gettime(CLOCK_REALTIME,&tstop);
-  poissontime+=(tstop.tv_sec-tstart.tv_sec+1.0*(tstop.tv_nsec-tstart.tv_nsec)/1000000000);
-
-  poisson_vdz_brick(nxhi_out,nxlo_out,nyhi_out,nylo_out,nzhi_out,nzlo_out,
-                    nx_pppm,ny_pppm,nz_pppm);
-
-#endif
-}
-
-/*----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-   the work is forwarded to cuda_fieldforce()
--------------------------------------------------------------------------*/
-
-void PPPMCuda::fieldforce()
-{
-  cuda_fieldforce(&cuda->shared_data,cu_flag);
-}
-
-/* ----------------------------------------------------------------------
-   timing estimates for the 4 FFTs required per step, for N timesteps
-   derived from the accumulated pppm_poisson timer and update->nsteps
-------------------------------------------------------------------------- */
-
-int PPPMCuda::timing_1d(int n, double &time1d)
-{
-  const int nfft = 4;
-
-  // accumulated Poisson time -> per step -> per FFT -> n steps
-
-  time1d = cuda->shared_data.cuda_timings.pppm_poisson/update->nsteps/nfft*n;
-  return nfft;
-}
-
-int PPPMCuda::timing_3d(int n, double &time3d)
-{
-  const int nfft = 4;
-
-  // accumulated Poisson time -> per step -> n steps
-
-  time3d = cuda->shared_data.cuda_timings.pppm_poisson/update->nsteps*n;
-  return nfft;
-}
-
-void PPPMCuda::slabcorr(int eflag)
-{
-  // reduction buffer: one ENERGY_CFLOAT per 32 atoms (rounded up),
-  //   sized from atom->nmax
-
-  const int nslots = (atom->nmax+31)/32;
-
-  // first use: create the host buffer and its device wrapper
-
-  if (slabbuf == NULL) {
-    slabbuf = new ENERGY_CFLOAT[nslots];
-    cu_slabbuf = new cCudaData<ENERGY_CFLOAT,ENERGY_CFLOAT, x> (slabbuf, nslots);
-  }
-
-  // rebuild both once the bytes needed for nlocal reach the device size
-
-  const size_t bytes_needed = unsigned((atom->nlocal+31)/32)*sizeof(ENERGY_CFLOAT);
-  if (bytes_needed >= unsigned(cu_slabbuf->dev_size())) {
-    delete [] slabbuf;
-    delete cu_slabbuf;
-    slabbuf = new ENERGY_CFLOAT[nslots];
-    cu_slabbuf = new cCudaData<ENERGY_CFLOAT,ENERGY_CFLOAT, x> (slabbuf, nslots);
-  }
-
-  // compute local contribution to global dipole moment, then sum over procs
-
-  double dipole = cuda_slabcorr_energy(&cuda->shared_data,slabbuf,
-                                       (ENERGY_CFLOAT*) cu_slabbuf->dev_data());
-
-  double dipole_all;
-  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
-
-  //if (eflag) energy += qqrd2e*scale * e_slabcorr;
-  // need to add a correction to make non-neutral systems and per-atom energy translationally invariant
-  // until then, energy requests and non-neutral systems are rejected
-
-  if (eflag || fabs(qsum) > SMALL) {
-    error->all(FLERR,"Cannot (yet) use slab correction with kspace_style pppm/cuda for non-neutral systems or to get per-atom energy. Aborting.");
-  }
-
-  // force correction factor passed to the device
-
-  double ffact = -4.0*MY_PI*dipole_all/volume;
-  cuda_slabcorr_force(&cuda->shared_data,ffact);
-}
diff --git a/src/USER-CUDA/pppm_cuda.h b/src/USER-CUDA/pppm_cuda.h
deleted file mode 100644
index cd22aa1d5d..0000000000
--- a/src/USER-CUDA/pppm_cuda.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef KSPACE_CLASS
-
-KSpaceStyle(pppm/cuda,PPPMCuda)
-
-#else
-
-#ifndef LMP_PPPM_CUDA_H
-#define LMP_PPPM_CUDA_H
-
-#include "pppm_old.h"
-#include "cuda_data.h"
-#include "cuda_precision.h"
-
-namespace LAMMPS_NS {
-
-class PPPMCuda : public PPPMOld { // kspace style pppm/cuda: PPPMOld with the stages below forwarded to CUDA routines
- public:
-  PPPMCuda(class LAMMPS *, int, char **);
-  ~PPPMCuda();
-  void init();
-  void setup();              // recompute volume-dependent coefficients
-  void compute(int, int);    // arguments are (eflag, vflag)
-  int timing_1d(int, double &); // both timing routines report from the accumulated pppm_poisson timer
-  int timing_3d(int, double &);
-
-  double poissontime;        // seconds accumulated around the FFT calls in poisson()
-
- protected:
-  class Cuda *cuda;          // provides shared_data used by all device calls
-  class FFT3dCuda *fft1c,*fft2c; // fft1c: forward FFT, fft2c: backward FFTs (see poisson())
-  double* work3;             // third work array, allocated alongside work1/work2 in allocate()
-  // cCudaData wrappers; those created in allocate() pair the host array of the same name with device storage
-  cCudaData<double     , FFT_CFLOAT      , x >* cu_work1;
-  cCudaData<double     , FFT_CFLOAT      , x >* cu_work2;
-  cCudaData<double     , FFT_CFLOAT      , x >* cu_work3;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_greensfn;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_gf_b;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_fkx;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_fky;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_fkz;
-  cCudaData<double     , PPPM_CFLOAT     , xy>* cu_vg;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_density_brick;
-  cCudaData<int        , int                     , x >* cu_density_brick_int;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_vdx_brick;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_vdy_brick;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_vdz_brick;
-  cCudaData<double     , PPPM_CFLOAT     , x >* cu_density_fft;
-  cCudaData<double     , ENERGY_CFLOAT   , x >* cu_energy; // created with a NULL host pointer in allocate()
-  cCudaData<double     , ENERGY_CFLOAT   , x >* cu_virial; // created with a NULL host pointer in allocate()
-  cCudaData<double     , X_CFLOAT                   , yx>* cu_x;
-  cCudaData<double     , V_CFLOAT                   , yx>* cu_v;
-  cCudaData<double     , F_CFLOAT                   , yx>* cu_f;
-  cCudaData<double     , F_CFLOAT                   , yx>* cu_q;
-  cCudaData<int        , int                           , yx>* cu_part2grid; // rebuilt in compute() when the atom arrays are resized
-  cCudaData<double           , PPPM_CFLOAT                , x >* cu_rho_coeff;
-  cCudaData<PPPM_CFLOAT , PPPM_CFLOAT                , x >* cu_debugdata; // wraps debugdata (100 values)
-  cCudaData<int        , int                           , x >* cu_flag; // wraps global_flag; created with 3 elements in allocate()
-  cCudaData<int        , int                           , x >* cu_pppm_grid_n;
-  cCudaData<int        , int                           , x >* cu_pppm_grid_ids;
-  // slab-correction reduction buffer and its wrapper, created lazily in slabcorr()
-  ENERGY_CFLOAT* slabbuf;
-  cCudaData<ENERGY_CFLOAT, ENERGY_CFLOAT, x >* cu_slabbuf;
-
-  int*** density_brick_int;  // int-valued twin of density_brick, same extents
-  PPPM_CFLOAT density_intScale; // passed by address to cuda_make_rho()
-  int pppm_grid_nmax;
-  int* pppm2partgrid;
-  int* pppm_grid;
-  PPPM_CFLOAT* debugdata;    // host side of cu_debugdata; read in particle_map() diagnostics
-  bool firstpass;            // cleared at the end of compute()
-
-  void set_grid();
-  void allocate();
-  void deallocate();
-  // stages called from compute()
-  virtual void particle_map();
-  virtual void make_rho();
-  virtual void poisson(int, int);
-  virtual void fieldforce();
-  virtual void slabcorr(int);
-  double*** vdx_brick_tmp;   // allocated in allocate() with the same extents as vdx_brick
-  int old_nmax;              // nmax that part2grid was last built for; 0 triggers a rebuild in compute()
-  int global_flag;           // NOTE(review): a single int, but wrapped by cu_flag with 3 elements - confirm
-  dev_array* adev_data_array; // descriptor handed to cu_part2grid in compute()
-};
-
-}
-
-#endif
-#endif
diff --git a/src/USER-CUDA/pppm_old.cpp b/src/USER-CUDA/pppm_old.cpp
deleted file mode 100755
index 2cc4c18626..0000000000
--- a/src/USER-CUDA/pppm_old.cpp
+++ /dev/null
@@ -1,2839 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
-     per-atom energy/virial & group/group energy/force added by Stan Moore (BYU)
-------------------------------------------------------------------------- */
-
-#include <mpi.h>
-#include <string.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include "pppm_old.h"
-#include "math_const.h"
-#include "atom.h"
-#include "comm.h"
-#include "neighbor.h"
-#include "force.h"
-#include "pair.h"
-#include "bond.h"
-#include "angle.h"
-#include "domain.h"
-#include "fft3d_wrap.h"
-#include "remap_wrap.h"
-#include "memory.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-using namespace MathConst;
-
-// largest interpolation order accepted by init()
-#define MAXORDER 7
-// integer bias added before truncation in the particle <-> grid mapping;
-// init() also rejects any grid dimension >= OFFSET
-#define OFFSET 16384
-#define SMALL 0.00001
-#define LARGE 10000.0
-// tolerance used by setup() to size the aliasing sums of the Green's function
-#define EPS_HOC 1.0e-7
-
-// zero/one literals: float when FFT_SINGLE is defined, double otherwise
-#ifdef FFT_SINGLE
-#define ZEROF 0.0f
-#define ONEF  1.0f
-#else
-#define ZEROF 0.0
-#define ONEF  1.0
-#endif
-
-/* ----------------------------------------------------------------------
-   constructor
-   arg[0] = accuracy value, stored as a non-negative relative accuracy
-   all grid, FFT and per-atom pointers start out NULL so that the
-   deallocate routines can be invoked before anything was allocated
-------------------------------------------------------------------------- */
-
-PPPMOld::PPPMOld(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
-{
-  if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command");
-
-  // capability flags of this style
-
-  triclinic_support = 0;
-  pppmflag = 1;
-  group_group_enable = 0;
-
-  // sign of the user value is discarded
-
-  accuracy_relative = fabs(force->numeric(FLERR,arg[0]));
-
-  // list of small primes kept in factors[]
-
-  static const int primes[] = {2,3,5};
-  nfactors = 3;
-  factors = new int[nfactors];
-  for (int ifactor = 0; ifactor < nfactors; ifactor++)
-    factors[ifactor] = primes[ifactor];
-
-  // my rank and the number of procs in world
-
-  MPI_Comm_rank(world,&me);
-  MPI_Comm_size(world,&nprocs);
-
-  // 3d bricks
-
-  density_brick = NULL;
-  vdx_brick = NULL;
-  vdy_brick = NULL;
-  vdz_brick = NULL;
-  u_brick = NULL;
-  v0_brick = NULL;
-  v1_brick = NULL;
-  v2_brick = NULL;
-  v3_brick = NULL;
-  v4_brick = NULL;
-  v5_brick = NULL;
-
-  // FFT-decomposition arrays and work space
-
-  density_fft = NULL;
-  greensfn = NULL;
-  work1 = NULL;
-  work2 = NULL;
-  vg = NULL;
-  fkx = NULL;
-  fky = NULL;
-  fkz = NULL;
-
-  // communication buffers
-
-  buf1 = NULL;
-  buf2 = NULL;
-  buf3 = NULL;
-  buf4 = NULL;
-
-  // group/group arrays
-
-  density_A_brick = NULL;
-  density_B_brick = NULL;
-  density_A_fft = NULL;
-  density_B_fft = NULL;
-
-  // summation and charge-assignment coefficients
-
-  gf_b = NULL;
-  rho1d = NULL;
-  rho_coeff = NULL;
-
-  // FFT and remap objects
-
-  fft1 = NULL;
-  fft2 = NULL;
-  remap = NULL;
-
-  // per-particle grid map, grown on demand in compute()
-
-  nmax = 0;
-  part2grid = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   destructor: release everything this object allocated
-------------------------------------------------------------------------- */
-
-PPPMOld::~PPPMOld()
-{
-  // storage managed by the deallocate routines
-
-  deallocate();
-  deallocate_peratom();
-  deallocate_groups();
-
-  // storage created in the constructor and in compute()
-
-  memory->destroy(part2grid);
-  delete [] factors;
-}
-
-/* ----------------------------------------------------------------------
-   called once before run
-   checks settings, extracts the Coulomb cutoff (and TIP4P parameters)
-     from the pair style, chooses the grid via set_grid(), computes the
-     owned/ghost/FFT index ranges and buffer sizes, then allocates
-   the interpolation order is lowered one step at a time while the
-     ghost overlap on any proc exceeds its sub-domain; it is an error
-     if no order >= 2 satisfies that test
-------------------------------------------------------------------------- */
-
-void PPPMOld::init()
-{
-  if (me == 0) {
-    if (screen) fprintf(screen,"PPPM initialization ...\n");
-    if (logfile) fprintf(logfile,"PPPM initialization ...\n");
-  }
-
-  // error check
-
-  triclinic_check();
-  if (domain->dimension == 2) error->all(FLERR,
-                                         "Cannot use PPPM with 2d simulation");
-
-  if (!atom->q_flag) error->all(FLERR,"Kspace style requires atom attribute q");
-
-  if (slabflag == 0 && domain->nonperiodic > 0)
-    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPM");
-  if (slabflag) {
-    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
-        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
-      error->all(FLERR,"Incorrect boundaries with slab PPPM");
-  }
-
-  if (order < 2 || order > MAXORDER) {
-    char str[128];
-    sprintf(str,"PPPM order cannot be < 2 or > than %d",MAXORDER);
-    error->all(FLERR,str);
-  }
-
-  // free all arrays previously allocated
-
-  deallocate();
-  deallocate_peratom();
-  peratom_allocate_flag = 0;
-  deallocate_groups();
-  group_allocate_flag = 0;
-
-  // extract short-range Coulombic cutoff from pair style
-
-  pair_check();
-
-  int itmp=0;
-  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
-  if (p_cutoff == NULL)
-    error->all(FLERR,"KSpace style is incompatible with Pair style");
-  cutoff = *p_cutoff;
-
-  // if kspace is TIP4P, extract TIP4P params from pair style
-  // bond/angle are not yet init(), so ensure equilibrium request is valid
-
-  qdist = 0.0;
-
-  if (tip4pflag) {
-    double *p_qdist = (double *) force->pair->extract("qdist",itmp);
-    int *p_typeO = (int *) force->pair->extract("typeO",itmp);
-    int *p_typeH = (int *) force->pair->extract("typeH",itmp);
-    int *p_typeA = (int *) force->pair->extract("typeA",itmp);
-    int *p_typeB = (int *) force->pair->extract("typeB",itmp);
-    if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB)
-      error->all(FLERR,"KSpace style is incompatible with Pair style");
-    qdist = *p_qdist;
-    typeO = *p_typeO;
-    typeH = *p_typeH;
-    int typeA = *p_typeA;
-    int typeB = *p_typeB;
-
-    if (force->angle == NULL || force->bond == NULL)
-      error->all(FLERR,"Bond and angle potentials must be defined for TIP4P");
-    if (typeA < 1 || typeA > atom->nangletypes ||
-        force->angle->setflag[typeA] == 0)
-      error->all(FLERR,"Bad TIP4P angle type for PPPM/TIP4P");
-    if (typeB < 1 || typeB > atom->nbondtypes ||
-        force->bond->setflag[typeB] == 0)
-      error->all(FLERR,"Bad TIP4P bond type for PPPM/TIP4P");
-    double theta = force->angle->equilibrium_angle(typeA);
-    double blen = force->bond->equilibrium_distance(typeB);
-    alpha = qdist / (cos(0.5*theta) * blen);
-  }
-
-  // compute qsum & qsqsum and warn if not charge-neutral
-
-  scale = 1.0;
-  qqrd2e = force->qqrd2e;
-  qsum_qsq();
-  natoms_original = atom->natoms;
-
-  // set accuracy (force units) from accuracy_relative or accuracy_absolute
-
-  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
-  else accuracy = accuracy_relative * two_charge_force;
-
-  // setup FFT grid resolution and g_ewald
-  // normally one iteration thru while loop is all that is required
-  // if grid stencil extends beyond neighbor proc, reduce order and try again
-
-  int iteration = 0;
-
-  while (order > 1) {
-    if (iteration && me == 0)
-      error->warning(FLERR,"Reducing PPPM order b/c stencil extends "
-                     "beyond neighbor processor");
-    iteration++;
-
-    set_grid();
-
-    if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
-      error->all(FLERR,"PPPM grid is too large");
-
-    // global indices of PPPM grid range from 0 to N-1
-    // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
-    //   global PPPM grid that I own without ghost cells
-    // for slab PPPM, assign z grid as if it were not extended
-
-    nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
-    nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;
-
-    nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm);
-    nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1;
-
-    nzlo_in = static_cast<int>
-      (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor);
-    nzhi_in = static_cast<int>
-      (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;
-
-    // nlower,nupper = stencil size for mapping particles to PPPM grid
-
-    nlower = -(order-1)/2;
-    nupper = order/2;
-
-    // shift values for particle <-> grid mapping
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    if (order % 2) shift = OFFSET + 0.5;
-    else shift = OFFSET;
-    if (order % 2) shiftone = 0.0;
-    else shiftone = 0.5;
-
-    // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
-    //   global PPPM grid that my particles can contribute charge to
-    // effectively nlo_in,nhi_in + ghost cells
-    // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
-    //           position a particle in my box can be at
-    // dist[3] = particle position bound = subbox + skin/2.0 + qdist
-    //   qdist = offset due to TIP4P fictitious charge
-    //   convert to triclinic if necessary
-    // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
-    // for slab PPPM, assign z grid as if it were not extended
-
-    triclinic = domain->triclinic;
-    double *prd,*sublo,*subhi;
-
-    if (triclinic == 0) {
-      prd = domain->prd;
-      boxlo = domain->boxlo;
-      sublo = domain->sublo;
-      subhi = domain->subhi;
-    } else {
-      prd = domain->prd_lamda;
-      boxlo = domain->boxlo_lamda;
-      sublo = domain->sublo_lamda;
-      subhi = domain->subhi_lamda;
-    }
-
-    double xprd = prd[0];
-    double yprd = prd[1];
-    double zprd = prd[2];
-    double zprd_slab = zprd*slab_volfactor;
-
-    double dist[3];
-    double cuthalf = 0.5*neighbor->skin + qdist;
-    if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
-    else {
-      dist[0] = cuthalf/domain->prd[0];
-      dist[1] = cuthalf/domain->prd[1];
-      dist[2] = cuthalf/domain->prd[2];
-    }
-
-    int nlo,nhi;
-
-    nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
-                            nx_pppm/xprd + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
-                            nx_pppm/xprd + shift) - OFFSET;
-    nxlo_out = nlo + nlower;
-    nxhi_out = nhi + nupper;
-
-    nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
-                            ny_pppm/yprd + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
-                            ny_pppm/yprd + shift) - OFFSET;
-    nylo_out = nlo + nlower;
-    nyhi_out = nhi + nupper;
-
-    nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
-                            nz_pppm/zprd_slab + shift) - OFFSET;
-    nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
-                            nz_pppm/zprd_slab + shift) - OFFSET;
-    nzlo_out = nlo + nlower;
-    nzhi_out = nhi + nupper;
-
-    // for slab PPPM, change the grid boundary for processors at +z end
-    //   to include the empty volume between periodically repeating slabs
-    // for slab PPPM, want charge data communicated from -z proc to +z proc,
-    //   but not vice versa, also want field data communicated from +z proc to
-    //   -z proc, but not vice versa
-    // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
-
-    if (slabflag == 1 && (comm->myloc[2] == comm->procgrid[2]-1)) {
-      nzhi_in = nz_pppm - 1;
-      nzhi_out = nz_pppm - 1;
-    }
-
-    // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions
-    //   that overlay domain I own
-    // proc in that direction tells me via sendrecv()
-    // if no neighbor proc, value is from self since I have ghosts regardless
-
-    int nplanes;
-
-    nplanes = nxlo_in - nxlo_out;
-    if (comm->procneigh[0][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0,
-                   &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nxhi_ghost = nplanes;
-
-    nplanes = nxhi_out - nxhi_in;
-    if (comm->procneigh[0][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0,
-                   &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0],
-                   0,world,MPI_STATUS_IGNORE);
-    else nxlo_ghost = nplanes;
-
-    nplanes = nylo_in - nylo_out;
-    if (comm->procneigh[1][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0,
-                   &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nyhi_ghost = nplanes;
-
-    nplanes = nyhi_out - nyhi_in;
-    if (comm->procneigh[1][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0,
-                   &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0,
-                   world,MPI_STATUS_IGNORE);
-    else nylo_ghost = nplanes;
-
-    nplanes = nzlo_in - nzlo_out;
-    if (comm->procneigh[2][0] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0,
-                   &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0,
-                   world,MPI_STATUS_IGNORE);
-    else nzhi_ghost = nplanes;
-
-    nplanes = nzhi_out - nzhi_in;
-    if (comm->procneigh[2][1] != me)
-      MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0,
-                   &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0,
-                   world,MPI_STATUS_IGNORE);
-    else nzlo_ghost = nplanes;
-
-    // test that ghost overlap is not bigger than my sub-domain
-
-    int flag = 0;
-    if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1;
-    if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1;
-    if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1;
-    if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1;
-    if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1;
-    if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1;
-
-    // every proc must agree on whether to retry with a lower order
-
-    int flag_all;
-    MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
-
-    if (flag_all == 0) break;
-    order--;
-  }
-
-  // the loop is left via break with order >= 2, or via its condition with
-  //   order == 1 after the last reduction failed the overlap test
-  // in the latter case the grid bounds above were computed for order 2,
-  //   so continuing would use inconsistent stencil sizes
-
-  if (order < 2) error->all(FLERR,"PPPM order < minimum allowed order");
-
-  // decomposition of FFT mesh
-  // global indices range from 0 to N-1
-  // proc owns entire x-dimension, clump of columns in y,z dimensions
-  // npey_fft,npez_fft = # of procs in y,z dims
-  // if nprocs is small enough, proc can own 1 or more entire xy planes,
-  //   else proc owns 2d sub-blocks of yz plane
-  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
-  // nlo_fft,nhi_fft = lower/upper limit of the section
-  //   of the global FFT mesh that I own
-
-  int npey_fft,npez_fft;
-  if (nz_pppm >= nprocs) {
-    npey_fft = 1;
-    npez_fft = nprocs;
-  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
-
-  int me_y = me % npey_fft;
-  int me_z = me / npey_fft;
-
-  nxlo_fft = 0;
-  nxhi_fft = nx_pppm - 1;
-  nylo_fft = me_y*ny_pppm/npey_fft;
-  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
-  nzlo_fft = me_z*nz_pppm/npez_fft;
-  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
-
-  // PPPM grid for this proc, including ghosts
-
-  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
-    (nzhi_out-nzlo_out+1);
-
-  // FFT arrays on this proc, without ghosts
-  // nfft = FFT points in FFT decomposition on this proc
-  // nfft_brick = FFT points in 3d brick-decomposition on this proc
-  // nfft_both = greater of 2 values
-
-  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
-    (nzhi_fft-nzlo_fft+1);
-  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
-    (nzhi_in-nzlo_in+1);
-  nfft_both = MAX(nfft,nfft_brick);
-
-  // buffer space for use in brick2fft and fillbrick
-  // idel = max # of ghost planes to send or recv in +/- dir of each dim
-  // nx,ny,nz = owned planes (including ghosts) in each dim
-  // nxx,nyy,nzz = max # of grid cells to send in each dim
-  // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick
-
-  int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz;
-
-  idelx = MAX(nxlo_ghost,nxhi_ghost);
-  idelx = MAX(idelx,nxhi_out-nxhi_in);
-  idelx = MAX(idelx,nxlo_in-nxlo_out);
-
-  idely = MAX(nylo_ghost,nyhi_ghost);
-  idely = MAX(idely,nyhi_out-nyhi_in);
-  idely = MAX(idely,nylo_in-nylo_out);
-
-  idelz = MAX(nzlo_ghost,nzhi_ghost);
-  idelz = MAX(idelz,nzhi_out-nzhi_in);
-  idelz = MAX(idelz,nzlo_in-nzlo_out);
-
-  nx = nxhi_out - nxlo_out + 1;
-  ny = nyhi_out - nylo_out + 1;
-  nz = nzhi_out - nzlo_out + 1;
-
-  nxx = idelx * ny * nz;
-  nyy = idely * nx * nz;
-  nzz = idelz * nx * ny;
-
-  nbuf = MAX(nxx,nyy);
-  nbuf = MAX(nbuf,nzz);
-
-  // per-atom buffers hold 7 values per grid cell, regular buffers 3
-
-  nbuf_peratom = 7*nbuf;
-  nbuf *= 3;
-
-  // print stats
-
-  int ngrid_max,nfft_both_max,nbuf_max;
-  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
-  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
-  MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world);
-
-  if (me == 0) {
-    if (screen) fprintf(screen,"  brick FFT buffer size/proc = %d %d %d\n",
-                        ngrid_max,nfft_both_max,nbuf_max);
-    if (logfile) fprintf(logfile,"  brick FFT buffer size/proc = %d %d %d\n",
-                         ngrid_max,nfft_both_max,nbuf_max);
-  }
-
-  // allocate K-space dependent memory
-  // don't invoke allocate_peratom() here, wait to see if needed
-
-  allocate();
-
-  // pre-compute Green's function denominator expansion
-  // pre-compute 1d charge distribution coefficients
-
-  compute_gf_denom();
-  compute_rho_coeff();
-}
-
-/* ----------------------------------------------------------------------
-   adjust PPPM coeffs, called initially and whenever volume has changed
-   recomputes grid spacings, the wave vectors fkx/fky/fkz, the virial
-     coefficients vg and the Green's function greensfn for the portion
-     of the FFT grid owned by this proc
-   vg and greensfn are filled in the same z,y,x loop order
-------------------------------------------------------------------------- */
-
-void PPPMOld::setup()
-{
-  int i,j,k,l,m,n;
-  double *prd;
-
-  // volume-dependent factors
-  // adjust z dimension for 2d slab PPPM
-  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
-
-  if (triclinic == 0) prd = domain->prd;
-  else prd = domain->prd_lamda;
-
-  double xprd = prd[0];
-  double yprd = prd[1];
-  double zprd = prd[2];
-  double zprd_slab = zprd*slab_volfactor;
-  volume = xprd * yprd * zprd_slab;
-
-  delxinv = nx_pppm/xprd;
-  delyinv = ny_pppm/yprd;
-  delzinv = nz_pppm/zprd_slab;
-
-  delvolinv = delxinv*delyinv*delzinv;
-
-  double unitkx = (2.0*MY_PI/xprd);
-  double unitky = (2.0*MY_PI/yprd);
-  double unitkz = (2.0*MY_PI/zprd_slab);
-
-  // fkx,fky,fkz for my FFT grid pts
-  // integer division 2*i/N is 0 for the lower half of the grid and 1 for
-  //   the upper half, which maps upper-half indices to negative values
-
-  double per;
-
-  for (i = nxlo_fft; i <= nxhi_fft; i++) {
-    per = i - nx_pppm*(2*i/nx_pppm);
-    fkx[i] = unitkx*per;
-  }
-
-  for (i = nylo_fft; i <= nyhi_fft; i++) {
-    per = i - ny_pppm*(2*i/ny_pppm);
-    fky[i] = unitky*per;
-  }
-
-  for (i = nzlo_fft; i <= nzhi_fft; i++) {
-    per = i - nz_pppm*(2*i/nz_pppm);
-    fkz[i] = unitkz*per;
-  }
-
-  // virial coefficients
-
-  double sqk,vterm;
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++) {
-    for (j = nylo_fft; j <= nyhi_fft; j++) {
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
-        if (sqk == 0.0) {
-          vg[n][0] = 0.0;
-          vg[n][1] = 0.0;
-          vg[n][2] = 0.0;
-          vg[n][3] = 0.0;
-          vg[n][4] = 0.0;
-          vg[n][5] = 0.0;
-        } else {
-          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
-          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
-          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
-          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
-          vg[n][3] = vterm*fkx[i]*fky[j];
-          vg[n][4] = vterm*fkx[i]*fkz[k];
-          vg[n][5] = vterm*fky[j]*fkz[k];
-        }
-        n++;
-      }
-    }
-  }
-
-  // modified (Hockney-Eastwood) Coulomb Green's function
-
-  int nx,ny,nz,kper,lper,mper;
-  double snx,sny,snz,snx2,sny2,snz2;
-  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
-  double sum1,dot1,dot2;
-  double numerator,denominator;
-
-  // nbx,nby,nbz = number of periodic images summed on each side per dim
-  // the EPS_HOC-dependent factor is identical for all 3 dims
-
-  const double hoc_factor = pow(-log(EPS_HOC),0.25);
-
-  int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) * hoc_factor);
-  int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) * hoc_factor);
-  int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
-                              hoc_factor);
-
-  // loop-invariant values used inside the innermost loops
-
-  const double fourpi = 4.0*MY_PI;
-  const double dorder = static_cast<double>(order);
-
-  n = 0;
-  for (m = nzlo_fft; m <= nzhi_fft; m++) {
-    mper = m - nz_pppm*(2*m/nz_pppm);
-    snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm);
-    snz2 = snz*snz;
-
-    for (l = nylo_fft; l <= nyhi_fft; l++) {
-      lper = l - ny_pppm*(2*l/ny_pppm);
-      sny = sin(0.5*unitky*lper*yprd/ny_pppm);
-      sny2 = sny*sny;
-
-      for (k = nxlo_fft; k <= nxhi_fft; k++) {
-        kper = k - nx_pppm*(2*k/nx_pppm);
-        snx = sin(0.5*unitkx*kper*xprd/nx_pppm);
-        snx2 = snx*snx;
-
-        sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) +
-          pow(unitkz*mper,2.0);
-
-        // k = 0 term is excluded; its Green's function entry is zero
-
-        if (sqk != 0.0) {
-          numerator = fourpi/sqk;
-          denominator = gf_denom(snx2,sny2,snz2);
-          sum1 = 0.0;
-          for (nx = -nbx; nx <= nbx; nx++) {
-            qx = unitkx*(kper+nx_pppm*nx);
-            sx = exp(-0.25*pow(qx/g_ewald,2.0));
-            wx = 1.0;
-            argx = 0.5*qx*xprd/nx_pppm;
-            if (argx != 0.0) wx = pow(sin(argx)/argx,dorder);
-            for (ny = -nby; ny <= nby; ny++) {
-              qy = unitky*(lper+ny_pppm*ny);
-              sy = exp(-0.25*pow(qy/g_ewald,2.0));
-              wy = 1.0;
-              argy = 0.5*qy*yprd/ny_pppm;
-              if (argy != 0.0) wy = pow(sin(argy)/argy,dorder);
-              for (nz = -nbz; nz <= nbz; nz++) {
-                qz = unitkz*(mper+nz_pppm*nz);
-                sz = exp(-0.25*pow(qz/g_ewald,2.0));
-                wz = 1.0;
-                argz = 0.5*qz*zprd_slab/nz_pppm;
-                if (argz != 0.0) wz = pow(sin(argz)/argz,dorder);
-
-                dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
-                dot2 = qx*qx+qy*qy+qz*qz;
-                sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,2.0);
-              }
-            }
-          }
-          greensfn[n++] = numerator*sum1/denominator;
-        } else greensfn[n++] = 0.0;
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   compute the PPPM long-range force, energy, virial
-   eflag,vflag = energy/virial request flags, passed to ev_setup()
-     and poisson()
-   sequence: particle_map -> make_rho -> brick2fft -> poisson ->
-     fillbrick -> fieldforce, then reduction of global energy/virial
-     across procs, per-atom scaling, and the optional slab correction
-------------------------------------------------------------------------- */
-
-void PPPMOld::compute(int eflag, int vflag)
-{
-  int i,j;
-
-  // set energy/virial flags
-  // invoke allocate_peratom() if needed for first time
-
-  if (eflag || vflag) ev_setup(eflag,vflag);
-  else evflag = evflag_atom = eflag_global = vflag_global =
-         eflag_atom = vflag_atom = 0;
-
-  if (evflag_atom && !peratom_allocate_flag) {
-    allocate_peratom();
-    peratom_allocate_flag = 1;
-  }
-
-  // convert atoms from box to lamda coords
-  // only done for triclinic boxes; undone at the end of this routine
-
-  if (triclinic == 0) boxlo = domain->boxlo;
-  else {
-    boxlo = domain->boxlo_lamda;
-    domain->x2lamda(atom->nlocal);
-  }
-
-  // extend size of per-atom arrays if necessary
-  // part2grid is recreated with atom->nmax rows of 3 entries,
-  //   its old contents are not preserved
-
-  if (atom->nlocal > nmax) {
-    memory->destroy(part2grid);
-    nmax = atom->nmax;
-    memory->create(part2grid,nmax,3,"pppm:part2grid");
-  }
-
-  // find grid points for all my particles
-  // map my particle charge onto my local 3d density grid
-
-  particle_map();
-  make_rho();
-
-  // all procs communicate density values from their ghost cells
-  //   to fully sum contribution in their 3d bricks
-  // remap from 3d decomposition to FFT decomposition
-
-  brick2fft();
-
-  // compute potential gradient on my FFT grid and
-  //   portion of e_long on this proc's FFT grid
-  // return gradients (electric fields) in 3d brick decomposition
-  // also performs per-atom calculations via poisson_peratom()
-
-  poisson(eflag,vflag);
-
-  // all procs communicate E-field values
-  // to fill ghost cells surrounding their 3d bricks
-
-  fillbrick();
-
-  // extra per-atom energy/virial communication
-
-  if (evflag_atom) fillbrick_peratom();
-
-  // calculate the force on my particles
-
-  fieldforce();
-
-  // extra per-atom energy/virial communication
-
-  if (evflag_atom) fieldforce_peratom();
-
-  // update qsum and qsqsum, if atom count has changed and energy needed
-
-  if ((eflag_global || eflag_atom) && atom->natoms != natoms_original) {
-    qsum_qsq();
-    natoms_original = atom->natoms;
-  }
-
-  // sum global energy across procs and add in volume-dependent term
-
-  const double qscale = qqrd2e * scale;
-
-  if (eflag_global) {
-    double energy_all;
-    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
-    energy = energy_all;
-
-    // subtract the qsqsum (self) term and the qsum*qsum (net charge) term
-    //   before converting with qscale
-
-    energy *= 0.5*volume;
-    energy -= g_ewald*qsqsum/MY_PIS +
-      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
-    energy *= qscale;
-  }
-
-  // sum global virial across procs
-
-  if (vflag_global) {
-    double virial_all[6];
-    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
-    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
-  }
-
-  // per-atom energy/virial
-  // energy includes self-energy correction
-
-  if (evflag_atom) {
-    double *q = atom->q;
-    int nlocal = atom->nlocal;
-
-    if (eflag_atom) {
-      for (i = 0; i < nlocal; i++) {
-        eatom[i] *= 0.5;
-        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
-          (g_ewald*g_ewald*volume);
-        eatom[i] *= qscale;
-      }
-    }
-
-    if (vflag_atom) {
-      for (i = 0; i < nlocal; i++)
-        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*q[i]*qscale;
-    }
-  }
-
-  // 2d slab correction
-
-  if (slabflag == 1) slabcorr();
-
-  // convert atoms back from lamda to box coords
-
-  if (triclinic) domain->lamda2x(atom->nlocal);
-}
-
-/* ----------------------------------------------------------------------
-   allocate memory that depends on # of K-vectors and order
-   uses the index ranges and buffer sizes set by init()
-------------------------------------------------------------------------- */
-
-void PPPMOld::allocate()
-{
-  // charge density and E-field bricks, indexed [z][y][x] over the
-  // nlo_out:nhi_out ranges
-
-  memory->create3d_offset(density_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_brick");
-  memory->create3d_offset(vdx_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdx_brick");
-  memory->create3d_offset(vdy_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdy_brick");
-  memory->create3d_offset(vdz_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:vdz_brick");
-
-  // flat arrays of length nfft_both, work arrays are twice that long
-
-  memory->create(density_fft,nfft_both,"pppm:density_fft");
-  memory->create(greensfn,nfft_both,"pppm:greensfn");
-  memory->create(work1,2*nfft_both,"pppm:work1");
-  memory->create(work2,2*nfft_both,"pppm:work2");
-  memory->create(vg,nfft_both,6,"pppm:vg");
-
-  // wave vectors, one 1d array per dimension of my FFT section
-
-  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
-  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
-  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
-
-  // buffers of length nbuf
-
-  memory->create(buf1,nbuf,"pppm:buf1");
-  memory->create(buf2,nbuf,"pppm:buf2");
-
-  // summation coeffs, sized by order
-
-  memory->create(gf_b,order,"pppm:gf_b");
-  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
-  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
-
-  // create 2 FFTs and a Remap
-  // 1st FFT keeps data in FFT decomposition
-  // 2nd FFT returns data in 3d brick decomposition
-  // remap takes data from 3d brick to FFT decomposition
-  // value written through the FFT3d pointer argument is not used here
-
-  int unused;
-
-  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   0,0,&unused,collective_flag);
-
-  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
-                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                   0,0,&unused,collective_flag);
-
-  remap = new Remap(lmp,world,
-                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
-                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-                    1,0,0,FFT_PRECISION,collective_flag);
-}
-
-/* ----------------------------------------------------------------------
-   allocate per-atom memory that depends on # of K-vectors and order
-   invoked from compute() the first time per-atom quantities are needed
-------------------------------------------------------------------------- */
-
-void PPPMOld::allocate_peratom()
-{
-  // buffers of length nbuf_peratom
-
-  memory->create(buf3,nbuf_peratom,"pppm:buf3");
-  memory->create(buf4,nbuf_peratom,"pppm:buf4");
-
-  // one u brick and six v bricks, all spanning the nlo_out:nhi_out ranges
-
-  memory->create3d_offset(u_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:u_brick");
-
-  memory->create3d_offset(v0_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v0_brick");
-  memory->create3d_offset(v1_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v1_brick");
-  memory->create3d_offset(v2_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v2_brick");
-  memory->create3d_offset(v3_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v3_brick");
-  memory->create3d_offset(v4_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v4_brick");
-  memory->create3d_offset(v5_brick,
-                          nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:v5_brick");
-}
-
-/* ----------------------------------------------------------------------
-   deallocate memory that depends on # of K-vectors and order
-   called from init() before re-allocation and from the destructor,
-     including when nothing has been allocated yet (all pointers NULL)
-   FFT and Remap pointers are reset so a repeated call cannot delete
-     the same object twice
-------------------------------------------------------------------------- */
-
-void PPPMOld::deallocate()
-{
-  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out);
-
-  memory->destroy(density_fft);
-  memory->destroy(greensfn);
-  memory->destroy(work1);
-  memory->destroy(work2);
-  memory->destroy(vg);
-
-  memory->destroy1d_offset(fkx,nxlo_fft);
-  memory->destroy1d_offset(fky,nylo_fft);
-  memory->destroy1d_offset(fkz,nzlo_fft);
-
-  memory->destroy(buf1);
-  memory->destroy(buf2);
-
-  // offsets must match those used in allocate()
-
-  memory->destroy(gf_b);
-  memory->destroy2d_offset(rho1d,-order/2);
-  memory->destroy2d_offset(rho_coeff,(1-order)/2);
-
-  // delete on a NULL pointer is a no-op, so resetting here makes
-  // back-to-back calls safe
-
-  delete fft1;
-  delete fft2;
-  delete remap;
-  fft1 = fft2 = NULL;
-  remap = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   deallocate per-atom memory that depends on # of K-vectors and order
-   releases everything created by allocate_peratom()
-------------------------------------------------------------------------- */
-
-void PPPMOld::deallocate_peratom()
-{
-  // buffers
-
-  memory->destroy(buf3);
-  memory->destroy(buf4);
-
-  // u brick and the six v bricks, same lower offsets as at creation
-
-  memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out);
-}
-
-/* ----------------------------------------------------------------------
-   set size of FFT grid (nx,ny,nz_pppm) and g_ewald
-------------------------------------------------------------------------- */
-
-void PPPMOld::set_grid()
-{
-  // see JCP 109, pg 7698 for derivation of coefficients
-  // higher order coefficients may be computed if needed
-
-  double **acons;
-  memory->create(acons,8,7,"pppm:acons");
-
-  acons[1][0] = 2.0 / 3.0;
-  acons[2][0] = 1.0 / 50.0;
-  acons[2][1] = 5.0 / 294.0;
-  acons[3][0] = 1.0 / 588.0;
-  acons[3][1] = 7.0 / 1440.0;
-  acons[3][2] = 21.0 / 3872.0;
-  acons[4][0] = 1.0 / 4320.0;
-  acons[4][1] = 3.0 / 1936.0;
-  acons[4][2] = 7601.0 / 2271360.0;
-  acons[4][3] = 143.0 / 28800.0;
-  acons[5][0] = 1.0 / 23232.0;
-  acons[5][1] = 7601.0 / 13628160.0;
-  acons[5][2] = 143.0 / 69120.0;
-  acons[5][3] = 517231.0 / 106536960.0;
-  acons[5][4] = 106640677.0 / 11737571328.0;
-  acons[6][0] = 691.0 / 68140800.0;
-  acons[6][1] = 13.0 / 57600.0;
-  acons[6][2] = 47021.0 / 35512320.0;
-  acons[6][3] = 9694607.0 / 2095994880.0;
-  acons[6][4] = 733191589.0 / 59609088000.0;
-  acons[6][5] = 326190917.0 / 11700633600.0;
-  acons[7][0] = 1.0 / 345600.0;
-  acons[7][1] = 3617.0 / 35512320.0;
-  acons[7][2] = 745739.0 / 838397952.0;
-  acons[7][3] = 56399353.0 / 12773376000.0;
-  acons[7][4] = 25091609.0 / 1560084480.0;
-  acons[7][5] = 1755948832039.0 / 36229939200000.0;
-  acons[7][6] = 4887769399.0 / 37838389248.0;
-
-  // use xprd,yprd,zprd even if triclinic so grid size is the same
-  // adjust z dimension for 2d slab PPPM
-  // 3d PPPM just uses zprd since slab_volfactor = 1.0
-
-  double xprd = domain->xprd;
-  double yprd = domain->yprd;
-  double zprd = domain->zprd;
-  double zprd_slab = zprd*slab_volfactor;
-
-  // make initial g_ewald estimate
-  // based on desired accuracy and real space cutoff
-  // fluid-occupied volume used to estimate real-space error
-  // zprd used rather than zprd_slab
-
-  double h_x,h_y,h_z;
-  bigint natoms = atom->natoms;
-
-  if (!gewaldflag) {
-    if (accuracy <= 0.0)
-      error->all(FLERR,"KSpace accuracy must be > 0");
-    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2);
-    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
-    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
-  }
-
-  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy
-  // nz_pppm uses extended zprd_slab instead of zprd
-  // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1
-  // reduce it until accuracy target is met
-
-  if (!gridflag) {
-    double err;
-    h_x = h_y = h_z = 1.0/g_ewald;
-
-    nx_pppm = static_cast<int> (xprd/h_x) + 1;
-    ny_pppm = static_cast<int> (yprd/h_y) + 1;
-    nz_pppm = static_cast<int> (zprd_slab/h_z) + 1;
-
-    err = rms(h_x,xprd,natoms,q2,acons);
-    while (err > accuracy) {
-      err = rms(h_x,xprd,natoms,q2,acons);
-      nx_pppm++;
-      h_x = xprd/nx_pppm;
-    }
-
-    err = rms(h_y,yprd,natoms,q2,acons);
-    while (err > accuracy) {
-      err = rms(h_y,yprd,natoms,q2,acons);
-      ny_pppm++;
-      h_y = yprd/ny_pppm;
-    }
-
-    err = rms(h_z,zprd_slab,natoms,q2,acons);
-    while (err > accuracy) {
-      err = rms(h_z,zprd_slab,natoms,q2,acons);
-      nz_pppm++;
-      h_z = zprd_slab/nz_pppm;
-    }
-  }
-
-  // boost grid size until it is factorable
-
-  while (!factorable(nx_pppm)) nx_pppm++;
-  while (!factorable(ny_pppm)) ny_pppm++;
-  while (!factorable(nz_pppm)) nz_pppm++;
-
-  // adjust g_ewald for new grid size
-
-  h_x = xprd/static_cast<double>(nx_pppm);
-  h_y = yprd/static_cast<double>(ny_pppm);
-  h_z = zprd_slab/static_cast<double>(nz_pppm);
-
-  if (!gewaldflag) {
-    double gew1,gew2,dgew,f,fmid,hmin,rtb;
-    int ncount;
-
-    gew1 = 0.0;
-    g_ewald = gew1;
-    f = diffpr(h_x,h_y,h_z,q2,acons);
-
-    hmin = MIN(h_x,MIN(h_y,h_z));
-    gew2 = 10.0/hmin;
-    g_ewald = gew2;
-    fmid = diffpr(h_x,h_y,h_z,q2,acons);
-
-    if (f*fmid >= 0.0) error->all(FLERR,"Cannot compute PPPM G");
-    rtb = f < 0.0 ? (dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2);
-    ncount = 0;
-    while (fabs(dgew) > SMALL && fmid != 0.0) {
-      dgew *= 0.5;
-      g_ewald = rtb + dgew;
-      fmid = diffpr(h_x,h_y,h_z,q2,acons);
-      if (fmid <= 0.0) rtb = g_ewald;
-      ncount++;
-      if (ncount > LARGE) error->all(FLERR,"Cannot compute PPPM G");
-    }
-  }
-
-  // final RMS accuracy
-
-  double lprx = rms(h_x,xprd,natoms,q2,acons);
-  double lpry = rms(h_y,yprd,natoms,q2,acons);
-  double lprz = rms(h_z,zprd_slab,natoms,q2,acons);
-  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
-  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
-  double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
-  double tpr = estimate_table_accuracy(q2_over_sqrt,spr);
-  double accuracy = sqrt(lpr*lpr + spr*spr + tpr*tpr);
-
-  // free local memory
-
-  memory->destroy(acons);
-
-  // print info
-
-  if (me == 0) {
-#ifdef FFT_SINGLE
-    const char fft_prec[] = "single";
-#else
-    const char fft_prec[] = "double";
-#endif
-    if (screen) {
-      fprintf(screen,"  G vector (1/distance)= %g\n",g_ewald);
-      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
-      fprintf(screen,"  stencil order = %d\n",order);
-      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
-              accuracy);
-      fprintf(screen,"  estimated relative force accuracy = %g\n",
-              accuracy/two_charge_force);
-      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
-    }
-    if (logfile) {
-      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
-      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
-      fprintf(logfile,"  stencil order = %d\n",order);
-      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
-              accuracy);
-      fprintf(logfile,"  estimated relative force accuracy = %g\n",
-              accuracy/two_charge_force);
-      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   check if all factors of n are in list of factors
-   return 1 if yes, 0 if no
-------------------------------------------------------------------------- */
-
-int PPPMOld::factorable(int n)
-{
-  int i;
-
-  while (n > 1) {
-    for (i = 0; i < nfactors; i++) {
-      if (n % factors[i] == 0) {
-        n /= factors[i];
-        break;
-      }
-    }
-    if (i == nfactors) return 0;
-  }
-
-  return 1;
-}
-
-/* ----------------------------------------------------------------------
-   compute RMS accuracy for a dimension
-------------------------------------------------------------------------- */
-
-double PPPMOld::rms(double h, double prd, bigint natoms,
-                 double q2, double **acons)
-{
-  double sum = 0.0;
-  for (int m = 0; m < order; m++)
-    sum += acons[order][m] * pow(h*g_ewald,2.0*m);
-  double value = q2 * pow(h*g_ewald,(double)order) *
-    sqrt(g_ewald*prd*sqrt(2.0*MY_PI)*sum/natoms) / (prd*prd);
-  return value;
-}
-
-/* ----------------------------------------------------------------------
-   compute difference in real-space and KSpace RMS accuracy
-------------------------------------------------------------------------- */
-
-double PPPMOld::diffpr(double h_x, double h_y, double h_z, double q2,
-                    double **acons)
-{
-  double lprx,lpry,lprz,kspace_prec,real_prec;
-  double xprd = domain->xprd;
-  double yprd = domain->yprd;
-  double zprd = domain->zprd;
-  bigint natoms = atom->natoms;
-
-  lprx = rms(h_x,xprd,natoms,q2,acons);
-  lpry = rms(h_y,yprd,natoms,q2,acons);
-  lprz = rms(h_z,zprd*slab_volfactor,natoms,q2,acons);
-  kspace_prec = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
-  real_prec = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) /
-   sqrt(static_cast<double>(natoms)*cutoff*xprd*yprd*zprd);
-  double value = kspace_prec - real_prec;
-  return value;
-}
-
-/* ----------------------------------------------------------------------
-   pre-compute Green's function denominator expansion coeffs, Gamma(2n)
-------------------------------------------------------------------------- */
-
-void PPPMOld::compute_gf_denom()
-{
-  int k,l,m;
-
-  for (l = 1; l < order; l++) gf_b[l] = 0.0;
-  gf_b[0] = 1.0;
-
-  for (m = 1; m < order; m++) {
-    for (l = m; l > 0; l--)
-      gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1));
-    gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5));
-  }
-
-  bigint ifact = 1;
-  for (k = 1; k < 2*order; k++) ifact *= k;
-  double gaminv = 1.0/ifact;
-  for (l = 0; l < order; l++) gf_b[l] *= gaminv;
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to accumulate full density in brick decomposition
-   remap density from 3d brick decomposition to FFT decomposition
-------------------------------------------------------------------------- */
-
-void PPPMOld::brick2fft()
-{
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-
-  // pack my ghosts for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in+1; ix <= nxhi_out; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_out; ix < nxlo_in; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // pack my ghosts for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my real cells
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        buf1[n++] = density_brick[iz][iy][ix];
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_brick[iz][iy][ix] += buf2[n++];
-
-  // remap from 3d brick decomposition to FFT decomposition
-  // copy grabs inner portion of density from 3d brick
-  // remap could be done as pre-stage of FFT,
-  //   but this works optimally on only double values, not complex values
-
-  n = 0;
-  for (iz = nzlo_in; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++)
-        density_fft[n++] = density_brick[iz][iy][ix];
-
-  remap->perform(density_fft,density_fft,work1);
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to fill ghost cells of my brick with field values
-------------------------------------------------------------------------- */
-
-void PPPMOld::fillbrick()
-{
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-
-  // pack my real cells for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-
-  // pack my real cells for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-
-  // pack my real cells for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-
-  // pack my real cells for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-
-  // pack my real cells for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_out; ix < nxlo_in; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-
-  // pack my real cells for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) {
-        buf1[n++] = vdx_brick[iz][iy][ix];
-        buf1[n++] = vdy_brick[iz][iy][ix];
-        buf1[n++] = vdz_brick[iz][iy][ix];
-      }
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf2[i] = buf1[i];
-  else {
-    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in+1; ix <= nxhi_out; ix++) {
-        vdx_brick[iz][iy][ix] = buf2[n++];
-        vdy_brick[iz][iy][ix] = buf2[n++];
-        vdz_brick[iz][iy][ix] = buf2[n++];
-      }
-}
-
-/* ----------------------------------------------------------------------
-   ghost-swap to fill ghost cells of my brick with per-atom field values
-------------------------------------------------------------------------- */
-
-void PPPMOld::fillbrick_peratom()
-{
-  int i,n,ix,iy,iz;
-  MPI_Request request;
-
-  // pack my real cells for +z processor
-  // pass data to self or +z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[2][1] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz < nzlo_in; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-
-  // pack my real cells for -z processor
-  // pass data to self or -z processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[2][0] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzhi_in+1; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-
-  // pack my real cells for +y processor
-  // pass data to self or +y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[1][1] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy < nylo_in; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-
-  // pack my real cells for -y processor
-  // pass data to self or -y processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[1][0] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nyhi_in+1; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-
-  // pack my real cells for +x processor
-  // pass data to self or +x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[0][1] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_out; ix < nxlo_in; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-
-  // pack my real cells for -x processor
-  // pass data to self or -x processor
-  // unpack and sum recv data into my ghost cells
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) {
-        if (eflag_atom) buf3[n++] = u_brick[iz][iy][ix];
-        if (vflag_atom) {
-          buf3[n++] = v0_brick[iz][iy][ix];
-          buf3[n++] = v1_brick[iz][iy][ix];
-          buf3[n++] = v2_brick[iz][iy][ix];
-          buf3[n++] = v3_brick[iz][iy][ix];
-          buf3[n++] = v4_brick[iz][iy][ix];
-          buf3[n++] = v5_brick[iz][iy][ix];
-        }
-      }
-
-  if (comm->procneigh[0][0] == me)
-    for (i = 0; i < n; i++) buf4[i] = buf3[i];
-  else {
-    MPI_Irecv(buf4,nbuf_peratom,MPI_FFT_SCALAR,
-              comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf3,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
-    MPI_Wait(&request,MPI_STATUS_IGNORE);
-  }
-
-  n = 0;
-  for (iz = nzlo_out; iz <= nzhi_out; iz++)
-    for (iy = nylo_out; iy <= nyhi_out; iy++)
-      for (ix = nxhi_in+1; ix <= nxhi_out; ix++) {
-        if (eflag_atom) u_brick[iz][iy][ix] = buf4[n++];
-        if (vflag_atom) {
-          v0_brick[iz][iy][ix] = buf4[n++];
-          v1_brick[iz][iy][ix] = buf4[n++];
-          v2_brick[iz][iy][ix] = buf4[n++];
-          v3_brick[iz][iy][ix] = buf4[n++];
-          v4_brick[iz][iy][ix] = buf4[n++];
-          v5_brick[iz][iy][ix] = buf4[n++];
-        }
-      }
-}
-
-/* ----------------------------------------------------------------------
-   find center grid pt for each of my particles
-   check that full stencil for the particle will fit in my 3d brick
-   store central grid pt indices in part2grid array
-------------------------------------------------------------------------- */
-
-void PPPMOld::particle_map()
-{
-  int nx,ny,nz;
-
-  double **x = atom->x;
-  int nlocal = atom->nlocal;
-
-  int flag = 0;
-  for (int i = 0; i < nlocal; i++) {
-
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // current particle coord can be outside global and local box
-    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
-
-    nx = static_cast<int> ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET;
-    ny = static_cast<int> ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET;
-    nz = static_cast<int> ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET;
-
-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
-
-    // check that entire stencil around nx,ny,nz will fit in my 3d brick
-
-    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
-        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
-      flag = 1;
-  }
-
-  if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM");
-}
-
-/* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
-------------------------------------------------------------------------- */
-
-void PPPMOld::make_rho()
-{
-  // start from an all-zero density brick: ngrid values are cleared,
-  // beginning at the lowest (ghost-inclusive) corner of the brick
-
-  memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0,
-         ngrid*sizeof(FFT_SCALAR));
-
-  double *q = atom->q;
-  double **x = atom->x;
-  const int nlocal = atom->nlocal;
-
-  // deposit every owned charge onto the stencil of grid points around it
-  // (ix,iy,iz) = grid pt recorded for the charge in part2grid
-  // (dx,dy,dz) = offsets from that grid pt, passed to compute_rho1d()
-  // (mx,my,mz) = grid pt currently visited by the stencil
-
-  for (int i = 0; i < nlocal; i++) {
-    const int ix = part2grid[i][0];
-    const int iy = part2grid[i][1];
-    const int iz = part2grid[i][2];
-
-    const FFT_SCALAR dx = ix+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    const FFT_SCALAR dy = iy+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    const FFT_SCALAR dz = iz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    // fills rho1d[0..2][nlower..nupper] with the 1d weights
-
-    compute_rho1d(dx,dy,dz);
-
-    // the weight is built up one dimension at a time: z, then y, then x
-
-    const FFT_SCALAR qscaled = delvolinv * q[i];
-
-    for (int n = nlower; n <= nupper; n++) {
-      const int mz = n+iz;
-      const FFT_SCALAR wz = qscaled*rho1d[2][n];
-
-      for (int m = nlower; m <= nupper; m++) {
-        const int my = m+iy;
-        const FFT_SCALAR wzy = wz*rho1d[1][m];
-
-        for (int l = nlower; l <= nupper; l++) {
-          const int mx = l+ix;
-          density_brick[mz][my][mx] += wzy*rho1d[0][l];
-        }
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   FFT-based Poisson solver
-------------------------------------------------------------------------- */
-
-void PPPMOld::poisson(int,int)
-{
-  // Steps performed here:
-  //   1. forward FFT of density_fft (packed into work1 as value,0 pairs)
-  //   2. optional accumulation of global energy/virial from work1
-  //   3. scaling of work1 by scaleinv * greensfn
-  //   4. optional per-atom pass via poisson_peratom()
-  //   5. one inverse FFT per direction; every other entry of the result
-  //      (the even-indexed ones) is stored in vdx_brick/vdy_brick/vdz_brick
-  // The two int arguments are unnamed and not used in this function.
-
-  int i,j,k,n;
-  double eng;
-
-  // transform charge density (r -> k)
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    work1[n++] = density_fft[i];
-    work1[n++] = ZEROF;
-  }
-
-  fft1->compute(work1,work1,1);
-
-  // global energy and virial contribution
-
-  // promote to double before multiplying: the product of the three grid
-  // dimensions can exceed the range of integer arithmetic for large grids
-
-  double scaleinv = 1.0/((double) nx_pppm*ny_pppm*nz_pppm);
-  double s2 = scaleinv*scaleinv;
-
-  if (eflag_global || vflag_global) {
-    if (vflag_global) {
-      n = 0;
-      for (i = 0; i < nfft; i++) {
-        eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
-        for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j];
-        if (eflag_global) energy += eng;
-        n += 2;
-      }
-    } else {
-      n = 0;
-      for (i = 0; i < nfft; i++) {
-        energy +=
-          s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]);
-        n += 2;
-      }
-    }
-  }
-
-  // scale by 1/total-grid-pts to get rho(k)
-  // multiply by Green's function to get V(k)
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    work1[n++] *= scaleinv * greensfn[i];
-    work1[n++] *= scaleinv * greensfn[i];
-  }
-
-  // extra FFTs for per-atom energy/virial
-
-  if (evflag_atom) poisson_peratom();
-
-  // compute gradients of V(r) in each of 3 dims by transforming -ik*V(k)
-  // FFT leaves data in 3d brick decomposition
-  // copy it into inner portion of vdx,vdy,vdz arrays
-
-  // x direction gradient
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        work2[n] = fkx[i]*work1[n+1];
-        work2[n+1] = -fkx[i]*work1[n];
-        n += 2;
-      }
-
-  fft2->compute(work2,work2,-1);
-
-  n = 0;
-  for (k = nzlo_in; k <= nzhi_in; k++)
-    for (j = nylo_in; j <= nyhi_in; j++)
-      for (i = nxlo_in; i <= nxhi_in; i++) {
-        vdx_brick[k][j][i] = work2[n];
-        n += 2;
-      }
-
-  // y direction gradient
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        work2[n] = fky[j]*work1[n+1];
-        work2[n+1] = -fky[j]*work1[n];
-        n += 2;
-      }
-
-  fft2->compute(work2,work2,-1);
-
-  n = 0;
-  for (k = nzlo_in; k <= nzhi_in; k++)
-    for (j = nylo_in; j <= nyhi_in; j++)
-      for (i = nxlo_in; i <= nxhi_in; i++) {
-        vdy_brick[k][j][i] = work2[n];
-        n += 2;
-      }
-
-  // z direction gradient
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        work2[n] = fkz[k]*work1[n+1];
-        work2[n+1] = -fkz[k]*work1[n];
-        n += 2;
-      }
-
-  fft2->compute(work2,work2,-1);
-
-  n = 0;
-  for (k = nzlo_in; k <= nzhi_in; k++)
-    for (j = nylo_in; j <= nyhi_in; j++)
-      for (i = nxlo_in; i <= nxhi_in; i++) {
-        vdz_brick[k][j][i] = work2[n];
-        n += 2;
-      }
-}
-
-/* ----------------------------------------------------------------------
-   FFT-based Poisson solver for per-atom energy/virial
-------------------------------------------------------------------------- */
-
-void PPPMOld::poisson_peratom()
-{
-  int i,j,k,n;
-
-  // energy: inverse-transform a copy of work1 and keep every other
-  // entry of the result in u_brick
-
-  if (eflag_atom) {
-    for (i = 0, n = 0; i < nfft; i++, n += 2) {
-      work2[n] = work1[n];
-      work2[n+1] = work1[n+1];
-    }
-
-    fft2->compute(work2,work2,-1);
-
-    n = 0;
-    for (k = nzlo_in; k <= nzhi_in; k++)
-      for (j = nylo_in; j <= nyhi_in; j++)
-        for (i = nxlo_in; i <= nxhi_in; i++) {
-          u_brick[k][j][i] = work2[n];
-          n += 2;
-        }
-  }
-
-  if (!vflag_atom) return;
-
-  // 6 components of virial in v0 thru v5
-  // component c uses column c of vg and is stored in the c-th brick
-
-  FFT_SCALAR ***vbricks[6] = {v0_brick,v1_brick,v2_brick,
-                              v3_brick,v4_brick,v5_brick};
-
-  for (int c = 0; c < 6; c++) {
-    FFT_SCALAR ***target = vbricks[c];
-
-    // weight both entries of each pair in work1 by vg[i][c]
-
-    for (i = 0, n = 0; i < nfft; i++, n += 2) {
-      work2[n] = work1[n]*vg[i][c];
-      work2[n+1] = work1[n+1]*vg[i][c];
-    }
-
-    fft2->compute(work2,work2,-1);
-
-    // keep every other entry of the transformed data
-
-    n = 0;
-    for (k = nzlo_in; k <= nzhi_in; k++)
-      for (j = nylo_in; j <= nyhi_in; j++)
-        for (i = nxlo_in; i <= nxhi_in; i++) {
-          target[k][j][i] = work2[n];
-          n += 2;
-        }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMOld::fieldforce()
-{
-  // for each owned charge: rebuild its 1d stencil weights, gather the
-  // three gradient bricks over the stencil, and add the result to f
-  // (ix,iy,iz) = grid pt recorded for the charge in part2grid
-  // (dx,dy,dz) = offsets from that grid pt, passed to compute_rho1d()
-  // (mx,my,mz) = grid pt currently visited by the stencil
-
-  double *q = atom->q;
-  double **x = atom->x;
-  double **f = atom->f;
-
-  const int nlocal = atom->nlocal;
-
-  for (int i = 0; i < nlocal; i++) {
-    const int ix = part2grid[i][0];
-    const int iy = part2grid[i][1];
-    const int iz = part2grid[i][2];
-
-    const FFT_SCALAR dx = ix+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    const FFT_SCALAR dy = iy+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    const FFT_SCALAR dz = iz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    compute_rho1d(dx,dy,dz);
-
-    // ek = 3 components of E-field on particle
-
-    FFT_SCALAR ekx = ZEROF;
-    FFT_SCALAR eky = ZEROF;
-    FFT_SCALAR ekz = ZEROF;
-
-    for (int n = nlower; n <= nupper; n++) {
-      const int mz = n+iz;
-      const FFT_SCALAR wz = rho1d[2][n];
-
-      for (int m = nlower; m <= nupper; m++) {
-        const int my = m+iy;
-        const FFT_SCALAR wzy = wz*rho1d[1][m];
-
-        for (int l = nlower; l <= nupper; l++) {
-          const int mx = l+ix;
-          const FFT_SCALAR w = wzy*rho1d[0][l];
-          ekx -= w*vdx_brick[mz][my][mx];
-          eky -= w*vdy_brick[mz][my][mx];
-          ekz -= w*vdz_brick[mz][my][mx];
-        }
-      }
-    }
-
-    // convert E-field to force
-    // the z component is left untouched when slabflag == 2
-
-    const double qfactor = qqrd2e * scale * q[i];
-    f[i][0] += qfactor*ekx;
-    f[i][1] += qfactor*eky;
-    if (slabflag != 2) f[i][2] += qfactor*ekz;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get per-atom energy/virial
-------------------------------------------------------------------------- */
-
-void PPPMOld::fieldforce_peratom()
-{
-  // for each owned charge: rebuild its 1d stencil weights and gather the
-  // per-atom energy brick and/or the six virial bricks over the stencil
-  // (ix,iy,iz) = grid pt recorded for the charge in part2grid
-  // (dx,dy,dz) = offsets from that grid pt, passed to compute_rho1d()
-  // (mx,my,mz) = grid pt currently visited by the stencil
-
-  double *q = atom->q;
-  double **x = atom->x;
-
-  const int nlocal = atom->nlocal;
-
-  for (int i = 0; i < nlocal; i++) {
-    const int ix = part2grid[i][0];
-    const int iy = part2grid[i][1];
-    const int iz = part2grid[i][2];
-
-    const FFT_SCALAR dx = ix+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    const FFT_SCALAR dy = iy+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    const FFT_SCALAR dz = iz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    compute_rho1d(dx,dy,dz);
-
-    // usum = interpolated energy term, vsum[0..5] = interpolated virial terms
-
-    FFT_SCALAR usum = ZEROF;
-    FFT_SCALAR vsum[6];
-    for (int c = 0; c < 6; c++) vsum[c] = ZEROF;
-
-    for (int n = nlower; n <= nupper; n++) {
-      const int mz = n+iz;
-      const FFT_SCALAR wz = rho1d[2][n];
-
-      for (int m = nlower; m <= nupper; m++) {
-        const int my = m+iy;
-        const FFT_SCALAR wzy = wz*rho1d[1][m];
-
-        for (int l = nlower; l <= nupper; l++) {
-          const int mx = l+ix;
-          const FFT_SCALAR w = wzy*rho1d[0][l];
-
-          if (eflag_atom) usum += w*u_brick[mz][my][mx];
-          if (vflag_atom) {
-            vsum[0] += w*v0_brick[mz][my][mx];
-            vsum[1] += w*v1_brick[mz][my][mx];
-            vsum[2] += w*v2_brick[mz][my][mx];
-            vsum[3] += w*v3_brick[mz][my][mx];
-            vsum[4] += w*v4_brick[mz][my][mx];
-            vsum[5] += w*v5_brick[mz][my][mx];
-          }
-        }
-      }
-    }
-
-    // the energy term is weighted by the charge here; the virial terms
-    // are added as interpolated
-
-    if (eflag_atom) eatom[i] += q[i]*usum;
-    if (vflag_atom)
-      for (int c = 0; c < 6; c++) vatom[i][c] += vsum[c];
-  }
-}
-
-/* ----------------------------------------------------------------------
-   map nprocs to NX by NY grid as PX by PY procs - return optimal px,py
-------------------------------------------------------------------------- */
-
-void PPPMOld::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py)
-{
-  // try every ipx that divides nprocs, with ipy = nprocs/ipx
-  // boxx,boxy = rounded-up grid extent per proc in each dim
-  // surf = boxx + boxy; the smallest surf wins, and on a tie the
-  //   candidate with the larger boxx*boxy replaces the current best
-  // px,py are written only when a candidate is accepted
-
-  int bestsurf = 2 * (nx + ny);
-  int bestboxx = 0;
-  int bestboxy = 0;
-
-  for (int ipx = 1; ipx <= nprocs; ipx++) {
-    if (nprocs % ipx != 0) continue;
-
-    const int ipy = nprocs/ipx;
-
-    int boxx = nx/ipx;
-    if (nx % ipx) boxx++;
-    int boxy = ny/ipy;
-    if (ny % ipy) boxy++;
-
-    const int surf = boxx + boxy;
-
-    const bool smaller = surf < bestsurf;
-    const bool tie_but_bigger =
-      (surf == bestsurf) && (boxx*boxy > bestboxx*bestboxy);
-
-    if (smaller || tie_but_bigger) {
-      bestsurf = surf;
-      bestboxx = boxx;
-      bestboxy = boxy;
-      *px = ipx;
-      *py = ipy;
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   charge assignment into rho1d
-   dx,dy,dz = distance of particle from "lower left" grid point
-------------------------------------------------------------------------- */
-
-void PPPMOld::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy,
-                         const FFT_SCALAR &dz)
-{
-  // for each stencil index k in (1-order)/2 .. order/2, evaluate the
-  // polynomial with coefficients rho_coeff[0..order-1][k] at dx, dy and dz
-  // (highest coefficient first) and store the results in rho1d[0..2][k]
-
-  const int kfirst = (1-order)/2;
-  const int klast = order/2;
-
-  for (int k = kfirst; k <= klast; k++) {
-    FFT_SCALAR wx = ZEROF;
-    FFT_SCALAR wy = ZEROF;
-    FFT_SCALAR wz = ZEROF;
-
-    for (int l = order-1; l >= 0; l--) {
-      const FFT_SCALAR coeff = rho_coeff[l][k];
-      wx = coeff + wx*dx;
-      wy = coeff + wy*dy;
-      wz = coeff + wz*dz;
-    }
-
-    rho1d[0][k] = wx;
-    rho1d[1][k] = wy;
-    rho1d[2][k] = wz;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   generate coefficients for the weight function of order n
-
-              (n-1)
-  Wn(x) =     Sum    wn(k,x) , Sum is over every other integer
-           k=-(n-1)
-  For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1
-      k is odd integers if n is even and even integers if n is odd
-              ---
-             | n-1
-             | Sum a(l,j)*(x-k/2)**l   if abs(x-k/2) < 1/2
-  wn(k,x) = <  l=0
-             |
-             |  0                       otherwise
-              ---
-  a coefficients are packed into the array rho_coeff to eliminate zeros
-  rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k)
-------------------------------------------------------------------------- */
-
-void PPPMOld::compute_rho_coeff()
-{
-  // builds a temporary table a[l][k] by recurrence, then copies every
-  // other k column of it into rho_coeff
-
-  int j,k,l,m;
-  FFT_SCALAR s;
-
-  // scratch table, indexed below as a[0..order-1][-order..order]
-
-  FFT_SCALAR **a;
-  memory->create2d_offset(a,order,-order,order,"pppm:a");
-
-  for (k = -order; k <= order; k++)
-    for (l = 0; l < order; l++)
-      a[l][k] = 0.0;
-
-  // seed, then for each j fill columns k = -j,-j+2,...,j from the
-  // neighboring columns k-1 and k+1; s accumulates the l = 0 entry,
-  // which is written only after the inner loop has finished
-
-  a[0][0] = 1.0;
-  for (j = 1; j < order; j++) {
-    for (k = -j; k <= j; k += 2) {
-      s = 0.0;
-      for (l = 0; l < j; l++) {
-        a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1);
-#ifdef FFT_SINGLE
-        s += powf(0.5,(float) l+1) *
-          (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1);
-#else
-        s += pow(0.5,(double) l+1) *
-          (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1);
-#endif
-      }
-      a[0][k] = s;
-    }
-  }
-
-  // pack columns k = -(order-1), -(order-1)+2, ..., order-1 of a into
-  // consecutive columns of rho_coeff, starting at column (1-order)/2
-
-  m = (1-order)/2;
-  for (k = -(order-1); k < order; k += 2) {
-    for (l = 0; l < order; l++)
-      rho_coeff[l][m] = a[l][k];
-    m++;
-  }
-
-  memory->destroy2d_offset(a,-order);
-}
-
-/* ----------------------------------------------------------------------
-   Slab-geometry correction term to dampen inter-slab interactions between
-   periodically repeating slabs.  Yields good approximation to 2D Ewald if
-   adequate empty space is left between repeating slabs (J. Chem. Phys.
-   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
-   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
-------------------------------------------------------------------------- */
-
-void PPPMOld::slabcorr()
-{
-  // adds the slab correction to the global energy (if eflag_global),
-  // to eatom (if eflag_atom), and always to the z force of owned atoms
-
-  // compute local contribution to global dipole moment
-
-  double *q = atom->q;
-  double **x = atom->x;
-  double zprd = domain->zprd;
-  int nlocal = atom->nlocal;
-
-  // dipole = sum of q*z over owned atoms
-
-  double dipole = 0.0;
-  for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2];
-
-  // sum local contributions to get global dipole moment
-
-  double dipole_all;
-  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
-
-  // need to make non-neutral systems and/or
-  //  per-atom energy translationally invariant
-
-  // dipole_r2 = sum of q*z*z over all procs; stays 0.0 when the
-  // condition below is false
-
-  double dipole_r2 = 0.0;
-  if (eflag_atom || fabs(qsum) > SMALL) {
-    for (int i = 0; i < nlocal; i++)
-      dipole_r2 += q[i]*x[i][2]*x[i][2];
-
-    // sum local contributions
-
-    double tmp;
-    MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
-    dipole_r2 = tmp;
-  }
-
-  // compute corrections
-
-  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all -
-    qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume;
-  const double qscale = qqrd2e * scale;
-
-  if (eflag_global) energy += qscale * e_slabcorr;
-
-  // per-atom energy
-
-  if (eflag_atom) {
-    double efact = qscale * MY_2PI/volume;
-    for (int i = 0; i < nlocal; i++)
-      eatom[i] += efact * q[i]*(x[i][2]*dipole_all - 0.5*(dipole_r2 +
-        qsum*x[i][2]*x[i][2]) - qsum*zprd*zprd/12.0);
-  }
-
-  // add on force corrections
-  // only the z component of the force is modified
-
-  double ffact = qscale * (-4.0*MY_PI/volume);
-  double **f = atom->f;
-
-  for (int i = 0; i < nlocal; i++) f[i][2] += ffact * q[i]*(dipole_all - qsum*x[i][2]);
-}
-
-
-/* ----------------------------------------------------------------------
-   perform and time the 1d FFTs required for N timesteps
-------------------------------------------------------------------------- */
-
-int PPPMOld::timing_1d(int n, double &time1d)
-{
-  // zero the work buffer, then time n rounds of one fft1 and three fft2
-  // timing1d() calls between two barriers
-  // time1d receives the elapsed wall time; the return value is the
-  // number of timing1d() calls made per round
-
-  const int nwork = 2*nfft_both;
-  for (int idx = 0; idx < nwork; idx++) work1[idx] = ZEROF;
-
-  MPI_Barrier(world);
-  const double tstart = MPI_Wtime();
-
-  for (int round = 0; round < n; round++) {
-    fft1->timing1d(work1,nfft_both,1);
-    for (int rep = 0; rep < 3; rep++)
-      fft2->timing1d(work1,nfft_both,-1);
-  }
-
-  MPI_Barrier(world);
-  const double tstop = MPI_Wtime();
-  time1d = tstop - tstart;
-
-  return 4;
-}
-
-/* ----------------------------------------------------------------------
-   perform and time the 3d FFTs required for N timesteps
-------------------------------------------------------------------------- */
-
-int PPPMOld::timing_3d(int n, double &time3d)
-{
-  // zero the work buffer, then time n rounds of one fft1 and three fft2
-  // in-place compute() calls between two barriers
-  // time3d receives the elapsed wall time; the return value is the
-  // number of compute() calls made per round
-
-  const int nwork = 2*nfft_both;
-  for (int idx = 0; idx < nwork; idx++) work1[idx] = ZEROF;
-
-  MPI_Barrier(world);
-  const double tstart = MPI_Wtime();
-
-  for (int round = 0; round < n; round++) {
-    fft1->compute(work1,work1,1);
-    for (int rep = 0; rep < 3; rep++)
-      fft2->compute(work1,work1,-1);
-  }
-
-  MPI_Barrier(world);
-  const double tstop = MPI_Wtime();
-  time3d = tstop - tstart;
-
-  return 4;
-}
-
-/* ----------------------------------------------------------------------
-   memory usage of local arrays
-------------------------------------------------------------------------- */
-
-double PPPMOld::memory_usage()
-{
-  // returns a byte count built from nmax, the ghost-inclusive brick size,
-  // nfft_both and the buffer sizes, plus the per-atom and group-group
-  // arrays when their allocate flags are set
-  // every product is formed in double so that large counts cannot
-  // overflow integer arithmetic before being added to the total
-
-  double bytes = 3.0 * nmax * sizeof(double);
-
-  // number of grid points in my brick, including ghosts
-
-  const double nbrick = (double) (nxhi_out-nxlo_out+1) *
-    (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1);
-
-  bytes += 4.0 * nbrick * sizeof(FFT_SCALAR);
-  bytes += 6.0 * nfft_both * sizeof(double);
-  bytes += 1.0 * nfft_both * sizeof(double);
-  bytes += 5.0 * nfft_both * sizeof(FFT_SCALAR);
-  bytes += 2.0 * nbuf * sizeof(FFT_SCALAR);
-
-  if (peratom_allocate_flag) {
-    bytes += 7.0 * nbrick * sizeof(FFT_SCALAR);
-    bytes += 2.0 * nbuf_peratom * sizeof(FFT_SCALAR);
-  }
-
-  if (group_allocate_flag) {
-    bytes += 2.0 * nbrick * sizeof(FFT_SCALAR);
-    bytes += 2.0 * nfft_both * sizeof(FFT_SCALAR);
-  }
-
-  return bytes;
-}
-
-/* ----------------------------------------------------------------------
-   group-group interactions
- ------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   compute the PPPM total long-range force and energy for groups A and B
- ------------------------------------------------------------------------- */
-
-void PPPMOld::compute_group_group(int groupbit_A, int groupbit_B, int BA_flag)
-{
-  // results are left in e2group (energy) and f2group[0..2] (force)
-  // groupbit_A, groupbit_B and BA_flag are forwarded to make_rho_groups();
-  // BA_flag is also forwarded to poisson_groups()
-
-  if (slabflag)
-    error->all(FLERR,"Cannot (yet) use K-space slab "
-               "correction with compute group/group");
-
-  int i;
-
-  // group arrays are allocated on first use only
-
-  if (!group_allocate_flag) {
-    allocate_groups();
-    group_allocate_flag = 1;
-  }
-
-  e2group = 0; //energy
-  f2group[0] = 0; //force in x-direction
-  f2group[1] = 0; //force in y-direction
-  f2group[2] = 0; //force in z-direction
-
-  // map my particle charge onto my local 3d density grid
-
-  make_rho_groups(groupbit_A,groupbit_B,BA_flag);
-
-  // all procs communicate density values from their ghost cells
-  //   to fully sum contribution in their 3d bricks
-  // remap from 3d decomposition to FFT decomposition
-
-  // temporarily store and switch pointers so we can
-  //  use brick2fft() for groups A and B (without
-  //  writing an additional function)
-
-  FFT_SCALAR ***density_brick_real = density_brick;
-  FFT_SCALAR *density_fft_real = density_fft;
-
-  // group A
-
-  density_brick = density_A_brick;
-  density_fft = density_A_fft;
-
-  brick2fft();
-
-  // group B
-
-  density_brick = density_B_brick;
-  density_fft = density_B_fft;
-
-  brick2fft();
-
-  // switch back pointers
-
-  density_brick = density_brick_real;
-  density_fft = density_fft_real;
-
-  // compute potential gradient on my FFT grid and
-  //   portion of group-group energy/force on this proc's FFT grid
-
-  poisson_groups(BA_flag);
-
-  const double qscale = qqrd2e * scale;
-
-  // total group A <--> group B energy
-  // self and boundary correction terms are in compute_group_group.cpp
-
-  double e2group_all;
-  MPI_Allreduce(&e2group,&e2group_all,1,MPI_DOUBLE,MPI_SUM,world);
-  e2group = e2group_all;
-
-  e2group *= qscale*0.5*volume;
-
-  // total group A <--> group B force
-  // summed over all procs, then scaled by qscale*volume
-
-  double f2group_all[3];
-  MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world);
-
-  for (i = 0; i < 3; i++) f2group[i] = qscale*volume*f2group_all[i];
-}
-
-/* ----------------------------------------------------------------------
- allocate group-group memory that depends on # of K-vectors and order
- ------------------------------------------------------------------------- */
-
-void PPPMOld::allocate_groups()
-{
-  // two 3d bricks spanning the ghost-inclusive (*_out) index ranges and
-  // two 1d arrays of length nfft_both, one of each per group (A and B)
-
-  memory->create3d_offset(density_A_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_A_brick");
-  memory->create3d_offset(density_B_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_B_brick");
-  memory->create(density_A_fft,nfft_both,"pppm:density_A_fft");
-  memory->create(density_B_fft,nfft_both,"pppm:density_B_fft");
-}
-
-/* ----------------------------------------------------------------------
- deallocate group-group memory that depends on # of K-vectors and order
- ------------------------------------------------------------------------- */
-
-void PPPMOld::deallocate_groups()
-{
-  // release the four arrays created in allocate_groups(); the 3d bricks
-  // are destroyed with the same lower offsets they were created with
-
-  memory->destroy3d_offset(density_A_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy3d_offset(density_B_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->destroy(density_A_fft);
-  memory->destroy(density_B_fft);
-}
-
-/* ----------------------------------------------------------------------
- create discretized "density" on section of global grid due to my particles
- density(x,y,z) = charge "density" at grid points of my 3d brick
- (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
- in global grid for group-group interactions
- ------------------------------------------------------------------------- */
-
-void PPPMOld::make_rho_groups(int groupbit_A, int groupbit_B, int BA_flag)
-{
-  // clear 3d density arrays
-
-  memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0,
-         ngrid*sizeof(FFT_SCALAR));
-
-  memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0,
-         ngrid*sizeof(FFT_SCALAR));
-
-  double *q = atom->q;
-  double **x = atom->x;
-  const int nlocal = atom->nlocal;
-  int *mask = atom->mask;
-
-  // deposit every owned charge that belongs to group A and/or group B
-  // (ix,iy,iz) = grid pt recorded for the charge in part2grid
-  // (dx,dy,dz) = offsets from that grid pt, passed to compute_rho1d()
-  // (mx,my,mz) = grid pt currently visited by the stencil
-
-  for (int i = 0; i < nlocal; i++) {
-    const int in_A = mask[i] & groupbit_A;
-    const int in_B = mask[i] & groupbit_B;
-
-    // atoms in both groups are skipped when BA_flag is set
-
-    if (in_A && in_B && BA_flag) continue;
-
-    // atoms in neither group contribute nothing
-
-    if (!in_A && !in_B) continue;
-
-    const int ix = part2grid[i][0];
-    const int iy = part2grid[i][1];
-    const int iz = part2grid[i][2];
-
-    const FFT_SCALAR dx = ix+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    const FFT_SCALAR dy = iy+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    const FFT_SCALAR dz = iz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-    compute_rho1d(dx,dy,dz);
-
-    const FFT_SCALAR qscaled = delvolinv * q[i];
-
-    for (int n = nlower; n <= nupper; n++) {
-      const int mz = n+iz;
-      const FFT_SCALAR wz = qscaled*rho1d[2][n];
-
-      for (int m = nlower; m <= nupper; m++) {
-        const int my = m+iy;
-        const FFT_SCALAR wzy = wz*rho1d[1][m];
-
-        for (int l = nlower; l <= nupper; l++) {
-          const int mx = l+ix;
-          const FFT_SCALAR contrib = wzy*rho1d[0][l];
-
-          // the same contribution goes to each group the atom is in
-
-          if (in_A) density_A_brick[mz][my][mx] += contrib;
-          if (in_B) density_B_brick[mz][my][mx] += contrib;
-        }
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   FFT-based Poisson solver for group-group interactions
- ------------------------------------------------------------------------- */
-
-void PPPMOld::poisson_groups(int BA_flag)
-{
-  // accumulates this proc's share of the group-group energy into e2group
-  // and, unless BA_flag is set, of the group-group force into f2group[0..2]
-  // work1 and work2 are overwritten
-
-  int i,j,k,n;
-
-  // reuse memory (already declared)
-
-  FFT_SCALAR *work_A = work1;
-  FFT_SCALAR *work_B = work2;
-
-  // transform charge density (r -> k)
-
-  // group A
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    work_A[n++] = density_A_fft[i];
-    work_A[n++] = ZEROF;
-  }
-
-  fft1->compute(work_A,work_A,1);
-
-  // group B
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    work_B[n++] = density_B_fft[i];
-    work_B[n++] = ZEROF;
-  }
-
-  fft1->compute(work_B,work_B,1);
-
-  // group-group energy and force contribution,
-  //  keep everything in reciprocal space so
-  //  no inverse FFTs needed
-
-  // promote to double before multiplying: the product of the three grid
-  // dimensions can exceed the range of integer arithmetic for large grids
-
-  double scaleinv = 1.0/((double) nx_pppm*ny_pppm*nz_pppm);
-  double s2 = scaleinv*scaleinv;
-
-  // energy
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    e2group += s2 * greensfn[i] *
-      (work_A[n]*work_B[n] + work_A[n+1]*work_B[n+1]);
-    n += 2;
-  }
-
-  // with BA_flag set only the energy is accumulated
-
-  if (BA_flag) return;
-
-
-  // multiply by Green's function and s2
-  //  (only for work_A so it is not squared below)
-
-  n = 0;
-  for (i = 0; i < nfft; i++) {
-    work_A[n++] *= s2 * greensfn[i];
-    work_A[n++] *= s2 * greensfn[i];
-  }
-
-  double partial_group;
-
-  // force, x direction
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
-        f2group[0] += fkx[i] * partial_group;
-        n += 2;
-      }
-
-  // force, y direction
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
-        f2group[1] += fky[j] * partial_group;
-        n += 2;
-      }
-
-  // force, z direction
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++)
-    for (j = nylo_fft; j <= nyhi_fft; j++)
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
-        f2group[2] += fkz[k] * partial_group;
-        n += 2;
-      }
-}
diff --git a/src/USER-CUDA/pppm_old.h b/src/USER-CUDA/pppm_old.h
deleted file mode 100644
index 57a92e1202..0000000000
--- a/src/USER-CUDA/pppm_old.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef KSPACE_CLASS
-
-KSpaceStyle(pppm/old,PPPMOld)
-
-#else
-
-#ifndef LMP_PPPM_OLD_H
-#define LMP_PPPM_OLD_H
-
-#include "lmptype.h"
-#include <mpi.h>
-
-// FFT_SCALAR is the floating-point type used for the PPPM grid/FFT arrays;
-// MPI_FFT_SCALAR is the MPI datatype that matches it
-// (MPI_FLOAT is the standard MPI datatype for a C float)
-#ifdef FFT_SINGLE
-typedef float FFT_SCALAR;
-#define MPI_FFT_SCALAR MPI_FLOAT
-#else
-typedef double FFT_SCALAR;
-#define MPI_FFT_SCALAR MPI_DOUBLE
-#endif
-
-#include "kspace.h"
-
-namespace LAMMPS_NS {
-
-// KSpace style registered under the name pppm/old (see KSpaceStyle above)
-class PPPMOld : public KSpace {
- public:
-  PPPMOld(class LAMMPS *, int, char **);
-  virtual ~PPPMOld();
-  virtual void init();
-  virtual void setup();
-  virtual void compute(int, int);
-  // each timing routine returns the number of FFT calls made per iteration
-  // and passes the elapsed wall time back through its double & argument
-  virtual int timing_1d(int, double &);
-  virtual int timing_3d(int, double &);
-  virtual double memory_usage();
-
-  virtual void compute_group_group(int, int, int);
-
- protected:
-  int me,nprocs;
-  int nfactors;
-  int *factors;
-  double cutoff;
-  double volume;
-  double delxinv,delyinv,delzinv,delvolinv;
-  double shift,shiftone;
-  int peratom_allocate_flag;
-
-  // grid index bounds in each dim:
-  //   *_in  = range copied out of the inverse FFT results
-  //   *_out = range used to allocate/clear the density bricks
-  //   *_fft = range looped over in reciprocal space
-  int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in;
-  int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out;
-  int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost;
-  int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft;
-  int nlower,nupper;           // stencil index range around a particle
-  int ngrid,nfft,nfft_both;
-  int nbuf,nbuf_peratom;
-
-  FFT_SCALAR ***density_brick;
-  // vd*_brick are filled by poisson() and read by fieldforce()
-  FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick;
-  // u_brick and v0..v5_brick are filled by poisson_peratom()
-  // and read by fieldforce_peratom()
-  FFT_SCALAR ***u_brick;
-  FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick;
-  FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick;
-  double *greensfn;
-  double **vg;
-  double *fkx,*fky,*fkz;
-  FFT_SCALAR *density_fft;
-  FFT_SCALAR *work1,*work2;
-  FFT_SCALAR *buf1,*buf2,*buf3,*buf4;
-
-  double *gf_b;                // coefficients read by gf_denom()
-  FFT_SCALAR **rho1d,**rho_coeff;
-
-  // group-group interactions
-
-  int group_allocate_flag;
-  FFT_SCALAR ***density_A_brick,***density_B_brick;
-  FFT_SCALAR *density_A_fft,*density_B_fft;
-
-
-  class FFT3d *fft1,*fft2;
-  class Remap *remap;
-
-  int **part2grid;             // storage for particle -> grid mapping
-  int nmax;
-
-  int triclinic;               // domain settings, orthog or triclinic
-  double *boxlo;
-                               // TIP4P settings
-  int typeH,typeO;             // atom types of TIP4P water H and O atoms
-  double qdist;                // distance from O site to negative charge
-  double alpha;                // geometric factor
-
-  void set_grid();
-  virtual void allocate();
-  virtual void allocate_peratom();
-  virtual void deallocate();
-  virtual void deallocate_peratom();
-  int factorable(int);
-  double rms(double, double, bigint, double, double **);
-  double diffpr(double, double, double, double, double **);
-  void compute_gf_denom();
-
-  virtual void particle_map();
-  virtual void make_rho();
-  virtual void brick2fft();
-  virtual void fillbrick();
-  virtual void fillbrick_peratom();
-  virtual void poisson(int,int);
-  virtual void poisson_peratom();
-  virtual void fieldforce();
-  virtual void fieldforce_peratom();
-  void procs2grid2d(int,int,int,int *, int*);
-  void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &,
-                     const FFT_SCALAR &);
-  void compute_rho_coeff();
-  void slabcorr();
-
-  // group-group interactions
-
-  virtual void allocate_groups();
-  virtual void deallocate_groups();
-  virtual void make_rho_groups(int, int, int);
-  virtual void poisson_groups(int);
-
-/* ----------------------------------------------------------------------
-   denominator for Hockney-Eastwood Green's function
-     of x,y,z = sin(kx*deltax/2), etc
-
-            inf                 n-1
-   S(n,k) = Sum  W(k+pi*j)**2 = Sum b(l)*(z*z)**l
-           j=-inf               l=0
-
-          = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x)  at z = sin(x)
-   gf_b = denominator expansion coeffs
-------------------------------------------------------------------------- */
-
-  // evaluates the polynomial with coefficients gf_b[0..order-1] at x, y
-  // and z by Horner's rule and returns the square of the product of the
-  // three values
-
-  inline double gf_denom(const double &x, const double &y,
-                         const double &z) const {
-    double sx,sy,sz;
-    sz = sy = sx = 0.0;
-    for (int l = order-1; l >= 0; l--) {
-      sx = gf_b[l] + sx*x;
-      sy = gf_b[l] + sy*y;
-      sz = gf_b[l] + sz*z;
-    }
-    double s = sx*sy*sz;
-    return s*s;
-  };
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Illegal ... command
-
-Self-explanatory.  Check the input script syntax and compare to the
-documentation for the command.  You can use -echo screen as a
-command-line option when running LAMMPS to see the offending line.
-
-E: Cannot use PPPM with 2d simulation
-
-The kspace style pppm cannot be used in 2d simulations.  You can use
-2d PPPM in a 3d simulation; see the kspace_modify command.
-
-E: Kspace style requires atom attribute q
-
-The atom style defined does not have these attributes.
-
-E: Cannot use nonperiodic boundaries with PPPM
-
-For kspace style pppm, all 3 dimensions must have periodic boundaries
-unless you use the kspace_modify command to define a 2d slab with a
-non-periodic z dimension.
-
-E: Incorrect boundaries with slab PPPM
-
-Must have periodic x,y dimensions and non-periodic z dimension to use
-2d slab option with PPPM.
-
-E: PPPM order cannot be < 2 or > than %d
-
-This is a limitation of the PPPM implementation in LAMMPS.
-
-E: KSpace style is incompatible with Pair style
-
-Setting a kspace style requires that a pair style with a long-range
-Coulombic or dispersion component be used.
-
-E: Bond and angle potentials must be defined for TIP4P
-
-Cannot use TIP4P pair potential unless bond and angle potentials
-are defined.
-
-E: Bad TIP4P angle type for PPPM/TIP4P
-
-Specified angle type is not valid.
-
-E: Bad TIP4P bond type for PPPM/TIP4P
-
-Specified bond type is not valid.
-
-E: Cannot use kspace solver on system with no charge
-
-No atoms in system have a non-zero charge.
-
-W: System is not charge neutral, net charge = %g
-
-The total charge on all atoms on the system is not 0.0, which
-is not valid for the long-range Coulombic solvers.
-
-W: Reducing PPPM order b/c stencil extends beyond neighbor processor
-
-This may lead to a larger grid than desired.  See the kspace_modify overlap
-command to prevent changing of the PPPM order.
-
-E: PPPM grid is too large
-
-The global PPPM grid is larger than OFFSET in one or more dimensions.
-OFFSET is currently set to 4096.  You likely need to decrease the
-requested accuracy.
-
-E: PPPM order has been reduced to 0
-
-The auto-adjust of the order failed.  You will need to
-set the grid size and order directly via kspace_modify.
-
-E: KSpace accuracy must be > 0
-
-The kspace accuracy designated in the input must be greater than zero.
-
-E: Cannot compute PPPM G
-
-The Ewald factor could not be computed for the current choice of
-grid size, cutoff, accuracy.
-
-E: Out of range atoms - cannot compute PPPM
-
-One or more atoms are attempting to map their charge to a PPPM grid
-point that is not owned by a processor.  This is likely for one of two
-reasons, both of them bad.  First, it may mean that an atom near the
-boundary of a processor's sub-domain has moved more than 1/2 the
-"neighbor skin distance"_neighbor.html without neighbor lists being
-rebuilt and atoms being migrated to new processors.  This also means
-you may be missing pairwise interactions that need to be computed.
-The solution is to change the re-neighboring criteria via the
-"neigh_modify"_neigh_modify command.  The safest settings are "delay 0
-every 1 check yes".  Second, it may mean that an atom has moved far
-outside a processor's sub-domain or even the entire simulation box.
-This indicates bad physics, e.g. due to highly overlapping atoms, too
-large a timestep, etc.
-
-E: Cannot (yet) use K-space slab correction with compute group/group
-
-This option is not yet supported.
-
-*/
diff --git a/src/USER-CUDA/user_cuda.h b/src/USER-CUDA/user_cuda.h
deleted file mode 100644
index dbcc41ab3b..0000000000
--- a/src/USER-CUDA/user_cuda.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifndef CUDA_H
-#define CUDA_H
-
-#include "pointers.h"
-#include "cuda_shared.h"
-#include "cuda_data.h"
-#include "cuda_precision.h"
-#include <map>
-
-#ifdef _DEBUG
-#define MYDBG(a) a
-#else
-#define MYDBG(a)
-#endif
-
-namespace LAMMPS_NS
-{
-class Cuda : protected Pointers  // state shared by the USER-CUDA classes: flags, shared_data and cCudaData array wrappers
-{
-  public:
-    Cuda(class LAMMPS*);
-    ~Cuda();
-    //static void setDevice(class LAMMPS*);
-    void allocate();
-
-    void accelerator(int, char**);
-    void activate();
-
-    void setSharedDataZero();
-    void setSystemParams();
-
-    void setDomainParams();
-
-    void checkResize();
-    void evsetup_eatom_vatom(int eflag_atom, int vflag_atom);
-    void uploadAll();
-    void downloadAll();
-    void upload(int datamask);
-    void download(int datamask);
-    void downloadX();
-
-    class CudaNeighList* registerNeighborList(class NeighList* neigh_list);
-    void uploadAllNeighborLists();
-    void downloadAllNeighborLists();
-    void set_neighinit(int dist_check, double triggerneighsq) {  // copies both arguments into shared_data.atom
-      shared_data.atom.dist_check = dist_check;
-      shared_data.atom.triggerneighsq = triggerneighsq;
-    }
-    bool decide_by_integrator() {  // true only if the flag is set, cu_xhold is non-null and setup has finished
-      return neighbor_decide_by_integrator  && cu_xhold && finished_setup;
-    }
-    void update_xhold(int &maxhold, double* xhold);
-
-    void setTimingsZero();
-    void print_timings();
-
-    void cu_x_download() {  // thin forwarder to cu_x->download(); cu_x is not null-checked here
-      cu_x->download();
-    }
-    bool device_set;
-    bool dotiming;
-    bool dotestatom;  // copied into VerletCuda::dotestatom by VerletCuda::setup() and run()
-    int testatom;     // value VerletCuda hands to its test_atom() calls
-
-    double uploadtime, downloadtime;   // both reset to 0 at the start of VerletCuda::run()
-    bool finished_setup, begin_setup;  // VerletCuda::setup(): begin_setup=true, finished_setup=false on entry; finished_setup=true on exit
-    bool oncpu;                        // true while VerletCuda::setup() executes, false once it returns
-    bool finished_run;                 // cleared by VerletCuda::setup()
-
-    int self_comm;
-
-    int cuda_exists;
-
-    double extent[6];
-    int* debugdata;  // element 0 is zeroed by VerletCuda::setup() before cu_debugdata is uploaded
-    // data shared between host code and device code
-    // (number of atoms, device pointers for up- & download)
-    cuda_shared_data shared_data;
-
-    cCudaData<double  , F_CFLOAT , x >* cu_q;
-    cCudaData<double  , F_CFLOAT , yx>* cu_f;
-    cCudaData<double  , V_CFLOAT , x >* cu_mass;
-    cCudaData<double  , V_CFLOAT , x >* cu_rmass;
-    cCudaData<double  , V_CFLOAT , yx>* cu_v;
-    cCudaData<double  , X_CFLOAT , yx>* cu_x;
-    cCudaData<double  , X_CFLOAT , yx>* cu_xhold;  // may be null: tested in decide_by_integrator()
-    cCudaData<int     , int     , x >* cu_mask;
-    cCudaData<int     , int     , x >* cu_tag;
-    cCudaData<int     , int     , x >* cu_type;
-    cCudaData<int     , int     , x >* cu_image;
-    cCudaData<double  , ENERGY_CFLOAT, x >* cu_eatom;
-    cCudaData<double  , ENERGY_CFLOAT, yx>* cu_vatom;
-    cCudaData<double  , ENERGY_CFLOAT, x >* cu_virial;
-    cCudaData<double  , ENERGY_CFLOAT, x >* cu_eng_vdwl;
-    cCudaData<double  , ENERGY_CFLOAT, x >* cu_eng_coul;
-    cCudaData<double  , double  , x >* cu_extent;
-    int* binned_id;
-    cCudaData<int           , int            , xx >* cu_binned_id;
-    int* binned_idnew;
-    cCudaData<int           , int            , xx >* cu_binned_idnew;
-    cCudaData<int           , int            , x >* cu_debugdata;
-    cCudaData<double  , X_CFLOAT , x>* cu_radius;
-    cCudaData<double  , F_CFLOAT , x>* cu_density;
-    cCudaData<double  , V_CFLOAT , yx>* cu_omega;
-    cCudaData<double  , F_CFLOAT , yx>* cu_torque;  // may be null: VerletCuda::setup() checks before download
-    cCudaData<int           , int            , yx >* cu_special;
-    cCudaData<int           , int            , yx >* cu_nspecial;
-    cCudaData<int     , int     , x >* cu_molecule;
-
-
-    cCudaData<X_CFLOAT  , X_CFLOAT , x>* cu_x_type;
-    X_CFLOAT* x_type;
-
-    cCudaData<V_CFLOAT  , V_CFLOAT , x>* cu_v_radius;  // may be null: VerletCuda checks before Cuda_Pair_GenerateVRadius
-    V_CFLOAT* v_radius;
-
-    cCudaData<V_CFLOAT  , V_CFLOAT , x>* cu_omega_rmass;  // may be null: VerletCuda checks before Cuda_Pair_GenerateOmegaRmass
-    V_CFLOAT* omega_rmass;
-
-    cCudaData<int     , int     , x >* cu_map_array;  // may be null: VerletCuda checks before upload
-    int neighbor_decide_by_integrator;  // one of the three conditions tested by decide_by_integrator()
-
-    bool pinned;
-
-    void* copy_buffer;
-    int copy_buffersize;
-
-  private:
-    int pppn;                  // number of GPUs/node
-    int *devicelist;           // IDs of GPUs
-
-    std::map<class NeighList*, class CudaNeighList*> neigh_lists;  // NeighList -> CudaNeighList lookup; presumably filled by registerNeighborList() - confirm in cuda.cpp
-};
-}
-
-#endif // CUDA_H
diff --git a/src/USER-CUDA/verlet_cuda.cpp b/src/USER-CUDA/verlet_cuda.cpp
deleted file mode 100644
index 8bb4419cda..0000000000
--- a/src/USER-CUDA/verlet_cuda.cpp
+++ /dev/null
@@ -1,1230 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-
-#include <cstdlib>
-#include <cstdio>
-#include <cstring>
-#include "verlet_cuda.h"
-#include "neighbor.h"
-#include "domain.h"
-#include "comm.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "force.h"
-#include "pair.h"
-#include "bond.h"
-#include "angle.h"
-#include "dihedral.h"
-#include "improper.h"
-#include "kspace.h"
-#include "output.h"
-#include "update.h"
-#include "modify_cuda.h"
-#include "compute.h"
-#include "fix.h"
-#include "timer.h"
-#include "memory.h"
-#include "error.h"
-#include "cuda_wrapper_cu.h"
-#include "thermo.h"
-#include "cuda_pair_cu.h"
-#include "user_cuda.h"
-#include <ctime>
-#include <cmath>
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-using namespace LAMMPS_NS;
-
-#define MAKETIMEING
-
-
-VerletCuda::VerletCuda(LAMMPS* lmp, int narg, char** arg) : Verlet(lmp, narg, arg)
-{
-  // Constructor: forwards all arguments to Verlet, then binds the Cuda object; rank 0 prints the deprecation notice
-  if (comm->me == 0)
-    error->warning(FLERR,"The USER-CUDA package will be deprecated "
-                   "soon - users should switch to the GPU or KOKKOS packages");
-  cuda = lmp->cuda;
-
-  // every /cuda style needs the Cuda object; abort on all ranks when it was never created
-  if(cuda == NULL)
-    error->all(FLERR, "You cannot use a /cuda class, without activating 'cuda' acceleration. Provide '-c on' as command-line argument to LAMMPS..");
-  // run() reads the per-stage fix counters through this pointer
-  // NOTE(review): unchecked downcast - assumes modify really is a ModifyCuda instance
-  modify_cuda = (ModifyCuda*) modify;
-  if(modify->find_fix("package_omp") >= 0) external_force_clear = 1;  // set only when a fix with id "package_omp" exists
-}
-
-/* ----------------------------------------------------------------------
-   setup before run
-------------------------------------------------------------------------- */
-
-void VerletCuda::setup()
-{
-  // Pre-run setup: domain/comm/neighbor init, upload to device, initial force evaluation; first reset the debug variables
-  cuda->debugdata[0] = 0;
-  cuda->cu_debugdata->upload();
-  dotestatom = cuda->dotestatom;
-  int testatom = cuda->testatom; //48267;
-
-  if(atom->nlocal == 0)
-    error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is known to cause errors with the USER-CUDA package. Please use the 'processors' keyword to enforce more balanced processor layout.");
-
-  MYDBG(printf("# CUDA VerletCuda::setup start\n");)
-
-  cuda->oncpu = true;            // switched back to false on the last line of this function
-  cuda->begin_setup = true;
-  cuda->finished_setup = false;  // set to true again just before returning
-  cuda->finished_run = false;
-
-  time_pair = 0;
-  time_kspace = 0;
-  time_comm = 0;
-  time_modify = 0;
-  time_fulliterate = 0;
-
-  atom->setup();
-
-  cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
-  cu_atom->update_nlocal = 1;
-  cu_atom->update_nmax = 1;
-  // molecular systems, or kspace with pppm.cudable_force unset: request Cuda_Pair_CollectForces after the pair compute
-  if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
-
-  cuda->setDomainParams();
-
-  // all arguments are int, so use %i (the previous %u conversions did not match the argument type)
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: Using precision: Global: %i X: %i V: %i F: %i PPPM: %i \n", CUDA_PRECISION == 1 ? 4 : 8, (int) sizeof(X_CFLOAT), (int) sizeof(V_CFLOAT), (int) sizeof(F_CFLOAT), (int) sizeof(PPPM_CFLOAT));
-
-  cuda->allocate();
-
-  if (comm->me == 0 && screen) {
-    fprintf(screen,"Setting up Verlet run ...\n");
-    fprintf(screen,"  Unit style  : %s\n", update->unit_style);
-    fprintf(screen,"  Current step: " BIGINT_FORMAT "\n", update->ntimestep);
-    fprintf(screen,"  Time step   : %g\n", update->dt);
-  }
-
-  // setup domain, communication and neighboring
-  // acquire ghosts
-  // build neighbor lists
-  modify->setup_pre_exchange();
-
-  if(triclinic) domain->x2lamda(atom->nlocal);
-
-  domain->pbc();
-  domain->reset_box();
-  comm->setup();
-
-  if(neighbor->style) neighbor->setup_bins();
-
-  comm->exchange();
-
-  if(atom->sortfreq > 0) atom->sort();
-
-  comm->borders();
-
-  if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost);
-
-  cuda->setSystemParams();
-  cuda->checkResize();
-
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: VerletCuda::setup: Upload data...\n");
-
-  cuda->uploadAll();
-  neighbor->build();
-  neighbor->ncalls = 0;
-
-  if(atom->mass)
-    cuda->cu_mass->upload();
-
-  if(cuda->cu_map_array)
-    cuda->cu_map_array->upload();
-
-  // compute all forces
-
-  ev_set(update->ntimestep);
-
-  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
-
-  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
-
-  if(elist_atom || vlist_atom) cuda->checkResize();
-
-  int test_BpA_vs_TpA = true;
-  my_times starttime;
-  my_times endtime;
-#ifdef NO_PREC_TIMING
-  double startsec, endsec;
-#endif
-
-  //if(atom->molecular||(force->kspace&&(not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = false;
-  if(test_BpA_vs_TpA && cuda->shared_data.pair.cudable_force && force->pair && (cuda->shared_data.pair.override_block_per_atom < 0)) {
-    int StyleLoops = 10;
-    // benchmark both kernel layouts (TpA/BpA, presumably thread- vs block-per-atom) and keep the faster one
-    if(cuda->shared_data.me == 0)
-      printf("Test TpA\n");
-
-    cuda->shared_data.pair.use_block_per_atom = 0;
-    neighbor->build();
-    Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-    if(cuda->cu_v_radius)
-      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-    if(cuda->cu_omega_rmass)
-      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-    // one untimed pass before the measured loop
-    force->pair->compute(eflag, vflag);
-    CudaWrapper_Sync();
-#ifdef NO_PREC_TIMING
-    startsec = 1.0 * clock() / CLOCKS_PER_SEC;
-#endif
-    my_gettime(CLOCK_REALTIME, &starttime);
-
-    for(int i = 0; i < StyleLoops; i++) {
-      Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-      if(cuda->cu_v_radius)
-        Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-      if(cuda->cu_omega_rmass)
-        Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-
-      force->pair->compute(eflag, vflag);
-      CudaWrapper_Sync();
-    }
-
-    my_gettime(CLOCK_REALTIME, &endtime);
-    // elapsed seconds = whole-second difference + nanosecond difference / 1e9
-    double TpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-#ifdef NO_PREC_TIMING
-    endsec = 1.0 * clock() / CLOCKS_PER_SEC;
-    TpAtime = endsec - startsec;
-#endif
-
-    if(cuda->shared_data.me == 0)
-      printf("Test BpA\n");
-
-    cuda->shared_data.pair.use_block_per_atom = 1;
-    neighbor->build();
-    Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-    if(cuda->cu_v_radius)
-      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-    if(cuda->cu_omega_rmass)
-      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-    // one untimed pass before the measured loop
-    force->pair->compute(eflag, vflag);
-    CudaWrapper_Sync();
-
-    my_gettime(CLOCK_REALTIME, &starttime);
-#ifdef NO_PREC_TIMING
-    startsec = 1.0 * clock() / CLOCKS_PER_SEC;
-#endif
-
-    for(int i = 0; i < StyleLoops; i++) {
-      Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-      if(cuda->cu_v_radius)
-        Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-      if(cuda->cu_omega_rmass)
-        Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-
-      force->pair->compute(eflag, vflag);
-      CudaWrapper_Sync();
-    }
-
-    my_gettime(CLOCK_REALTIME, &endtime);
-    double BpAtime = endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-#ifdef NO_PREC_TIMING
-    endsec = 1.0 * clock() / CLOCKS_PER_SEC;
-    BpAtime = endsec - startsec;
-#endif
-
-    if(cuda->shared_data.me == 0)
-      printf("\n# CUDA: Timing of parallelisation layout with %i loops:\n", StyleLoops);
-
-    if(cuda->shared_data.me == 0)
-      printf("# CUDA: BpA TpA\n %lf %lf\n", BpAtime, TpAtime);
-    // use_block_per_atom is still 1 from the BpA test; fall back to 0 only if BpA was slower
-    if(BpAtime > TpAtime) cuda->shared_data.pair.use_block_per_atom = 0;
-  } else
-    cuda->shared_data.pair.use_block_per_atom = cuda->shared_data.pair.override_block_per_atom;
-
-  //cuda->shared_data.pair.use_block_per_atom = 0;
-  if(atom->molecular || (force->kspace && (not cuda->shared_data.pppm.cudable_force))) cuda->shared_data.pair.collect_forces_later = true;
-
-  neighbor->build();
-  neighbor->ncalls = 0;
-
-  force_clear();
-
-  modify->setup_pre_force(vflag);
-
-  cuda->cu_f->download();
-
-  if(cuda->cu_torque)
-    cuda->cu_torque->download();
-
-  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");)
-
-  //test_atom(testatom,"pre pair force");
-
-  if(cuda->shared_data.pair.cudable_force) {
-    cuda->uploadAll();
-    Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-    if(cuda->cu_v_radius)
-      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-    if(cuda->cu_omega_rmass)
-      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-  }
-
-  if(force->pair) force->pair->compute(eflag, vflag);
-
-  if(cuda->shared_data.pair.cudable_force) {
-    if(cuda->shared_data.pair.collect_forces_later) {
-      if(eflag) cuda->cu_eng_vdwl->upload();
-
-      if(eflag) cuda->cu_eng_coul->upload();
-
-      if(vflag) cuda->cu_virial->upload();
-
-      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);
-
-      if(eflag) cuda->cu_eng_vdwl->download();
-
-      if(eflag) cuda->cu_eng_coul->download();
-
-      if(vflag) cuda->cu_virial->download();
-    }
-
-    cuda->downloadAll();
-  }
-
-  test_atom(testatom, "post pair force");
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");)
-  //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
-
-  if(atom->molecular) {
-    if(force->bond) force->bond->compute(eflag, vflag);
-
-    if(force->angle) force->angle->compute(eflag, vflag);
-
-    if(force->dihedral) force->dihedral->compute(eflag, vflag);
-
-    if(force->improper) force->improper->compute(eflag, vflag);
-  }
-
-  // with pppm.cudable_force set, push the per-atom arrays to the device before the kspace compute
-  if(cuda->shared_data.pppm.cudable_force) {
-    cuda->cu_tag ->upload();
-    cuda->cu_type->upload();
-    cuda->cu_x   ->upload();
-    cuda->cu_v   ->upload();
-    cuda->cu_f   ->upload();
-
-    if(cu_atom->q_flag) cuda->cu_q->upload();
-  }
-
-  if(force->kspace) {
-    force->kspace->setup();
-    force->kspace->compute(eflag, vflag);
-  }
-
-  if(cuda->shared_data.pppm.cudable_force) {
-    cuda->cu_f   ->download();
-  }
-
-  test_atom(testatom, "post kspace");
-
-  cuda->uploadAll();
-
-  if(force->newton) comm->reverse_comm();
-
-  cuda->downloadAll();
-
-  test_atom(testatom, "post reverse comm");
-
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024);
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");)
-  modify->setup(vflag);
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup done\n");)
-  output->setup(1);
-
-  test_atom(testatom, "post setup");
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: done\n");)
-  cuda->finished_setup = true;
-  cuda->oncpu = false;
-}
-
-
-//this routine is in a messy state
-void VerletCuda::setup_minimal(int flag)
-{
-  // Reduced variant of setup(): the domain/comm/neighbor rebuild below runs only when flag is non-zero
-  printf("SetupMinimal\n");
-  dotestatom = 0;
-  int testatom = 104;
-  cuda->oncpu = true;            // switched back to false on the last line of this function
-  cuda->begin_setup = true;
-  cuda->finished_run = false;
-  MYDBG(printf("# CUDA VerletCuda::setup start\n");)
-  time_pair = 0;
-  time_kspace = 0;
-  time_comm = 0;
-  time_modify = 0;
-  time_fulliterate = 0;
-
-  //cuda->allocate();
-
-  cuda_shared_atom*   cu_atom   = & cuda->shared_data.atom;
-  cu_atom->update_nlocal = 1;
-  cu_atom->update_nmax = 1;
-
-  if(atom->molecular) cuda->shared_data.pair.collect_forces_later = true;
-
-  cuda->setDomainParams();
-
-
-
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: VerletCuda::setup: Allocate memory on device for maximum of %i atoms...\n", atom->nmax);
-
-  cuda->allocate();
-
-
-
-
-  // setup domain, communication and neighboring
-  // acquire ghosts
-  // build neighbor lists
-
-  if(flag) {
-    if(triclinic) domain->x2lamda(atom->nlocal);
-
-    domain->pbc();
-    domain->reset_box();
-    comm->setup();
-
-    if(neighbor->style) neighbor->setup_bins();
-
-    comm->exchange();
-    comm->borders();
-
-    if(triclinic) domain->lamda2x(atom->nlocal + atom->nghost);
-
-    cuda->setSystemParams();
-    cuda->checkResize();
-    neighbor->build();
-    neighbor->ncalls = 0;
-  }
-
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: VerletCuda::setup: Upload data...\n");
-
-  cuda->uploadAll();
-  cuda->uploadAllNeighborLists();
-
-  if(atom->mass)
-    cuda->cu_mass->upload();
-
-  if(cuda->cu_map_array)
-    cuda->cu_map_array->upload();
-
-  // compute all forces
-
-  ev_set(update->ntimestep);
-
-  if(elist_atom) cuda->shared_data.atom.need_eatom = 1;
-
-  if(vlist_atom) cuda->shared_data.atom.need_vatom = 1;
-
-  if(elist_atom || vlist_atom) cuda->checkResize();
-
-  force_clear();
-  cuda->cu_f->download();
-
-  //printf("# Verlet::setup: g f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
-  // second mass upload now carries the same atom->mass guard as the first one above and as setup()
-  if(atom->mass) cuda->cu_mass->upload();
-  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute\n");)
-
-  test_atom(testatom, "pre pair force");
-
-  if(cuda->shared_data.pair.cudable_force) {
-    cuda->uploadAll();
-    Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-    if(cuda->cu_v_radius)
-      Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-    if(cuda->cu_omega_rmass)
-      Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-  }
-
-  if(force->pair) force->pair->compute(eflag, vflag);
-
-  if(cuda->shared_data.pair.cudable_force) {
-    if(cuda->shared_data.pair.collect_forces_later) {
-      if(eflag) cuda->cu_eng_vdwl->upload();
-
-      if(eflag) cuda->cu_eng_coul->upload();
-
-      if(vflag) cuda->cu_virial->upload();
-
-      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);
-
-      if(eflag) cuda->cu_eng_vdwl->download();
-
-      if(eflag) cuda->cu_eng_coul->download();
-
-      if(vflag) cuda->cu_virial->download();
-    }
-
-    cuda->downloadAll();
-  }
-
-  test_atom(testatom, "post pair force");
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: initial force compute done\n");)
-  //printf("# Verlet::setup: h f[0] = (%f, %f, %f)\n", atom->f[0][0], atom->f[0][1], atom->f[0][2]);
-
-  if(atom->molecular) {
-    if(force->bond) force->bond->compute(eflag, vflag);
-
-    if(force->angle) force->angle->compute(eflag, vflag);
-
-    if(force->dihedral) force->dihedral->compute(eflag, vflag);
-
-    if(force->improper) force->improper->compute(eflag, vflag);
-  }
-
-  // with pppm.cudable_force set, push the per-atom arrays to the device before the kspace compute
-  if(cuda->shared_data.pppm.cudable_force) {
-    cuda->cu_tag ->upload();
-    cuda->cu_type->upload();
-    cuda->cu_x   ->upload();
-    cuda->cu_v   ->upload();
-    cuda->cu_f   ->upload();
-
-    if(cu_atom->q_flag) cuda->cu_q->upload();
-  }
-
-  if(force->kspace) {
-    force->kspace->setup();
-    force->kspace->compute(eflag, vflag);
-  }
-
-  if(cuda->shared_data.pppm.cudable_force) {
-    cuda->cu_f   ->download();
-  }
-
-  test_atom(testatom, "post kspace");
-
-  cuda->uploadAll();
-
-  if(force->newton) comm->reverse_comm();
-
-  cuda->downloadAll();
-
-  test_atom(testatom, "post reverse comm");
-
-  if(cuda->shared_data.me == 0)
-    printf("# CUDA: Total Device Memory usage post setup: %lf MB\n", 1.0 * CudaWrapper_CheckMemUsage() / 1024 / 1024);
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: call modify setup\n");)
-  modify->setup(vflag);
-
-  MYDBG(printf("# CUDA: VerletCuda::setup: done\n");)
-  cuda->finished_setup = true;
-  cuda->oncpu = false;
-}
-
-//#define TESTATOM
-/* ----------------------------------------------------------------------
-   iterate for n steps
-------------------------------------------------------------------------- */
-
-void VerletCuda::run(int n)
-{
-  dotestatom = cuda->dotestatom;
-  int testatom = cuda->testatom; //48267;
-
-
-  my_times starttime;
-  my_times endtime;
-  my_times starttotal;
-  my_times endtotal;
-
-  cuda->setTimingsZero();
-
-  int nflag, ntimestep, sortflag;
-
-  int n_post_integrate = modify_cuda->n_post_integrate;
-  int n_pre_exchange = modify_cuda->n_pre_exchange;
-  int n_pre_neighbor = modify_cuda->n_pre_neighbor;
-  int n_pre_force = modify_cuda->n_pre_force;
-  int n_post_force = modify_cuda->n_post_force;
-  int n_end_of_step = modify_cuda->n_end_of_step;
-  MYDBG(printf("# CUDA: Fixes: i_int: %i p_int: %i f_int: %i pr_exc: %i pr_neigh: %i pr_f: %i p_f: %i eos: %i\n",
-               n_initial_integrate, n_post_integrate, n_final_integrate, n_pre_exchange, n_pre_neighbor, n_pre_force, n_post_force, n_end_of_step);)
-
-  if(atom->sortfreq > 0) sortflag = 1;
-  else sortflag = 0;
-
-
-  if(cuda->shared_data.me == 0) {
-    if((not cuda->shared_data.pair.cudable_force) && (force->pair))
-      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
-                     "but selected a pair force which has not yet been ported to Cuda");
-
-    if((not cuda->shared_data.pppm.cudable_force) && (force->kspace))
-      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
-                     "but selected a kspace force which has not yet been ported to Cuda");
-
-    if(modify_cuda->n_post_integrate_host + modify_cuda->n_pre_exchange_host + modify_cuda->n_pre_neighbor_host + modify_cuda->n_pre_force_host + modify_cuda->n_post_force_host + modify_cuda->n_end_of_step_host + modify_cuda->n_initial_integrate_host + modify_cuda->n_final_integrate_host)
-      error->warning(FLERR, "# CUDA: You asked for a Verlet integration using Cuda, "
-                     "but several fixes have not yet been ported to Cuda.\n"
-                     "This can cause a severe speed penalty due to frequent data synchronization between host and GPU.");
-
-    if(atom->firstgroupname)
-      error->warning(FLERR, "Warning: firstgroupname is used, this will cause additional data transfers.");
-  }
-
-  cuda->uploadAll();
-
-  if(cuda->neighbor_decide_by_integrator && cuda->cu_xhold) {
-    const int n = cuda->shared_data.atom.maxhold;
-    CudaWrapper_CopyData(cuda->cu_xhold->dev_data(), cuda->cu_x->dev_data(), n * sizeof(X_CFLOAT));
-    CudaWrapper_CopyData((void*) & ((X_CFLOAT*)cuda->cu_xhold->dev_data())[n], (void*) & ((X_CFLOAT*)cuda->cu_x->dev_data())[atom->nmax], n * sizeof(X_CFLOAT));
-    CudaWrapper_CopyData((void*) & ((X_CFLOAT*)cuda->cu_xhold->dev_data())[2 * n], (void*) & ((X_CFLOAT*)cuda->cu_x->dev_data())[2 * atom->nmax], n * sizeof(X_CFLOAT));
-  }
-
-  cuda->shared_data.atom.reneigh_flag = 0;
-  cuda->shared_data.atom.update_nlocal = 1;
-  cuda->shared_data.atom.update_nmax = 1;
-  cuda->shared_data.atom.update_neigh = 1;
-  cuda->shared_data.domain.update = 1;
-  cuda->shared_data.buffer_new = 1;
-  cuda->uploadtime = 0;
-  cuda->downloadtime = 0;
-  int firstreneigh = 1;
-
-  for(int i = 0; i < n; i++) {
-    if(atom->nlocal == 0)
-      error->warning(FLERR, "# CUDA: There are currently no atoms on one of the MPI processes. This is currently prone to encountering errors with USER-CUDA package. Please use the 'processors' keyword to use a more balanced processor layout.");
-
-    ntimestep = ++update->ntimestep;
-    ev_set(ntimestep);
-
-    // initial time integration
-
-    test_atom(testatom, "Pre initial");
-
-    MYDBG(printf("# CUDA VerletCuda::iterate: before initial_integrate\n");)
-
-    modify->initial_integrate(vflag);
-
-    MYDBG(printf("# CUDA VerletCuda::iterate: after initial_integrate\n");)
-
-    if(n_post_integrate) modify->post_integrate();
-
-
-
-    // regular communication vs neighbor list rebuild
-
-    test_atom(testatom, "Pre Exchange");
-
-    MYDBG(printf("# CUDA VerletCuda::iterate: before neighbor decide\n");)
-    nflag = neighbor->decide();
-
-    if(nflag == 0) {
-      MYDBG(printf("# CUDA VerletCuda::iterate: communicate\n");)
-      timer->stamp();
-
-      if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm)) {
-        //overlap forward communication of ghost atom positions with inner force calculation (interactions between local atoms)
-        //build communication buffers
-        //      printf("Pre forward_comm(1)\n");
-        my_gettime(CLOCK_REALTIME, &starttotal);
-        cuda->shared_data.atom.reneigh_flag = 0;
-        my_gettime(CLOCK_REALTIME, &starttime);
-        timer->stamp();
-        comm->forward_comm(1);
-        timer->stamp(Timer::COMM);
-        my_gettime(CLOCK_REALTIME, &endtime);
-        cuda->shared_data.cuda_timings.comm_forward_total +=
-          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-
-        //prepare force calculation
-        //     printf("Pre force_clear\n");
-        force_clear();
-        //     printf("Pre Generate XType\n");
-        Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-        if(cuda->cu_v_radius)
-          Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-        if(cuda->cu_omega_rmass)
-          Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-
-        //start force calculation asynchronus
-        cuda->shared_data.comm.comm_phase = 1;
-        force->pair->compute(eflag, vflag);
-        timer->stamp(Timer::PAIR);
-        //CudaWrapper_Sync();
-
-        //download comm buffers from GPU, perform MPI communication and upload buffers again
-        my_gettime(CLOCK_REALTIME, &starttime);
-        comm->forward_comm(2);
-        my_gettime(CLOCK_REALTIME, &endtime);
-        cuda->shared_data.cuda_timings.comm_forward_total +=
-          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-        timer->stamp(Timer::COMM);
-
-        //wait for force calculation
-        CudaWrapper_Sync();
-        timer->stamp(Timer::PAIR);
-
-        //unpack communication buffers
-        my_gettime(CLOCK_REALTIME, &starttime);
-        comm->forward_comm(3);
-        my_gettime(CLOCK_REALTIME, &endtime);
-        cuda->shared_data.cuda_timings.comm_forward_total +=
-          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-
-        timer->stamp(Timer::COMM);
-        MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");)
-        cuda->shared_data.cuda_timings.test1 +=
-          endtotal.tv_sec - starttotal.tv_sec + 1.0 * (endtotal.tv_nsec - starttotal.tv_nsec) / 1000000000;
-      } else {
-        //perform standard forward communication
-        my_gettime(CLOCK_REALTIME, &starttime);
-        comm->forward_comm();
-        my_gettime(CLOCK_REALTIME, &endtime);
-        cuda->shared_data.cuda_timings.comm_forward_total +=
-          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-        timer->stamp(Timer::COMM);
-        MYDBG(printf("# CUDA VerletCuda::iterate: communicate done\n");)
-      }
-    } else {
-      int nlocalold = cuda->shared_data.atom.nlocal;
-
-      if(firstreneigh) {
-        cuda->shared_data.atom.update_nlocal = 1;
-        cuda->shared_data.atom.update_nmax = 1;
-        firstreneigh = 0;
-      }
-
-      cuda->shared_data.buffer_new = 1;
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor\n");)
-      cuda->setDomainParams();
-
-      if(n_pre_exchange) modify->pre_exchange();
-
-      if(atom->nlocal != cuda->shared_data.atom.nlocal) { //did someone add atoms during pre_exchange?
-        cuda->checkResize();
-        cuda->uploadAll();
-      }
-
-      //check domain changes
-      if(domain->triclinic) domain->x2lamda(atom->nlocal);
-
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor pbc\n");)
-      domain->pbc();
-
-      if(domain->box_change) {
-        domain->reset_box();
-        comm->setup();
-
-        if(neighbor->style) neighbor->setup_bins();
-
-      }
-
-      timer->stamp();
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor exchange\n");)
-
-      //perform exchange of local atoms
-      my_gettime(CLOCK_REALTIME, &starttime);
-      comm->exchange();
-      my_gettime(CLOCK_REALTIME, &endtime);
-
-      //special and nspecial fields of the atom data are not currently transfered via the GPU buffer might be changed in the future
-      if(comm->nprocs > 1) {
-        my_gettime(CLOCK_REALTIME, &starttime);
-
-        if(atom->special)
-          cuda->cu_special->upload();
-
-        if(atom->nspecial)
-          cuda->cu_nspecial->upload();
-
-        my_gettime(CLOCK_REALTIME, &endtime);
-        cuda->shared_data.cuda_timings.test1 +=
-          endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-      }
-
-      cuda->shared_data.cuda_timings.comm_exchange_total +=
-        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-
-      if(nlocalold != cuda->shared_data.atom.nlocal) cuda->shared_data.atom.update_nlocal = 2;
-
-      //sort atoms
-      if(sortflag && ntimestep >= atom->nextsort) atom->sort();
-
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor borders\n");)
-
-      //generate ghost atom lists, and transfer ghost atom data
-      my_gettime(CLOCK_REALTIME, &starttime);
-      comm->borders();
-      my_gettime(CLOCK_REALTIME, &endtime);
-      cuda->shared_data.cuda_timings.comm_border_total +=
-        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-
-      my_gettime(CLOCK_REALTIME, &starttime);
-      //atom index maps are generated on CPU, and need to be transfered to GPU if they are used
-      if(cuda->cu_map_array)
-        cuda->cu_map_array->upload();
-
-
-      if(domain->triclinic) domain->lamda2x(atom->nlocal + atom->nghost);
-
-      if(n_pre_neighbor) modify->pre_neighbor();
-
-      cuda->shared_data.buffer_new = 2;
-
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor build\n");)
-      timer->stamp(Timer::COMM);
-      my_gettime(CLOCK_REALTIME, &endtime);
-      cuda->shared_data.cuda_timings.test2 +=
-        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-
-      //rebuild neighbor list
-      test_atom(testatom, "Pre Neighbor");
-      neighbor->build(0);
-      timer->stamp(Timer::NEIGH);
-      MYDBG(printf("# CUDA VerletCuda::iterate: neighbor done\n");)
-      //if bonded interactions are used (in this case collect_forces_later is true), transfer data which only changes upon exchange/border routines from GPU to CPU
-      if(cuda->shared_data.pair.collect_forces_later) {
-        if(cuda->cu_molecule) cuda->cu_molecule->downloadAsync(2);
-
-        cuda->cu_tag->downloadAsync(2);
-        cuda->cu_type->downloadAsync(2);
-        cuda->cu_mask->downloadAsync(2);
-
-        if(cuda->cu_q) cuda->cu_q->downloadAsync(2);
-      }
-      cuda->shared_data.comm.comm_phase = 3;
-    }
-
-    test_atom(testatom, "Post Exchange");
-
-    // force computations
-
-    //only do force_clear if it has not been done during overlap of communication with local interactions
-    if(not((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3)))
-      force_clear();
-
-    if(n_pre_force) modify->pre_force(vflag);
-
-    timer->stamp();
-
-    //if overlap of bonded interactions with nonbonded interactions takes place, download forces and positions
-    /*            if(cuda->shared_data.pair.collect_forces_later)
-               {
-                 cuda->cu_x->downloadAsync(2);
-                 cuda->cu_f->downloadAsync(2);
-               }*/
-
-    if(force->pair) {
-      if((not(eflag || vflag)) && (cuda->shared_data.overlap_comm) && (cuda->shared_data.comm.comm_phase < 3) && cuda->shared_data.pair.cudable_force) {
-        //second part of force calculations in case of overlaping it with commuincation. Only interactions between local and ghost atoms are done now
-        //regenerate data layout for force computations, its actually only needed for the ghost atoms
-        cuda->shared_data.comm.comm_phase = 2;
-
-        my_times atime1, atime2;
-        my_gettime(CLOCK_REALTIME, &atime1);
-
-        Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-        if(cuda->cu_v_radius)
-          Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-        if(cuda->cu_omega_rmass)
-          Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-
-        my_gettime(CLOCK_REALTIME, &atime2);
-        cuda->shared_data.cuda_timings.pair_xtype_conversion +=
-          atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
-        force->pair->compute(eflag, vflag);
-
-      } else {
-        //calculate complete pair interactions
-        if(not cuda->shared_data.pair.cudable_force) cuda->downloadAll();
-        else {
-          //regenerate data layout for force computations, its actually only needed for the ghost atoms
-          my_times atime1, atime2;
-          my_gettime(CLOCK_REALTIME, &atime1);
-
-          Cuda_Pair_GenerateXType(&cuda->shared_data);
-
-          if(cuda->cu_v_radius)
-            Cuda_Pair_GenerateVRadius(&cuda->shared_data);
-
-          if(cuda->cu_omega_rmass)
-            Cuda_Pair_GenerateOmegaRmass(&cuda->shared_data);
-
-          my_gettime(CLOCK_REALTIME, &atime2);
-          cuda->shared_data.cuda_timings.pair_xtype_conversion +=
-            atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
-        }
-
-        cuda->shared_data.comm.comm_phase = 0;
-        force->pair->compute(eflag, vflag);
-      }
-
-      if(not cuda->shared_data.pair.cudable_force) cuda->uploadAll();
-
-      //wait for force calculation in case of not using overlap with bonded interactions
-      if(not cuda->shared_data.pair.collect_forces_later)
-        CudaWrapper_Sync();
-
-      timer->stamp(Timer::PAIR);
-    }
-
-    //calculate bonded interactions
-    if(atom->molecular) {
-      cuda->cu_x->downloadAsync(2);
-
-      if(n_pre_force == 0) Verlet::force_clear();
-      else  cuda->cu_f->downloadAsync(2);
-
-      timer->stamp(Timer::PAIR);
-
-      if(neighbor->lastcall == update->ntimestep) {
-        neighbor->build_topology();
-        timer->stamp(Timer::NEIGH);
-      }
-
-      test_atom(testatom, "pre bond force");
-
-      if(force->bond) force->bond->compute(eflag, vflag);
-
-      if(force->angle) force->angle->compute(eflag, vflag);
-
-      if(force->dihedral) force->dihedral->compute(eflag, vflag);
-
-      if(force->improper) force->improper->compute(eflag, vflag);
-
-      timer->stamp(Timer::BOND);
-    }
-
-    //collect forces in case pair force and bonded interactions were overlapped, and either no KSPACE or a GPU KSPACE style is used
-    if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && (not(force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
-      my_gettime(CLOCK_REALTIME, &starttime);
-      cuda->cu_f->uploadAsync(2);
-
-      test_atom(testatom, "post molecular force");
-
-
-      if(eflag) cuda->cu_eng_vdwl->upload();
-
-      if(eflag) cuda->cu_eng_coul->upload();
-
-      if(vflag) cuda->cu_virial->upload();
-
-      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);
-
-      if(eflag) cuda->cu_eng_vdwl->download();
-
-      if(eflag) cuda->cu_eng_coul->download();
-
-      if(vflag) cuda->cu_virial->download();
-
-      timer->stamp(Timer::PAIR);
-
-      my_gettime(CLOCK_REALTIME, &endtime);
-      cuda->shared_data.cuda_timings.pair_force_collection +=
-        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-    }
-
-    //compute kspace force
-    if(force->kspace) {
-      if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
-        cuda->downloadAll();
-
-      if((not cuda->shared_data.pppm.cudable_force) && (cuda->shared_data.pair.collect_forces_later) && (not atom->molecular)) {
-        cuda->cu_x->downloadAsync(2);
-
-        if(n_pre_force == 0) Verlet::force_clear();
-        else  cuda->cu_f->downloadAsync(2);
-
-        timer->stamp(Timer::PAIR);
-      }
-
-      force->kspace->compute(eflag, vflag);
-
-      if((not cuda->shared_data.pppm.cudable_force) && (not cuda->shared_data.pair.collect_forces_later))
-        cuda->uploadAll();
-
-      timer->stamp(Timer::KSPACE);
-    }
-
-    //collect forces in case pair forces and kspace was overlaped
-    if(cuda->shared_data.pair.collect_forces_later && cuda->shared_data.pair.cudable_force && ((force->kspace && (not cuda->shared_data.pppm.cudable_force)))) {
-      cuda->cu_f->uploadAsync(2);
-
-      my_gettime(CLOCK_REALTIME, &starttime);
-
-      if(eflag) cuda->cu_eng_vdwl->upload();
-
-      if(eflag) cuda->cu_eng_coul->upload();
-
-      if(vflag) cuda->cu_virial->upload();
-
-      Cuda_Pair_CollectForces(&cuda->shared_data, eflag, vflag);
-
-      if(eflag) cuda->cu_eng_vdwl->download();
-
-      if(eflag) cuda->cu_eng_coul->download();
-
-      if(vflag) cuda->cu_virial->download();
-
-      timer->stamp(Timer::PAIR);
-
-      my_gettime(CLOCK_REALTIME, &endtime);
-      cuda->shared_data.cuda_timings.pair_force_collection +=
-        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
-    }
-
-    //send forces on ghost atoms back to other GPU: THIS SHOULD NEVER HAPPEN
-    if(force->newton) {
-      comm->reverse_comm();
-      timer->stamp(Timer::COMM);
-    }
-
-    test_atom(testatom, "post force");
-    // force modifications, final time integration, diagnostics
-
-    if(n_post_force) modify->post_force(vflag);
-
-    test_atom(testatom, "pre final");
-
-    modify->final_integrate();
-
-    test_atom(testatom, "post final");
-
-    if(n_end_of_step) modify->end_of_step();
-
-    // all output
-
-    test_atom(testatom, "pre output");
-
-    if(ntimestep == output->next) {
-      if(not output->thermo->cudable)
-        cuda->downloadAll();
-
-      timer->stamp();
-      output->write(ntimestep);
-      timer->stamp(Timer::OUTPUT);
-    }
-
-
-    test_atom(testatom, "post output");
-
-    if(cuda->shared_data.atom.update_nlocal > 0)
-      cuda->shared_data.atom.update_nlocal--;
-
-    if(cuda->shared_data.atom.update_nmax > 0)
-      cuda->shared_data.atom.update_nmax--;
-
-    if(cuda->shared_data.atom.update_neigh > 0)
-      cuda->shared_data.atom.update_neigh--;
-
-    if(cuda->shared_data.domain.update > 0)
-      cuda->shared_data.domain.update--;
-
-    if(cuda->shared_data.buffer_new > 0)
-      cuda->shared_data.buffer_new--;
-
-    cuda->shared_data.atom.reneigh_flag = 0;
-  }
-
-
-  cuda->downloadAll();
-  cuda->downloadAllNeighborLists();
-  cuda->shared_data.atom.update_nlocal = 1;
-  cuda->shared_data.atom.update_nmax = 1;
-  cuda->shared_data.atom.update_neigh = 1;
-  cuda->shared_data.buffer_new = 1;
-  cuda->shared_data.domain.update = 1;
-  cuda->oncpu = true;
-  cuda->finished_run = true;
-}
-
-
-/* ----------------------------------------------------------------------
-   clear force on own & ghost atoms
-   setup and clear other arrays as needed
-------------------------------------------------------------------------- */
-
-void VerletCuda::force_clear()
-{
-  cuda->cu_f->memset_device(0);
-
-  if(cuda->cu_torque) cuda->cu_torque->memset_device(0);
-
-#if 0
-  //The rest should not be necessary
-  int i;
-
-  for(i = 0; i < atom->nlocal; i++) {
-    atom->f[i][0] = 0.0;
-    atom->f[i][1] = 0.0;
-    atom->f[i][2] = 0.0;
-  }
-
-  // clear force on all particles
-  // if either newton flag is set, also include ghosts
-
-  if(neighbor->includegroup == 0) {
-    int nall;
-
-    if(force->newton) nall = atom->nlocal + atom->nghost;
-    else nall = atom->nlocal;
-
-    if(torqueflag) {
-      double** torque = atom->torque;
-
-      for(i = 0; i < nall; i++) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-        torque[i][2] = 0.0;
-      }
-    }
-
-    // neighbor includegroup flag is set
-    // clear force only on initial nfirst particles
-    // if either newton flag is set, also include ghosts
-
-  } else {
-    int nall = atom->nfirst;
-
-
-    if(torqueflag) {
-      double** torque = atom->torque;
-
-      for(i = 0; i < nall; i++) {
-        torque[i][0] = 0.0;
-        torque[i][1] = 0.0;
-        torque[i][2] = 0.0;
-      }
-    }
-
-    if(force->newton) {
-      nall = atom->nlocal + atom->nghost;
-
-      if(torqueflag) {
-        double** torque = atom->torque;
-
-        for(i = atom->nlocal; i < nall; i++) {
-          torque[i][0] = 0.0;
-          torque[i][1] = 0.0;
-          torque[i][2] = 0.0;
-        }
-      }
-    }
-  }
-#endif
-}
-
-void VerletCuda::test_atom(int aatom, const char* string)  //printing properties of one atom for test purposes
-{
-  if(not dotestatom) return;
-
-  bool check = false;
-
-  if(cuda->finished_setup) cuda->downloadAll();
-
-  for(int i = 0; i < atom->nlocal + atom->nghost; i++) {
-    if((atom->tag[i] == aatom) && (i < atom->nlocal)) {
-
-      printf("%i # CUDA %s: " BIGINT_FORMAT " %i %e %e %e %i ",
-             comm->me, string, update->ntimestep, atom->tag[i],
-             atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
-
-      if(atom->molecular && (i < atom->nlocal)) {
-        printf(" // %i %i %i ", atom->num_bond[i], atom->num_angle[i], atom->num_dihedral[i]);
-
-        for(int k = 0; k < atom->num_bond[i]; k++)
-          printf("// %i %i ", atom->bond_type[i][k], atom->bond_atom[i][k]);
-      }
-
-      printf("\n");
-    }
-
-    if(i < atom->nlocal) {
-      if((atom->v[i][0] < -100 || atom->v[i][0] > 100) ||
-          (atom->v[i][1] < -100 || atom->v[i][1] > 100) ||
-          (atom->v[i][2] < -100 || atom->v[i][2] > 100) ||
-          (atom->v[i][0] != atom->v[i][0]) ||
-          (atom->v[i][1] != atom->v[i][1]) ||
-          (atom->v[i][2] != atom->v[i][2])) {
-        printf("%i # CUDA %s velocity: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
-        check = true;
-      }
-
-      if((atom->f[i][0] < -10000 || atom->f[i][0] > 10000) ||
-          (atom->f[i][1] < -10000 || atom->f[i][1] > 10000) ||
-          (atom->f[i][2] < -10000 || atom->f[i][2] > 10000) ||
-          (atom->f[i][0] != atom->f[i][0]) ||
-          (atom->f[i][1] != atom->f[i][1]) ||
-          (atom->f[i][2] != atom->f[i][2])) {
-        printf("%i # CUDA %s force: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
-        check = true;
-      }
-
-      if(atom->tag[i] <= 0)
-        printf("%i # CUDA %s tag: %i %e %e %e %i\n", comm->me, string, atom->tag[i], atom->x[i][0], atom->v[i][0], atom->f[i][0], i);
-    }
-  }
-
-  if(check) exit(0);
-}
diff --git a/src/USER-CUDA/verlet_cuda.h b/src/USER-CUDA/verlet_cuda.h
deleted file mode 100644
index 6760828010..0000000000
--- a/src/USER-CUDA/verlet_cuda.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-
-   Original Version:
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   See the README file in the top-level LAMMPS directory.
-
-   -----------------------------------------------------------------------
-
-   USER-CUDA Package and associated modifications:
-   https://sourceforge.net/projects/lammpscuda/
-
-   Christian Trott, christian.trott@tu-ilmenau.de
-   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
-   Theoretical Physics II, University of Technology Ilmenau, Germany
-
-   See the README file in the USER-CUDA directory.
-
-   This software is distributed under the GNU General Public License.
-------------------------------------------------------------------------- */
-
-#ifdef INTEGRATE_CLASS
-
-IntegrateStyle(verlet/cuda,VerletCuda)
-
-#else
-
-
-#ifndef LMP_VERLET_CUDA_H
-#define LMP_VERLET_CUDA_H
-#include "verlet.h"
-#include "modify_cuda.h"
-
-namespace LAMMPS_NS {
-
-class VerletCuda : public Verlet
-{
-        public:
-                VerletCuda(class LAMMPS *, int, char **);
-                void setup();
-                 void setup_minimal(int);
-                  void run(int);
-
-                void test_atom(int atom,const char* astring); //debugging purpose
-                int dotestatom;        //debugging purpose
-
-        protected:
-                class Cuda *cuda;
-                void force_clear();
-            double time_pair;
-            double time_kspace;
-            double time_comm;
-            double time_modify;
-            double time_fulliterate;
-            ModifyCuda* modify_cuda;
-};
-
-}
-
-#endif
-#endif
-- 
GitLab