diff --git a/lib/cuda/Makefile b/lib/cuda/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..844906ba89b31442a912318c49110fbc3dbfc130
--- /dev/null
+++ b/lib/cuda/Makefile
@@ -0,0 +1,4 @@
+#Makefile for liblammpscuda.a 
+#No need to modify anything here! The CUDA path is inserted into Makefile.common
+
+include Makefile.cudalib
\ No newline at end of file
diff --git a/lib/cuda/Makefile.common b/lib/cuda/Makefile.common
new file mode 100644
index 0000000000000000000000000000000000000000..7c918a23bc8770912580fc72d39520339c91a477
--- /dev/null
+++ b/lib/cuda/Makefile.common
@@ -0,0 +1,124 @@
+#Common command-line argument handling for compilation with lammpscuda (USER-CUDA) installed
+
+# make options:
+# emu=1        switch to CUDA emulation mode (otherwise: use the GPU)
+# dbg=1        print a lot of debugging output at runtime
+# verbose=1    output the nvcc command line during compilation
+# keep=1       do not delete temporary compilation files (.ii, .cubin, ...)
+# cufft=1      use CUDA's fast Fourier transform library "cufft" where possible (otherwise: use CPU FFTW)
+# binning=1    create a virtual particle grid (otherwise: neighbor lists); currently not supported
+# precision=1  single precision (global setting)
+# precision=2  double precision (global setting)
+# precision=3  mixed precision: double positions, single otherwise
+# precision=4  mixed precision: double positions and velocities, single otherwise
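+#
+# example: make precision=1 arch=20 cufft=1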
+
+SHELL = /bin/sh
+
+# System-specific settings
+
+CUDA_INSTALL_PATH = /usr/local/cuda
+# e.g. in Gentoo
+# CUDA_INSTALL_PATH = /opt/cuda
+
+
+#//////////////////////////////////////////////////////////////////////////////////////////////
+# no need to change anything below this line
+#//////////////////////////////////////////////////////////////////////////////////////////////
+
+# fall back to the CPU FFT (e.g. FFTW) when cufft=0 is requested;
+# if set to anything but 1, cufft=0 disables FFTs entirely (-DFFT_NONE)
+FALLBACK_FFT = 1
+
+#default settings for compiler switches
+ifdef COMPILELIB 
+include Makefile.defaults
+else
+include ../../lib/cuda/Makefile.defaults
+endif
+
+#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
+
+CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX 
+CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
+
+# debug setting
+ifeq ($(strip $(dbg)), 1)
+	CUDA_FLAGS += -D_DEBUG -g
+	NVCC_FLAGS += -g -G 
+else
+	NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O3
+endif
+
+# disable the high-precision timer (set prec_timer=0 manually on Mac and Windows)
+ifeq ($(strip $(prec_timer)), 0)
+	CUDA_FLAGS += -DNO_PREC_TIMING
+endif
+
+# set fft routine
+ifeq ($(strip $(cufft)), 0)
+	ifneq ($(FALLBACK_FFT), 1)
+	    FFT_INC = -DFFT_NONE
+	    FFT_PATH = 
+	    FFT_LIB = 
+		CUDA_FLAGS += -DFFT_NONE
+	endif
+else
+	CUDA_FLAGS += -DFFT_CUFFT
+	CUDA_USRLIB_CONDITIONAL += -lcufft
+endif
+
+# make global precision setting
+
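+# precision=1: all single; precision=3: double positions (X_PRECISION=2);
+# precision=4: double positions and velocities; any other value: all double
+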
+ifeq ($(strip $(precision)), 1)
+	CUDA_FLAGS += -DCUDA_PRECISION=1
+else
+	ifeq ($(strip $(precision)), 3)
+		CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
+	else
+		ifeq ($(strip $(precision)), 4)
+			CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
+		else
+			CUDA_FLAGS += -DCUDA_PRECISION=2
+		endif
+	endif
+endif
+
+# make architecture settings
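+# arch values of 20 and above share the CUDA_ARCH=20 code path; only the
+# -arch sm_xx flag passed to nvcc differs (unknown values fall back to
+# CUDA_ARCH=99 with sm_13)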
+ifeq ($(strip $(arch)), 13)
+	CUDA_FLAGS += -DCUDA_ARCH=13
+	SMVERSIONFLAGS	:= -arch sm_13
+else
+  ifeq ($(strip $(arch)), 20)
+	 CUDA_FLAGS += -DCUDA_ARCH=20 
+	 #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+	 NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+	 SMVERSIONFLAGS	:= -arch sm_20
+  else
+     ifeq ($(strip $(arch)), 21)
+	   CUDA_FLAGS += -DCUDA_ARCH=20 
+	   #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+	   NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+	   SMVERSIONFLAGS	:= -arch sm_21
+     else
+       ifeq ($(strip $(arch)), 30)
+           CUDA_FLAGS += -DCUDA_ARCH=20
+           #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+           NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+           SMVERSIONFLAGS       := -arch sm_30
+       else
+         ifeq ($(strip $(arch)), 35)
+           CUDA_FLAGS += -DCUDA_ARCH=20
+           #NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
+           NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
+           SMVERSIONFLAGS       := -arch sm_35
+         else         
+           CUDA_FLAGS += -DCUDA_ARCH=99  
+           SMVERSIONFLAGS	:= -arch sm_13
+         endif
+       endif
+     endif
+  endif
+endif
+
+
+
+CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
+		-I$(CUDA_INSTALL_PATH)/include 
diff --git a/lib/cuda/Makefile.cudalib b/lib/cuda/Makefile.cudalib
new file mode 100644
index 0000000000000000000000000000000000000000..f21e95e6868d1367219898dc52cd27bdae049d6c
--- /dev/null
+++ b/lib/cuda/Makefile.cudalib
@@ -0,0 +1,87 @@
+#Makefile for liblammpscuda.a 
+#No need to modify anything here! The CUDA path is inserted into Makefile.common
+
+.DEFAULT: lib
+
+COMPILELIB := 1
+
+SHELL = /bin/sh
+
+CUDA_SRC_DIR = ../cuda
+CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
+CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
+include $(CUDA_TEMP)
+CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
+CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
+CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO)) 
+CUDA_DEP = $(CUDA_OBJ:.o=.d)
+
+NVCC_FLAGS := 
+
+VPATH = $(CUDA_SRC_DIR)
+
+# rewrite the defaults in Makefile.defaults when new settings are given on the command line
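+# e.g. "make precision=1" rewrites "precision ?= 2" to "precision ?= 1" in
+# Makefile.defaults, so the choice persists across builds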
+
+
+ifdef precision
+tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
+endif
+
+ifdef arch
+tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
+endif
+
+ifdef cufft
+tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
+endif
+
+ifdef dbg
+tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
+endif
+
+ifdef prec_timer
+tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
+endif
+
+include Makefile.common
+
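+# refresh lines 2-3 of Makefile.lammps: drop the stale CUDA_FLAGS /
+# CUDA_USRLIB_CONDITIONAL values and insert the ones from this build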
+tmp := $(shell sed -i '2 d' Makefile.lammps)
+tmp := $(shell sed -i '2 d' Makefile.lammps)
+tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
+tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)
+
+# verbose nvcc output during compilation
+ifeq ($(verbose), 1)
+	VERBOSE :=
+	NVCC_FLAGS += --ptxas-options=-v
+else
+	VERBOSE := @
+endif
+
+# keep temporary compilation files of nvcc
+ifeq ($(keep), 1)
+	NVCC_FLAGS += -keep -Xptxas="--verbose"
+endif
+
+
+NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
+CUDA_INCLUDES =  -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
+CUDA_USRLIB = 
+
+# Link target
+
+lib: $(CUDA_OBJ)
+	$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a
+
+clean:
+	rm -f $(CUDA_SRC_DIR)/*.o
+	rm -f liblammpscuda.a
+	
+# Library target
+
+
+# Cuda compilation rules
+
+%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
+	$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<
+
diff --git a/lib/cuda/Makefile.defaults b/lib/cuda/Makefile.defaults
new file mode 100644
index 0000000000000000000000000000000000000000..590435446c23babb55649c13e7ec53e13e06446b
--- /dev/null
+++ b/lib/cuda/Makefile.defaults
@@ -0,0 +1,19 @@
+
+#precision setting: 1 single, 2 double, 3 double positions, 4 double positions and velocities
+precision ?= 2
+
+#verbose setting: 0 no, 1 yes
+verbose ?= 1
+
+#GPU architecture (compute capability): 13, 20, 21, 30, 35
+arch ?= 20
+
+#Using cufft (should not be changed)
+cufft ?= 1
+
+#Use debug mode: 0 no, 1 yes
+dbg ?= 0
+
+#On Mac machines set this to 0 to avoid use of the Linux-specific precision timer
+prec_timer ?= 1
+
diff --git a/lib/cuda/Makefile.lammps b/lib/cuda/Makefile.lammps
new file mode 100644
index 0000000000000000000000000000000000000000..75dd5a26bc49622587c5311fe96dbc2e3bd77c09
--- /dev/null
+++ b/lib/cuda/Makefile.lammps
@@ -0,0 +1,8 @@
+# Settings that the LAMMPS build will import when this package library is used
+CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20
+CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft
+ 
+ user-cuda_SYSINC = ${CUDA_FLAGS}
+ user-cuda_SYSLIB = -lcuda -lcudart -lrt
+ user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)
+
diff --git a/lib/cuda/README b/lib/cuda/README
new file mode 100644
index 0000000000000000000000000000000000000000..ce0dedcffe91da8956c2f48dd8b2d0f211aa073d
--- /dev/null
+++ b/lib/cuda/README
@@ -0,0 +1,26 @@
+This directory has source files to build a library that LAMMPS
+links against when using the USER-CUDA package.
+
+When you are done building this library, two files should
+exist in this directory:
+
+liblammpscuda.a		the library LAMMPS will link against
+Makefile.lammps		settings the LAMMPS Makefile will import
+
+The latter file will have settings like this (can be omitted if blank):
+
+user-cuda_SYSINC = -I$(CUDA_INSTALL_PATH)/include 
+user-cuda_SYSLIB = -lcuda -lcudart -lrt 
+user-cuda_SYSPATH = -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(CUDA_USRLIB_CONDITIONAL)
+
+SYSINC is for settings needed to compile LAMMPS source files
+SYSLIB is for additional system libraries needed by this package
+SYSPATH is the path(s) to where those libraries are
+
+You must ensure these settings are correct for your system, or the
+LAMMPS build will likely fail.
+
+-------------------------------------------------------------------------
+
+Christian - there needs to be additional info here about how
+to build the lammpscuda lib.
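+
+For now, a minimal sketch of the build (assuming the defaults in
+Makefile.defaults suit your system):
+
+cd lib/cuda
+make precision=2 arch=20
+
+This produces liblammpscuda.a and regenerates Makefile.lammps with the
+flags used for the build.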
diff --git a/lib/cuda/atom_vec_angle_cuda.cu b/lib/cuda/atom_vec_angle_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..29bf2d65f00884deb6c253237f3ae313c610b6aa
--- /dev/null
+++ b/lib/cuda/atom_vec_angle_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+const unsigned int ANGLE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
+
+#include "atom_vec_angle_cuda_cu.h"
+
+void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
+{
+  return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
+}
+
+int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
+}
+
+int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
+
+int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
diff --git a/lib/cuda/atom_vec_angle_cuda_cu.h b/lib/cuda/atom_vec_angle_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..17e8e116878df713661b1ba8439d6a22be4cc7ae
--- /dev/null
+++ b/lib/cuda/atom_vec_angle_cuda_cu.h
@@ -0,0 +1,15 @@
+#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
+#define ATOM_VEC_ANGLE_CUDA_CU_H_
+
+extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
+extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
+extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
+#endif /*ATOM_VEC_ANGLE_CUDA_CU_H_*/
diff --git a/lib/cuda/atom_vec_atomic_cuda.cu b/lib/cuda/atom_vec_atomic_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e53efe9f74803a19ec08c812af928699038690f3
--- /dev/null
+++ b/lib/cuda/atom_vec_atomic_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+const unsigned int ATOMIC_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
+
+#include "atom_vec_atomic_cuda_cu.h"
+
+void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
+{
+  return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
+}
+
+int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
+  return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
+}
+
+int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
+  return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
+  return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
+
+int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
diff --git a/lib/cuda/atom_vec_atomic_cuda_cu.h b/lib/cuda/atom_vec_atomic_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..88b18f311def5964d78259f399792d47e82b0332
--- /dev/null
+++ b/lib/cuda/atom_vec_atomic_cuda_cu.h
@@ -0,0 +1,15 @@
+#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
+#define ATOM_VEC_ATOMIC_CUDA_CU_H_
+
+extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
+extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
+extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
+#endif /*ATOM_VEC_ATOMIC_CUDA_CU_H_*/
diff --git a/lib/cuda/atom_vec_charge_cuda.cu b/lib/cuda/atom_vec_charge_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b2441d770cd5b34526c866601389d0fb97882de1
--- /dev/null
+++ b/lib/cuda/atom_vec_charge_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+const unsigned int CHARGE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
+
+#include "atom_vec_charge_cuda_cu.h"
+
+void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
+{
+  return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
+}
+
+int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
+}
+
+int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
+
+int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
diff --git a/lib/cuda/atom_vec_charge_cuda_cu.h b/lib/cuda/atom_vec_charge_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebae713142bf6b6ffa3ec2277aeb81cf1dc90632
--- /dev/null
+++ b/lib/cuda/atom_vec_charge_cuda_cu.h
@@ -0,0 +1,15 @@
+#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
+#define ATOM_VEC_CHARGE_CUDA_CU_H_
+
+extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
+extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
+extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
+#endif /*ATOM_VEC_CHARGE_CUDA_CU_H_*/
diff --git a/lib/cuda/atom_vec_cuda.cu b/lib/cuda/atom_vec_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..091fb7dbf17a8b171f220c1fc64bf616efe2cba4
--- /dev/null
+++ b/lib/cuda/atom_vec_cuda.cu
@@ -0,0 +1,628 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX atom_vec_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "cuda_wrapper_cu.h"
+#include "crm_cuda_utils.cu"
+
+#include "atom_vec_cuda_kernel.cu"
+
+int AtomVecCuda_CountDataItems(unsigned int data_mask)
+{
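+  // counts per-atom data items for the given mask; must stay consistent with
+  // the item layout written by the pack/unpack kernels in atom_vec_cuda_kernel.cu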
+  int n = 0;
+
+  if(data_mask & X_MASK) n += 3;
+
+  if(data_mask & V_MASK) n += 3;
+
+  if(data_mask & F_MASK) n += 3;
+
+  if(data_mask & TAG_MASK) n++;
+
+  if(data_mask & TYPE_MASK) n++;
+
+  if(data_mask & MASK_MASK) n++;
+
+  if(data_mask & IMAGE_MASK) n++;
+
+  if(data_mask & Q_MASK) n++;
+
+  if(data_mask & MOLECULE_MASK) n++;
+
+  if(data_mask & RMASS_MASK) n++;
+
+  if(data_mask & RADIUS_MASK) n++;
+
+  if(data_mask & DENSITY_MASK) n++;
+
+  if(data_mask & OMEGA_MASK) n += 3;
+
+  if(data_mask & TORQUE_MASK) n += 3; // torque has x,y,z components, like omega
+
+  //if(data_mask & NSPECIAL_MASK) n+=3;
+  return n;
+}
+
+void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+template <const unsigned int data_mask>
+void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(image)   , & sdata->atom.image.dev_data, sizeof(int*));
+
+  if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q)       , & sdata->atom.q    .dev_data, sizeof(F_FLOAT*));
+
+  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule)   , & sdata->atom.molecule.dev_data, sizeof(int*));
+
+  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_AP(radius)   , & sdata->atom.radius.dev_data, sizeof(int*));
+
+  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_AP(density)   , & sdata->atom.density.dev_data, sizeof(int*));
+
+  if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(int*));
+
+  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_AP(omega)   , & sdata->atom.omega.dev_data, sizeof(int*));
+
+  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_AP(nspecial)   , & sdata->atom.nspecial.dev_data, sizeof(int*) );
+  cudaMemcpyToSymbol(MY_AP(flag)    , & sdata->flag, sizeof(int*));
+}
+
+template <const unsigned int data_mask>
+void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
+{
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n");)
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
+  cudaMemcpyToSymbol(MY_AP(prd)   , sdata->domain.prd, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(sublo)   , & sdata->domain.sublo, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(subhi)   , & sdata->domain.subhi, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(flag)   , & sdata->flag, sizeof(int*));
+  cudaThreadSynchronize();
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
+}
+
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+  int size = (n * n_data_items) * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
+    Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
+        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
+
+    if(not sdata->overlap_comm)
+      cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
+
+  }
+
+  return n_data_items * n;
+}
+
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+  int size = (n * n_data_items) * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
+
+    Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
+  }
+
+  return n_data_items * n;
+}
+
+
+template <const unsigned int data_mask>
+void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+  int size = (n * n_data_items) * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    if(not sdata->overlap_comm || iswap < 0)
+      cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
+    Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
+
+  }
+}
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
+{
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n", dim);)
+  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  Cuda_AtomVecCuda_Init<data_mask>(sdata);
+  int size = n * sizeof(double);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
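+  // buffer[0] is an int counter of atoms leaving the subdomain; the kernel
+  // appends their local indices (stored as doubles) starting at buffer[1]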
+  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nlocal, sizeof(int), 256, true);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  timespec time1, time2;
+  clock_gettime(CLOCK_REALTIME, &time1);
+
+  Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");
+
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.comm_exchange_kernel_pack +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+  cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
+  int return_value = ((int*) buf_send)[0];
+
+  if(n > 1 + return_value)
+    cudaMemcpy(buf_send, sdata->buffer, (1 + return_value)*sizeof(double), cudaMemcpyDeviceToHost);
+
+  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");
+
+  clock_gettime(CLOCK_REALTIME, &time1);
+  sdata->cuda_timings.comm_exchange_download +=
+    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n");)
+  return return_value;
+}
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n");)
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
+  int size = (nsend * n_data_items + 1) * sizeof(double);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
+
+  int3 layout = getgrid(nsend, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  timespec time1, time2;
+  clock_gettime(CLOCK_REALTIME, &time1);
+
+  Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");
+
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.comm_exchange_kernel_pack +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+  cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
+
+  clock_gettime(CLOCK_REALTIME, &time1);
+  sdata->cuda_timings.comm_exchange_download +=
+    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n");)
+  return nsend * n_data_items + 1;
+}
+
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
+
+  int size = (nsend * n_data_items + 1) * sizeof(double);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  cudaMemcpyToSymbol(MY_AP(flag)   , & sdata->flag, sizeof(int*));
+
+  cudaMemset((int*)(sdata->flag), 0, sizeof(int));
+
+  if(nsend) {
+    int3 layout = getgrid(nsend, 0);
+    dim3 threads(layout.z, 1, 1);
+    dim3 grid(layout.x, layout.y, 1);
+
+    if(sdata->atom.nlocal > 0) {
+      timespec time1, time2;
+      clock_gettime(CLOCK_REALTIME, &time1);
+
+      cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);
+
+      clock_gettime(CLOCK_REALTIME, &time2);
+      sdata->cuda_timings.comm_exchange_upload +=
+        time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+      Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
+      cudaThreadSynchronize();
+
+      clock_gettime(CLOCK_REALTIME, &time1);
+      sdata->cuda_timings.comm_exchange_kernel_unpack +=
+        time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+      CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
+    }
+  }
+
+  int naccept;
+  cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+
+  return naccept;
+}
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+
+  int size = nsend * n_data_items * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+  }
+
+  int3 layout = getgrid(nsend);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    timespec time1, time2;
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_border_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_border_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+  }
+
+  return nsend * n_data_items;
+}
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+
+  int size = n * n_data_items * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    timespec time1, time2;
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_border_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
+
+  }
+
+  return n * n_data_items;
+}
+
+
+template <const unsigned int data_mask>
+int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
+
+  int size = n * n_data_items * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    timespec time1, time2;
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    cudaMemset((int*)(sdata->flag), 0, sizeof(int));
+    cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_border_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_border_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    cudaMemcpy(&sdata->comm.grow_flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+
+    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
+
+  }
+
+  return sdata->comm.grow_flag;
+}
+
+
+#include "atom_vec_angle_cuda.cu"
+#include "atom_vec_atomic_cuda.cu"
+#include "atom_vec_charge_cuda.cu"
+#include "atom_vec_full_cuda.cu"
+//#include "atom_vec_granular_cuda.cu"
diff --git a/lib/cuda/atom_vec_cuda_cu.h b/lib/cuda/atom_vec_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/cuda/atom_vec_cuda_kernel.cu b/lib/cuda/atom_vec_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5e2f6a974fc74ddf0e787c7ad24b97d17c6999c3
--- /dev/null
+++ b/lib/cuda/atom_vec_cuda_kernel.cu
@@ -0,0 +1,512 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#define RIMLARGER 1.000001
+#define RIMSMALLER 0.999999
+#define SMALL 1e-5
+
+extern __shared__ int shared[];
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
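+  // buffer uses a struct-of-arrays layout: data item k of atom i lands at buffer[i + k*n]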
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1; // out-of-range sendlist entry: signal an error to the host
+
+    int k = 0;
+
+    if(data_mask & X_MASK) {
+      ((X_FLOAT*) buffer)[i + k * n] = _x[j] + dx;
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
+      k++;
+    }
+
+    if(data_mask & V_MASK) {
+      ((X_FLOAT*) buffer)[i + k * n] = _v[j];
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _v[j + _nmax];
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
+      k++;
+    }
+
+    if(data_mask & OMEGA_MASK) {
+      ((X_FLOAT*) buffer)[i + k * n] = _omega[j];
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
+      k++;
+      ((X_FLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
+      k++;
+    }
+
+    // advance k only for items actually packed, so offsets match AtomVecCuda_CountDataItems
+    if(data_mask & RADIUS_MASK) {
+      ((X_FLOAT*) buffer)[i + k * n] = _radius[j];
+      k++;
+    }
+
+    if(data_mask & RMASS_MASK) {
+      ((X_FLOAT*) buffer)[i + k * n] = _rmass[j];
+      k++;
+    }
+  }
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(data_mask & X_MASK) {
+      _x[i + first] = _x[j] + dx;
+      _x[i + first + _nmax] = _x[j + _nmax] + dy;
+      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i + first] = _v[j];
+      _v[i + first + _nmax] = _v[j + _nmax];
+      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i + first] = _omega[j];
+      _omega[i + first + _nmax] = _omega[j + _nmax];
+      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
+    }
+
+    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];
+
+    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
+  }
+}
+
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    int k = 0;
+
+    if(data_mask & X_MASK) {
+      _x[i + first] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i + first] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+    }
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i + first] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _omega[i + first + _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+      _omega[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+    }
+
+    // advance k only for items actually unpacked, mirroring the pack kernel
+    if(data_mask & RADIUS_MASK) {
+      _radius[i + first] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+    }
+
+    if(data_mask & RMASS_MASK) {
+      _rmass[i + first] = ((X_FLOAT*) buffer)[i + k * n];
+      k++;
+    }
+  }
+}
+
+
+__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
+{
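+  // block-level stream compaction: each thread flags whether its atom leaves
+  // the subdomain along 'dim'; thread 0 turns the flags into running counts in
+  // shared memory and reserves this block's range in the output buffer with a
+  // single atomicAdd on the counter at _buffer[0]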
+  double* buf = (double*) _buffer;
+  buf = &buf[1];
+
+  //X_FLOAT lo=slablo[iswap];
+  //X_FLOAT hi=slabhi[iswap];
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  bool add = false;
+
+  if(i < _nlocal) {
+    double xdim_tmp = static_cast <double>(_x[i + dim * _nmax]);
+
+    if(xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) {
+      add = true;
+    }
+  }
+
+  shared[threadIdx.x] = add ? 1 : 0;
+  __syncthreads();
+  int nsend = 0;
+
+  if(threadIdx.x == 0) {
+    for(int k = 0; k < blockDim.x; k++) {
+      if(shared[k]) {
+        nsend++;
+        shared[k] = nsend;
+      }
+    }
+
+    shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+  }
+
+  __syncthreads();
+
+  nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+  if(add && nsend + 1 < n)
+    buf[nsend] = i;
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
+{
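+  // buf[1+k] holds the local index (written by PackExchangeList) of the k-th
+  // leaving atom; its data items follow at stride nsend, and the vacated slot
+  // is then backfilled with atom copylist[k]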
+  double* buf = (double*) _buffer;
+  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(k >= nsend) return;
+
+  buf = &buf[1 + k];
+
+  int i = static_cast <int>(buf[0]);
+  int j = copylist[k];
+
+  int m = 1;
+
+  if(data_mask & X_MASK) {
+    buf[(m++)*nsend] = static_cast <double>(_x[i]);
+    buf[(m++)*nsend] = static_cast <double>(_x[i + _nmax]);
+    buf[(m++)*nsend] = static_cast <double>(_x[i + 2 * _nmax]);
+  }
+
+  if(data_mask & V_MASK) {
+    buf[(m++)*nsend] = _v[i];
+    buf[(m++)*nsend] = _v[i + _nmax];
+    buf[(m++)*nsend] = _v[i + 2 * _nmax];
+  }
+
+  if(data_mask & TAG_MASK) 		buf[(m++)*nsend] = _tag[i];
+
+  if(data_mask & TYPE_MASK) 	buf[(m++)*nsend] = _type[i];
+
+  if(data_mask & MASK_MASK) 	buf[(m++)*nsend] = _mask[i];
+
+  if(data_mask & IMAGE_MASK) 	buf[(m++)*nsend] = _image[i];
+
+  if(data_mask & Q_MASK) 		buf[(m++)*nsend] = _q[i];
+
+  if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];
+
+  if(data_mask & RADIUS_MASK) 	buf[(m++)*nsend] = _radius[i];
+
+  if(data_mask & DENSITY_MASK) 	buf[(m++)*nsend] = _density[i];
+
+  if(data_mask & RMASS_MASK) 	buf[(m++)*nsend] = _rmass[i];
+
+  if(data_mask & OMEGA_MASK) {
+    buf[(m++)*nsend] = _omega[i];
+    buf[(m++)*nsend] = _omega[i + _nmax];
+    buf[(m++)*nsend] = _omega[i + 2 * _nmax];
+  }
+
+  /*  if(data_mask & NSPECIAL_MASK)
+    {
+    	buf[(m++)*nsend] = _nspecial[i];
+    	buf[(m++)*nsend] = _nspecial[i+_nmax];
+    	buf[(m++)*nsend] = _nspecial[i+2* _nmax];
+    }*/
+
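+  // Backfill: only holes inside the local range are refilled; atom j from
+  // copylist overwrites the departing atom i so the local arrays stay compact.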
+  if(i >= _nlocal) return;
+
+  if(data_mask & X_MASK) {
+    _x[i] = _x[j];
+    _x[i + _nmax] = _x[j + _nmax];
+    _x[i + 2 * _nmax] = _x[j + 2 * _nmax];
+  }
+
+  if(data_mask & V_MASK) {
+    _v[i] = _v[j];
+    _v[i + _nmax] = _v[j + _nmax];
+    _v[i + 2 * _nmax] = _v[j + 2 * _nmax];
+  }
+
+  if(data_mask & TAG_MASK)		_tag[i] 	= _tag[j];
+
+  if(data_mask & TYPE_MASK)		_type[i] 	= _type[j];
+
+  if(data_mask & MASK_MASK)		_mask[i] 	= _mask[j];
+
+  if(data_mask & IMAGE_MASK)	_image[i] 	= _image[j];
+
+  if(data_mask & Q_MASK) 		_q[i] 		= _q[j];
+
+  if(data_mask & MOLECULE_MASK) _molecule[i] = _molecule[j];
+
+  if(data_mask & RADIUS_MASK) 	_radius[i] 	= _radius[j];
+
+  if(data_mask & DENSITY_MASK) 	_density[i] = _density[j];
+
+  if(data_mask & RMASS_MASK) 	_rmass[i] 	= _rmass[j];
+
+  if(data_mask & OMEGA_MASK) {
+    _omega[i] = _omega[j];
+    _omega[i + _nmax] = _omega[j + _nmax];
+    _omega[i + 2 * _nmax] = _omega[j + 2 * _nmax];
+  }
+
+  /* if(data_mask & NSPECIAL_MASK)
+  {
+  _nspecial[i] = _nspecial[j];
+  _nspecial[i+_nmax] = _nspecial[j+_nmax];
+  _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
+  }*/
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* copylist)
+{
+  double* buf = (double*) _buffer;
+  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(k >= nsend) return;
+
+  buf = &buf[1 + k];
+  int i = -1;
+  double xdim_tmp = buf[(1 + dim) * nsend];
+
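+  // Accept the atom only if its coordinate along dim lies inside this subdomain
+  // (with SMALL tolerance); each accepted atom reserves the next free slot past
+  // _nlocal via atomicAdd on _flag, and copylist records that slot (or -1).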
+  if(xdim_tmp >= _sublo[dim] - SMALL && xdim_tmp < _subhi[dim] + SMALL) {
+    i = atomicAdd(_flag, 1) + _nlocal;
+
+    int m = 1;
+
+    if(data_mask & X_MASK) {
+      _x[i] = buf[(m++) * nsend];
+      _x[i + _nmax] = buf[(m++) * nsend];
+      _x[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i] = buf[(m++) * nsend];
+      _v[i + _nmax] = buf[(m++) * nsend];
+      _v[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    if(data_mask & TAG_MASK) 	_tag[i] = buf[(m++) * nsend];
+
+    if(data_mask & TYPE_MASK) 	_type[i] = buf[(m++) * nsend];
+
+    if(data_mask & MASK_MASK) 	_mask[i] = buf[(m++) * nsend];
+
+    if(data_mask & IMAGE_MASK) _image[i] = buf[(m++) * nsend];
+
+    if(data_mask & Q_MASK) _q[i] = buf[(m++) * nsend];
+
+    if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++) * nsend];
+
+    if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++) * nsend];
+
+    if(data_mask & DENSITY_MASK) _density[i] = buf[(m++) * nsend];
+
+    if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++) * nsend];
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i] = buf[(m++) * nsend];
+      _omega[i + _nmax] = buf[(m++) * nsend];
+      _omega[i + 2 * _nmax] = buf[(m++) * nsend];
+    }
+
+    /*  if(data_mask & NSPECIAL_MASK)
+      {
+       _nspecial[i] = buf[(m++)*nsend];
+       _nspecial[i+_nmax] = buf[(m++)*nsend];
+       _nspecial[i+2*_nmax] = buf[(m++)*nsend];
+      }*/
+  }
+
+  copylist[k] = i;
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+    int m = 0;
+
+    if(data_mask & X_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & TAG_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _tag[j];
+
+    if(data_mask & TYPE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _type[j];
+
+    if(data_mask & MASK_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _mask[j];
+
+    if(data_mask & Q_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _q[j];
+
+    if(data_mask & MOLECULE_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _molecule[j];
+
+    if(data_mask & RADIUS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _radius[j];
+
+    if(data_mask & DENSITY_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _density[j];
+
+    if(data_mask & RMASS_MASK)((X_FLOAT*) _buffer)[i + (m++)*n] = _rmass[j];
+
+    if(data_mask & OMEGA_MASK) {
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j + _nmax];
+      ((X_FLOAT*) _buffer)[i + (m++)*n] = _omega[j + 2 * _nmax];
+    }
+  }
+}
+
+
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(data_mask & X_MASK) {
+      _x[i + first] = _x[j] + dx;
+      _x[i + first + _nmax] = _x[j + _nmax] + dy;
+      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    }
+
+    if(data_mask & V_MASK) {
+      _v[i + first] = _v[j];
+      _v[i + first + _nmax] = _v[j + _nmax];
+      _v[i + first + 2 * _nmax] =  _v[j + 2 * _nmax];
+    }
+
+    if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];
+
+    if(data_mask & TYPE_MASK) _type[i + first] = _type[j];
+
+    if(data_mask & MASK_MASK) _mask[i + first] = _mask[j];
+
+    if(data_mask & Q_MASK) _q[i + first] = _q[j];
+
+    if(data_mask & MOLECULE_MASK) _molecule[i + first] = _molecule[j];
+
+    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];
+
+    if(data_mask & DENSITY_MASK) _density[i + first] = _density[j];
+
+    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
+
+    if(data_mask & OMEGA_MASK) {
+      _omega[i + first] = _omega[j];
+      _omega[i + first + _nmax] = _omega[j + _nmax];
+      _omega[i + first + 2 * _nmax] =  _omega[j + 2 * _nmax];
+    }
+  }
+}
+
+template <const unsigned int data_mask>
+__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    if(i + first < _nmax) {
+      int m = 0;
+
+      if(data_mask & X_MASK) {
+        _x[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _x[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _x[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+
+      if(data_mask & V_MASK) {
+        _v[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _v[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _v[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+
+      if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & Q_MASK) _q[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_FLOAT*) _buffer)[i + (m++) * n]);
+
+      if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & DENSITY_MASK) _density[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+
+      if(data_mask & OMEGA_MASK) {
+        _omega[i + first] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _omega[i + first + _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+        _omega[i + first + 2 * _nmax] = ((X_FLOAT*) _buffer)[i + (m++) * n];
+      }
+    } else {
+      _flag[0] = 1;
+    }
+  }
+}
+
+
diff --git a/lib/cuda/atom_vec_full_cuda.cu b/lib/cuda/atom_vec_full_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3128f2cf7d91f0fa60117a04afa34ad8da7f2548
--- /dev/null
+++ b/lib/cuda/atom_vec_full_cuda.cu
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+const unsigned int FULL_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
+
+#include "atom_vec_full_cuda_cu.h"
+
+void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
+{
+  return Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
+}
+
+int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata, n, dim, buf_send);
+}
+
+int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata, nsend, buf_send, copylist);
+}
+
+int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata, n, iswap, first, pbc, pbc_flag);
+}
+
+int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
+
+int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  const unsigned int data_mask = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK;
+  return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata, n, first, buf_recv);
+}
diff --git a/lib/cuda/atom_vec_full_cuda_cu.h b/lib/cuda/atom_vec_full_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..811d2df624dd78eb1fcd5551a4c556d3859ed515
--- /dev/null
+++ b/lib/cuda/atom_vec_full_cuda_cu.h
@@ -0,0 +1,15 @@
+#ifndef ATOM_VEC_FULL_CUDA_CU_H_
+#define ATOM_VEC_FULL_CUDA_CU_H_
+
+extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
+extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
+extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
+extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
+#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/
diff --git a/lib/cuda/binning_kernel.cu b/lib/cuda/binning_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd40e3f0b940934d9f413da1b545016519d02ad0
--- /dev/null
+++ b/lib/cuda/binning_kernel.cu
@@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// load some variables from shared cuda data into device's constant memory:
+__device__ __constant__ X_FLOAT rez_bin_size[3];
+__device__ __constant__ unsigned* bin_error_count;
+
+__device__ __constant__ int cuda_dummy_type;
+__device__ __constant__ unsigned binned_size_all;
+__device__ __constant__ X_FLOAT outside[3];
+
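+// PreBinning_Kernel appears to seed every bin slot with a dummy atom: type
+// cuda_dummy_type, tag -1, and coordinates placed outside the subdomain via
+// _subhi + outside*(1 + i), so slots never filled by Binning_Kernel stay inert.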
+__global__ void PreBinning_Kernel()
+{
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(bin < gridDim.x * gridDim.y) { // TODO: this guard is suspected to always hold, since bin is computed from the launch grid
+    _binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type;
+
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+    X_FLOAT* binned_x = _binned_x + i;
+    *binned_x = _subhi[0] + outside[0] * (1 + i);
+    binned_x += blockDim.x;
+    *binned_x = _subhi[1] + outside[1] * (1 + i);
+    binned_x += blockDim.x;
+    *binned_x = _subhi[2] + outside[2] * (1 + i);
+    _binned_tag[i] = -1;
+  }
+}
+
+__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag)
+{
+  const unsigned i = blockDim.x * blockIdx.x + threadIdx.x + offset;
+
+  int binatoms = _natoms;
+
+  if(offset == 0) binatoms = _nlocal;
+
+  if(i < binatoms) {
+    // copy atom position from global device memory to local registers
+    // in three steps to get as much coalesced access as possible
+    X_FLOAT my_xX, my_xY, my_xZ;
+    x += i;
+    my_xX = *x;
+    x += _nmax;
+    my_xY = *x;
+    x += _nmax;
+    my_xZ = *x;
+    //my_xX=x[i];
+    //my_xY=x[i+_nmax];
+    //my_xZ=x[i+2*_nmax];
+
+
+    // calculate flat bin index
+    int bx = __float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0])) + 2;
+    int by = __float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1])) + 2;
+    int bz = __float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2])) + 2;
+
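+    // Branch-free clamp: negativCUDA(x) presumably returns 1 for x < 0 and 0
+    // otherwise, so the next six lines clamp bx/by/bz into [0, _bin_dim-1]
+    // without divergence.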
+    bx -= bx * negativCUDA(1.0f * bx);
+    bx -= (bx - _bin_dim.x + 1) * negativCUDA(1.0f * _bin_dim.x - 1.0f - 1.0f * bx);
+    by -= by * negativCUDA(1.0f * by);
+    by -= (by - _bin_dim.y + 1) * negativCUDA(1.0f * _bin_dim.y - 1.0f - 1.0f * by);
+    bz -= bz * negativCUDA(1.0f * bz);
+    bz -= (bz - _bin_dim.z + 1) * negativCUDA(1.0f * _bin_dim.z - 1.0f - 1.0f * bz);
+
+
+    const unsigned j = _bin_dim.z * (_bin_dim.y * bx + by) + bz;
+
+    // add new atom to bin, get bin-array position
+    const unsigned k = atomicAdd(& _bin_count_all[j], 1);
+
+    if(offset == 0) atomicAdd(& _bin_count_local[j], 1);
+
+    if(k < _bin_nmax) {
+      // copy register values back to global device memory
+      unsigned pos = 3 * _bin_nmax * j + k;
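+      // Layout: each bin owns 3*_bin_nmax consecutive X_FLOATs (x-, y- and
+      // z-columns of length _bin_nmax), so component c of slot k lives at
+      // pos + c*_bin_nmax.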
+      _binpos[i] = pos;
+      binned_x += pos;
+      *binned_x = my_xX;
+      binned_x += _bin_nmax;
+      *binned_x = my_xY;
+      binned_x += _bin_nmax;
+      *binned_x = my_xZ;
+
+      // also copy velocity and force accordingly
+
+      binned_x  = _binned_v + pos;
+      x  = _v + i;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+
+      binned_x  = _binned_f + pos;
+      x  = _f + i;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+      binned_x += _bin_nmax;
+      x += _nmax;
+      *binned_x = *x;
+
+      pos = _bin_nmax * j + k;
+      _binned_type [pos] = _type[i];
+      _binned_tag  [pos] = _tag[i];
+
+      if(rmass_flag)
+        _binned_rmass[pos] = _rmass[i];
+
+      if(q_flag)
+        _binned_q    [pos] = _q[i];
+    } else {
+      // normally, this should not happen: count the overflow so the host can detect it
+      atomicAdd(bin_error_count, 1);
+      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %u ignored, no place left in bin %u\n", i, j);)
+    }
+  }
+}
+
+__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag)
+{
+  const unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  if(i < _nlocal) {
+    unsigned bin_pos3 = _binpos[i];
+    unsigned bin_pos = bin_pos3 / (3 * _bin_nmax);
+    bin_pos *= _bin_nmax;
+    bin_pos += bin_pos3 - bin_pos * 3;
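+    // bin_pos3 = 3*_bin_nmax*bin + k, so the three lines above recover
+    // bin_pos = _bin_nmax*bin + k, the slot index in the per-bin scalar
+    // arrays (_binned_type, _binned_tag, _binned_q).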
+
+    binned_x  = _binned_x + bin_pos3;
+    x  = x + i;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+
+    binned_x  = _binned_v + bin_pos3;
+    x  = _v + i;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+
+    binned_x  = _binned_f + bin_pos3;
+    x  = _f + i;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+    binned_x += _bin_nmax;
+    x += _nmax;
+    *x = *binned_x;
+
+
+    _type[i] = _binned_type[bin_pos];
+    _tag[i] = _binned_tag[bin_pos];
+
+    if(q_flag) _q[i] = _binned_q[bin_pos];
+  }
+}
diff --git a/lib/cuda/comm_cuda.cu b/lib/cuda/comm_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ca2d63cacf82c4ad1c6c5f5a8acca6f7668398c
--- /dev/null
+++ b/lib/cuda/comm_cuda.cu
@@ -0,0 +1,539 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX comm_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "comm_cuda_cu.h"
+#include "comm_cuda_kernel.cu"
+#include <ctime>
+
+void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
+{
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+
+void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+}
+
+
+void Cuda_CommCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_CommCuda_UpdateNmax(sdata);
+  int ntypesp = sdata->atom.ntypes + 1;
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes)   , &ntypesp, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(prd)   , sdata->domain.prd, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(flag)  , &sdata->flag, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(debugdata)  , &sdata->debugdata, sizeof(int*));
+}
+
+int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
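+  // Image shift: for orthogonal boxes dx/dy/dz are multiples of the box
+  // lengths; for triclinic boxes the tilt factors xy, xz and yz contribute too.
+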
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
+        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+    if(not sdata->overlap_comm)
+      cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+  }
+
+  return 3 * n;
+}
+
+int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_PackCommVel_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
+        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_pack +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+    if(not sdata->overlap_comm)
+      cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_download +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+    if(aflag != 0) printf("aflag PackCommVel: %i\n", aflag);
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
+
+  }
+
+  return 6 * n;
+}
+
+int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+  }
+
+  return 3 * n;
+}
+
+int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    Cuda_CommCuda_PackCommVel_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_kernel_self +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+  }
+
+  return 6 * n;
+}
+
+void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    if(not sdata->overlap_comm || iswap < 0)
+      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
+
+  }
+}
+
+void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
+{
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 6 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    clock_gettime(CLOCK_REALTIME, &time1);
+
+    if(not sdata->overlap_comm || iswap < 0)
+      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+    clock_gettime(CLOCK_REALTIME, &time2);
+    sdata->cuda_timings.comm_forward_upload +=
+      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
+    Cuda_CommCuda_UnpackCommVel_Kernel <<< grid, threads, 0>>>(n, first, buf);
+    cudaThreadSynchronize();
+
+    clock_gettime(CLOCK_REALTIME, &time1);
+    sdata->cuda_timings.comm_forward_kernel_unpack +=
+      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
+
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
+
+  }
+}
+
+int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+
+  F_FLOAT* buf = (F_FLOAT*)buf_send;
+  F_FLOAT* f_dev = (F_FLOAT*)sdata->atom.f.dev_data;
+  f_dev += first;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  buf += n;
+  f_dev += sdata->atom.nmax;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  buf += n;
+  f_dev += sdata->atom.nmax;
+  cudaMemcpy(buf, f_dev, n * sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  return n * 3;
+}
+
+
+void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemcpy(sdata->buffer, buf_recv, size, cudaMemcpyHostToDevice);
+    Cuda_CommCuda_UnpackReverse_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
+  }
+}
+
+void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_CommCuda_UnpackReverse_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, first);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
+
+  }
+}
+
+
+int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
+{
+  MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
+  timespec time1, time2;
+
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new or (80 > sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata, 10);
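+  // The first int of sdata->buffer serves as the global send counter for the
+  // sendlist-build kernels below; the 80-byte / n=10 values appear to be
+  // arbitrary small sizes that merely guarantee at least sizeof(int) of scratch.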
+
+  int n;
+
+  if(!bordergroup || ineed >= 2)
+    n = nlast - nfirst + 1;
+  else {
+    n = atom_nfirst;
+
+    if(nlast - sdata->atom.nlocal + 1 > n) n = nlast - sdata->atom.nlocal + 1;
+  }
+
+  int3 layout = getgrid(n, 0, 512, true);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x + 1, layout.y, 1);
+
+
+  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &time1);
+
+  if(style == 1)
+    Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.slablo.dev_data, (X_FLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
+  else
+    Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_FLOAT*) sdata->comm.multilo.dev_data, (X_FLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
+
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.comm_border_kernel_buildlist +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+  CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
+  int nsend;
+  cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
+  return nsend;
+
+
+}
+
diff --git a/lib/cuda/comm_cuda_cu.h b/lib/cuda/comm_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d5ac1560ca69d4e4fc3d9e02c2a3568f6ac79048
--- /dev/null
+++ b/lib/cuda/comm_cuda_cu.h
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
+extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
+extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
+extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
+extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send);
+extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv);
+extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first);
+extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap);
diff --git a/lib/cuda/comm_cuda_kernel.cu b/lib/cuda/comm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f87b3af5406d10bfb91eb2dc7ff54e34f6068dd4
--- /dev/null
+++ b/lib/cuda/comm_cuda_kernel.cu
@@ -0,0 +1,394 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1; // flag out-of-range sendlist entries
+
+    ((X_FLOAT*) buffer)[i] = _x[j] + dx;
+    ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
+    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
+  }
+}
+
+__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1; // flag out-of-range sendlist entries
+
+    ((X_FLOAT*) buffer)[i] = _x[j] + dx;
+    ((X_FLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
+    ((X_FLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
+    ((X_FLOAT*) buffer)[i + 3 * n] = _v[j];
+    ((X_FLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
+    ((X_FLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
+  }
+}
+
+__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    _x[i + first] = _x[j] + dx;
+    _x[i + first + _nmax] = _x[j + _nmax] + dy;
+    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+  }
+}
+
+__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    _x[i + first] = _x[j] + dx;
+    _x[i + first + _nmax] = _x[j + _nmax] + dy;
+    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
+    _v[i + first] = _v[j];
+    _v[i + first + _nmax] = _v[j + _nmax];
+    _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
+  }
+}
+
+__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    _x[i + first] = ((X_FLOAT*) buffer)[i];
+    _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
+    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
+  }
+}
+
+
+__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    _x[i + first] = ((X_FLOAT*) buffer)[i];
+    _x[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 1 * n];
+    _x[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 2 * n];
+    _v[i + first] = ((X_FLOAT*) buffer)[i + 3 * n];
+    _v[i + first + _nmax] = ((X_FLOAT*) buffer)[i + 4 * n];
+    _v[i + first + 2 * _nmax] = ((X_FLOAT*) buffer)[i + 5 * n];
+  }
+}
+
+__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    ((F_FLOAT*) _buffer)[i] = _f[i + first];
+    ((F_FLOAT*) _buffer)[i + n] = _f[i + first + _nmax];
+    ((F_FLOAT*) _buffer)[i + 2 * n] = _f[i + first + 2 * _nmax];
+  }
+
+}
+
+__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int maxlistlength, int iswap)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+    _f[j] += ((F_FLOAT*)_buffer)[i];
+    _f[j + _nmax] += ((F_FLOAT*) _buffer)[i + n];
+    _f[j + 2 * _nmax] += ((F_FLOAT*) _buffer)[i + 2 * n];
+  }
+
+}
+
+__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    _f[j] += _f[i + first];
+    _f[j + _nmax] += _f[i + first + _nmax];
+    _f[j + 2 * _nmax] += _f[i + first + 2 * _nmax];
+  }
+
+}
+
+extern __shared__ int shared[];
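+// Dynamic shared memory layout for the sendlist-build kernels:
+// shared[0..blockDim.x-1] holds each thread's flag (later its 1-based rank
+// among the block's flagged atoms) and shared[blockDim.x] the block's global
+// offset, which is why the launches reserve (threads.x + 1)*sizeof(int).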
+
+__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
+    int nfirst, int nlast, int dim, int iswap, X_FLOAT* slablo, X_FLOAT* slabhi, int* sendlist, int maxlistlength)
+{
+  int* list = sendlist + iswap * maxlistlength;
+  X_FLOAT lo = slablo[iswap];
+  X_FLOAT hi = slabhi[iswap];
+  bool add = false;
+
+  if(!bordergroup || ineed >= 2) {
+    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
+
+    if(i < nlast)
+      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
+        add = true;
+      }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    int nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+
+  } else {
+
+    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+    if(i < atom_nfirst)
+      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
+        add = true;
+      }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    int nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+    __syncthreads();
+
+    add = false;
+    i += _nlocal;
+
+    if(i < nlast)
+      if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
+        add = true;
+      }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+  }
+}
+
+
+__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
+    , int nfirst, int nlast, int dim, int iswap, X_FLOAT* multilo, X_FLOAT* multihi, int* sendlist, int maxlistlength)
+{
+  int* list = sendlist + iswap * maxlistlength;
+  X_FLOAT* mlo = &multilo[iswap * _cuda_ntypes];
+  X_FLOAT* mhi = &multihi[iswap * _cuda_ntypes];
+  int itype = 0;
+  bool add = false;
+
+  if(!bordergroup || ineed >= 2) {
+    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
+
+    if(i < nlast) {
+      itype = _type[i];
+
+      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
+        add = true;
+      }
+    }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    int nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+
+  } else {
+
+    int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+    if(i < atom_nfirst) {
+      itype = _type[i];
+
+      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
+        add = true;
+      }
+    }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    int nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+    __syncthreads();
+
+    add = false;
+    i += _nlocal;
+
+    if(i < nlast) {
+      itype = _type[i];
+
+      if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
+        add = true;
+      }
+    }
+
+    shared[threadIdx.x] = add ? 1 : 0;
+
+    __syncthreads();
+
+    nsend = 0;
+
+    if(threadIdx.x == 0) {
+      for(int k = 0; k < blockDim.x; k++) {
+        if(shared[k]) {
+          nsend++;
+          shared[k] = nsend;
+        }
+      }
+
+      shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
+    }
+
+    __syncthreads();
+
+    nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
+
+    if(add && nsend < maxlistlength)
+      list[nsend] = i;
+
+  }
+}
diff --git a/lib/cuda/compute_temp_cuda.cu b/lib/cuda/compute_temp_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ece4cf93a9fed1f2a12d023f0f9bb7119df179da
--- /dev/null
+++ b/lib/cuda/compute_temp_cuda.cu
@@ -0,0 +1,126 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX compute_temp_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "compute_temp_cuda_cu.h"
+#include "compute_temp_cuda_kernel.cu"
+
+void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
+
+  if(sdata->atom.rmass_flag)
+    cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
+
+  cudaMemcpyToSymbol(MY_AP(rmass_flag)   , & sdata->atom.rmass_flag, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type    .dev_data, sizeof(int*));
+}
+
+void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_ComputeTempCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
+{
+  // this compute is most likely not called every timestep, so refresh the device constants unconditionally instead of relying on the update flags:
+  Cuda_ComputeTempCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
+
+    int oldgrid = grid.x * grid.y;
+    grid.x = 6;
+    grid.y = 1;
+    threads.x = 512;
+    Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
+  }
+}
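+
+// The temperature computes use a two-pass reduction: the first kernel writes
+// one partial sum per block (six tensor components for the vector variant)
+// into sdata->buffer, and Cuda_ComputeTempCuda_Reduce_Kernel then folds the
+// 'oldgrid' partials of each component into the result t.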
+
+void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t)
+{
+  // this compute is most likely not called every timestep, so refresh the device constants unconditionally instead of relying on the update flags:
+  Cuda_ComputeTempCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempCuda_UpdateBuffer(sdata);
+  MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n", sdata->atom.nlocal);)
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
+    Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
+
+    int oldgrid = grid.x * grid.y;
+    grid.x = 1;
+    grid.y = 1;
+    threads.x = 512;
+    Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
+  }
+}
diff --git a/lib/cuda/compute_temp_cuda_cu.h b/lib/cuda/compute_temp_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..9ab43d727adb6b5928879a0884109bb4835c5500
--- /dev/null
+++ b/lib/cuda/compute_temp_cuda_cu.h
@@ -0,0 +1,28 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
+extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t);
diff --git a/lib/cuda/compute_temp_cuda_kernel.cu b/lib/cuda/compute_temp_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..79562a0e28fdb7636477be3f0e238f22bf6f92cf
--- /dev/null
+++ b/lib/cuda/compute_temp_cuda_kernel.cu
@@ -0,0 +1,118 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ ENERGY_FLOAT sharedmem[];
+
+
+__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+
+  if(i < _nlocal) {
+    if(_rmass_flag) {
+      if(_mask[i] & groupbit)
+        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * _rmass[i];
+    } else {
+      if(_mask[i] & groupbit)
+        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * (_mass[_type[i]]);
+    }
+  }
+
+  reduceBlock(sharedmem);
+  ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
+  }
+}
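+
+// _v is stored as a structure of arrays: _v[i] is the x component of the
+// velocity of atom i, _v[i + _nmax] the y component and _v[i + 2 * _nmax]
+// the z component, which explains the strided accesses above.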
+
+__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      V_FLOAT massone;
+
+      if(_rmass_flag) massone = _rmass[i];
+      else massone = _mass[_type[i]];
+
+      sharedmem[threadIdx.x] = massone * _v[i] * _v[i];
+      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax];
+      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax];
+      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax];
+      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax];
+      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax];
+    }
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  reduceBlock(&sharedmem[3 * blockDim.x]);
+  reduceBlock(&sharedmem[4 * blockDim.x]);
+  reduceBlock(&sharedmem[5 * blockDim.x]);
+  ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
+    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
+  }
+}
+
+
+__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  ENERGY_FLOAT myforig = 0.0;
+  ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
+  buf = &buf[blockIdx.x * n];
+
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    t[blockIdx.x] = myforig;
+}
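+
+// Each block of the reduce kernel walks its stretch of _buffer in tiles of
+// blockDim.x values, reduces every tile in shared memory, and thread 0
+// accumulates the tile results in myforig before writing the final sum for
+// its component to t[blockIdx.x].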
diff --git a/lib/cuda/compute_temp_partial_cuda.cu b/lib/cuda/compute_temp_partial_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bc78592640d2086c6877edaefa3928592e0dc468
--- /dev/null
+++ b/lib/cuda/compute_temp_partial_cuda.cu
@@ -0,0 +1,164 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX compute_temp_partial_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "compute_temp_partial_cuda_cu.h"
+#include "compute_temp_partial_cuda_kernel.cu"
+
+void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
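+
+// Sizing example (with ENERGY_FLOAT = double): for nlocal = 10000 the formula
+// reserves (10000 + 63) / 64 = 157 blocks' worth of space, i.e.
+// 157 * 6 * 8 = 7536 bytes - six partial sums (the diagonal and off-diagonal
+// components of the kinetic energy tensor) for every block launched by the
+// compute kernels below.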
+
+void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*));
+
+  if(sdata->atom.rmass_flag)
+    cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
+
+  cudaMemcpyToSymbol(MY_AP(rmass_flag)   , & sdata->atom.rmass_flag, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type    .dev_data, sizeof(int*));
+}
+
+void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
+{
+  //if(sdata->atom.update_nmax) // this compute is most likely not called every timestep, so the constants are refreshed unconditionally
+  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
+
+    int oldgrid = grid.x * grid.y;
+    grid.x = 6;
+    grid.y = 1;
+    threads.x = 512;
+    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
+  }
+}
+
+void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag)
+{
+  //if(sdata->atom.update_nmax) // this compute is most likely not called every timestep, so the constants are refreshed unconditionally
+  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
+  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
+    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (groupbit, xflag, yflag, zflag);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
+
+    int oldgrid = grid.x * grid.y;
+    grid.x = 1;
+    grid.y = 1;
+    threads.x = 512;
+    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>> (oldgrid, t);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
+  }
+}
+
+void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
+{
+  //if(sdata->atom.update_nmax) // this compute is most likely not called every timestep, so the constants are refreshed unconditionally
+  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
+  }
+}
+
+void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
+{
+  //if(sdata->atom.update_nmax) // this compute is most likely not called every timestep, so the constants are refreshed unconditionally
+  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
+  //if(sdata->atom.update_nlocal)
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  //if(sdata->buffer_new)
+  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_FLOAT*) vbiasall);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
+  }
+}
diff --git a/lib/cuda/compute_temp_partial_cuda_cu.h b/lib/cuda/compute_temp_partial_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..00fc8a7c3699d43b2cdca76f0e6ec415e45839d4
--- /dev/null
+++ b/lib/cuda/compute_temp_partial_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
+extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_FLOAT* t, int xflag, int yflag, int zflag);
+extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
+extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
diff --git a/lib/cuda/compute_temp_partial_cuda_kernel.cu b/lib/cuda/compute_temp_partial_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ec0fff334f8d1b4c2e8c048b0f880548d84ac77e
--- /dev/null
+++ b/lib/cuda/compute_temp_partial_cuda_kernel.cu
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ ENERGY_FLOAT sharedmem[];
+
+
+__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+
+  if(i < _nlocal) {
+    if(_rmass_flag) {
+      if(_mask[i] & groupbit)
+        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * _rmass[i];
+    } else {
+      if(_mask[i] & groupbit)
+        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * (_mass[_type[i]]);
+    }
+  }
+
+  reduceBlock(sharedmem);
+  ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+  }
+}
+
+__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xflag, int yflag, int zflag)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      V_FLOAT massone;
+
+      if(_rmass_flag) massone = _rmass[i];
+      else massone = _mass[_type[i]];
+
+      sharedmem[threadIdx.x] = massone * _v[i] * _v[i] * xflag;
+      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax] * yflag;
+      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag;
+      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax] * xflag * yflag;
+      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax] * xflag * zflag;
+      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax] * yflag * zflag;
+    }
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  reduceBlock(&sharedmem[3 * blockDim.x]);
+  reduceBlock(&sharedmem[4 * blockDim.x]);
+  reduceBlock(&sharedmem[5 * blockDim.x]);
+  ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
+  }
+}
+
+
+__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_FLOAT* t)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  ENERGY_FLOAT myforig = 0.0;
+  ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
+  buf = &buf[blockIdx.x * n];
+
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    t[blockIdx.x] = myforig;
+}
+
+__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      if(!xflag) {
+        vbiasall[i] = _v[i];
+        _v[i] = V_F(0.0);
+      }
+
+      if(!yflag) {
+        vbiasall[i + _nmax] = _v[i + _nmax];
+        _v[i + _nmax] = V_F(0.0);
+      }
+
+      if(!zflag) {
+        vbiasall[i + 2 * _nmax] = _v[i + 2 * _nmax];
+        _v[i + 2 * _nmax] = V_F(0.0);
+      }
+    }
+}
+
+__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_FLOAT* vbiasall)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      if(!xflag) {
+        _v[i] += vbiasall[i];
+      }
+
+      if(!yflag) {
+        _v[i + _nmax] += vbiasall[i + _nmax];
+      }
+
+      if(!zflag) {
+        _v[i + 2 * _nmax] += vbiasall[i + 2 * _nmax];
+      }
+    }
+}
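+
+// RemoveBiasAll and RestoreBiasAll form a pair: the first saves the velocity
+// components whose flag is unset into vbiasall and zeroes them, so that a
+// subsequent temperature evaluation only counts the selected degrees of
+// freedom; the second adds the saved components back.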
diff --git a/lib/cuda/crm_cuda_utils.cu b/lib/cuda/crm_cuda_utils.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6337d0d01545b357b8daf0b471f4768618013b36
--- /dev/null
+++ b/lib/cuda/crm_cuda_utils.cu
@@ -0,0 +1,919 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef CRM_CUDA_UTILS
+#define CRM_CUDA_UTILS
+
+//split n threads into a 2-dimensional grid of blocks; the result packs grid.x, grid.y and threads.x into the x, y, z fields of an int3
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
+{
+  int3 gridparams;
+  int sharedsize = 16000;
+
+  if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;
+
+  if((n < 60 * 32) || (threadsmax < 64))
+    gridparams.z = 32;
+  else if((n < 60 * 64) || (threadsmax < 128))
+    gridparams.z = 64;
+  else if((n < 60 * 128) || (threadsmax < 256))
+    gridparams.z = 128;
+  else if((n < 60 * 256) || (threadsmax < 512))
+    gridparams.z = 256;
+  else gridparams.z = 512;
+
+  if(p2) {
+    gridparams.z = 16;
+
+    while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
+  }
+
+
+  int blocks = (n + gridparams.z - 1) / gridparams.z;
+
+  if(blocks > 10000)
+    gridparams.x = gridparams.y = int(sqrt(blocks));
+  else {
+    gridparams.x = blocks;
+    gridparams.y = 1;
+  }
+
+  while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;
+
+  if(gridparams.x == 0) gridparams.x = 1;
+
+  return gridparams;
+}
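+
+// Example: getgrid(100000) with the default arguments picks 512 threads per
+// block and (100000 + 511) / 512 = 196 blocks, returned as grid = (196, 1);
+// only above 10000 blocks is the grid split into a roughly square (x, y)
+// layout, keeping each dimension well below the 65535-blocks-per-dimension
+// limit of the GPUs of that generation.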
+
+//return value: 1 if f<0; else: 0
+//take care when working with values such as "blockIdx.x - n" for f: the expression might be evaluated as an unsigned int
+static inline __device__ int negativCUDA(float f)
+{
+  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
+}
+
+//return value: -1 if f<0; else +1
+static inline __device__ float fsignCUDA(float f)
+{
+  return f < 0.0f ? -1.0f : 1.0f;
+}
+
+//functions to copy data between global and shared memory (in fact they can copy between any two memory regions on the device, as long as read and write access is permitted)
+//blockDim.y and blockDim.z are assumed to be 1
+static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
+{
+  int i, k;
+  k = n - blockDim.x;
+
+  for(i = 0; i < k; i += blockDim.x) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
+{
+  int i, k;
+  k = n - blockDim.x;
+
+  for(i = 0; i < k; i += blockDim.x) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
+{
+  int i, k;
+  k = n - blockDim.x;
+
+  for(i = 0; i < k; i += blockDim.x) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    glob[i + threadIdx.x] = shared[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
+{
+  int i, k;
+  k = n - blockDim.x;
+
+  for(i = 0; i < k; i += blockDim.x) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
+{
+  int i, k;
+  k = n - blockDim.x;
+
+  for(i = 0; i < k; i += blockDim.x) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
+{
+  int i;
+
+  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  if(threadIdx.x < n - i) {
+    shared[i + threadIdx.x] = glob[i + threadIdx.x];
+  }
+
+  __syncthreads();
+}
+
+//copy data between two memory areas on device, 3d BlockDims are allowed
+static __device__ inline void copyData(double* source, double* target, const int &n)
+{
+  int i;
+  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
+
+  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
+    target[i + offset] = source[i + offset];
+  }
+
+  if(offset < n - i) {
+    target[i + offset] = source[i + offset];
+  }
+
+  __syncthreads();
+}
+
+static __device__ inline void copyData(float* source, float* target, const int &n)
+{
+  int i;
+  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
+
+  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
+    target[i + offset] = source[i + offset];
+  }
+
+  if(offset < n - i) {
+    target[i + offset] = source[i + offset];
+  }
+
+  __syncthreads();
+}
+
+static __device__ inline void copyData(int* source, int* target, const int &n)
+{
+  int i;
+  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
+
+  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
+    target[i + offset] = source[i + offset];
+  }
+
+  if(offset < n - i) {
+    target[i + offset] = source[i + offset];
+  }
+
+  __syncthreads();
+}
+
+static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
+{
+  int i;
+  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
+
+  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
+    target[i + offset] = source[i + offset];
+  }
+
+  if(offset < n - i) {
+    target[i + offset] = source[i + offset];
+  }
+
+  __syncthreads();
+}
+
+//functions to sum the values of one block. P2 means blockDim.x MUST be a power of 2, otherwise the behaviour is undefined
+//afterwards data[0] holds the sum of data[0..blockDim.x-1]
+//reduceBlockP2 and reduceBlock assume blockDim.y = 1 and blockDim.z = 1
+static __device__ inline void reduceBlockP2(int* data)
+{
+  __syncthreads();
+
+  for(int i = 2; i <= blockDim.x; i *= 2) {
+    if(threadIdx.x < blockDim.x / i)
+      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlockP2(unsigned int* data)
+{
+  __syncthreads();
+
+  for(int i = 2; i <= blockDim.x; i *= 2) {
+    if(threadIdx.x < blockDim.x / i)
+      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlockP2(float* data)
+{
+  __syncthreads();
+
+  for(int i = 2; i <= blockDim.x; i *= 2) {
+    if(threadIdx.x < blockDim.x / i)
+      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlockP2(double* data)
+{
+  __syncthreads();
+
+  for(int i = 2; i <= blockDim.x; i *= 2) {
+    if(threadIdx.x < blockDim.x / i)
+      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlock(float* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlock(int* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlock(unsigned int* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduceBlock(double* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
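+
+// Worked example for a non-power-of-two block: with blockDim.x = 6, p2 = 4;
+// threads 0-1 first fold elements 4-5 onto 0-1, then the loop halves the
+// active range (stride 2, then 1), leaving the sum of all six values in
+// data[0].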
+
+static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
+{
+  int i;
+
+  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
+    data[i + threadIdx.x] = value;
+  }
+
+  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
+}
+
+static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
+{
+  int i;
+
+  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
+    data[i + threadIdx.x] = value;
+  }
+
+  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
+}
+
+static __device__ inline void reduce(float* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
+    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void reduce(double* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
+    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void minOfBlock(float* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void maxOfBlock(float* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void minOfBlock(double* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void maxOfBlock(double* data)
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
+
+    __syncthreads();
+  }
+}
+
+
+static __device__ inline void minOfData(double* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) < n - p2) {
+    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void maxOfData(double* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) < n - p2) {
+    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void minOfData(float* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) < n - p2) {
+    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+static __device__ inline void maxOfData(float* data, int n) // caution: not verified to work
+{
+  __syncthreads();
+  int p2 = 1;
+
+  while(p2 * 2 < n) p2 *= 2;
+
+  int j = 0;
+
+  while((threadIdx.x + blockDim.x * j) < n - p2) {
+    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
+    j++;
+  }
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    while((threadIdx.x + blockDim.x * j) < p2 / i) {
+      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
+      j++;
+    }
+
+    __syncthreads();
+  }
+}
+
+#if X_PRECISION == 2
+static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
+{
+  int2 v = tex1Dfetch(t, i);
+  return __hiloint2double(v.y, v.x);
+}
+
+static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
+{
+  int4 v = tex1Dfetch(t, 2 * i);
+  int4 u = tex1Dfetch(t, 2 * i + 1);
+  X_FLOAT4 w;
+
+  w.x = __hiloint2double(v.y, v.x);
+  w.y = __hiloint2double(v.w, v.z);
+  w.z = __hiloint2double(u.y, u.x);
+  w.w = __hiloint2double(u.w, u.z);
+  return w;
+}
+#endif
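+
+// Texture fetches cannot return doubles directly, so double-precision data is
+// bound as int2 (int4 for a double4, fetched in two halves) and the 32-bit
+// words are reassembled with __hiloint2double above.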
+
+inline void BindXTypeTexture(cuda_shared_data* sdata)
+{
+#ifdef CUDA_USE_TEXTURE
+  _x_type_tex.normalized = false;                      // do not use normalized texture coordinates
+  _x_type_tex.filterMode = cudaFilterModePoint;        // point sampling, no interpolation
+  _x_type_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+  const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);
+
+#if X_PRECISION == 1
+  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
+#else
+  cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
+#endif
+#endif
+}
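+
+// The binding captures the current dev_data pointer and nmax, so it must be
+// repeated whenever the per-atom arrays are reallocated.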
+
+static __device__ inline X_FLOAT4 fetchXType(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if X_PRECISION == 1
+  return tex1Dfetch(_x_type_tex, i);
+#else
+  return tex1Dfetch_double(_x_type_tex, i);
+#endif
+#else
+  return _x_type[i];
+#endif
+}
+
+#if V_PRECISION == 2
+static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
+{
+  int2 v = tex1Dfetch(t, i);
+  return __hiloint2double(v.y, v.x);
+}
+
+static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
+{
+  int4 v = tex1Dfetch(t, 2 * i);
+  int4 u = tex1Dfetch(t, 2 * i + 1);
+  V_FLOAT4 w;
+
+  w.x = __hiloint2double(v.y, v.x);
+  w.y = __hiloint2double(v.w, v.z);
+  w.z = __hiloint2double(u.y, u.x);
+  w.w = __hiloint2double(u.w, u.z);
+  return w;
+}
+#endif
+
+inline void BindVRadiusTexture(cuda_shared_data* sdata)
+{
+#ifdef CUDA_USE_TEXTURE
+  _v_radius_tex.normalized = false;                      // do not use normalized texture coordinates
+  _v_radius_tex.filterMode = cudaFilterModePoint;        // point sampling, no interpolation
+  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);
+
+#if V_PRECISION == 1
+  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
+#else
+  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
+#endif
+#endif
+}
+
+static __device__ inline V_FLOAT4 fetchVRadius(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if V_PRECISION == 1
+  return tex1Dfetch(_v_radius_tex, i);
+#else
+  return tex1Dfetch_double_v(_v_radius_tex, i);
+#endif
+#else
+  return _v_radius[i];
+#endif
+}
+
+inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
+{
+#ifdef CUDA_USE_TEXTURE
+  _omega_rmass_tex.normalized = false;                      // do not use normalized texture coordinates
+  _omega_rmass_tex.filterMode = cudaFilterModePoint;        // point sampling, no interpolation
+  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);
+
+#if V_PRECISION == 1
+  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
+#else
+  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
+#endif
+#endif
+}
+
+static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if V_PRECISION == 1
+  return tex1Dfetch(_omega_rmass_tex, i);
+#else
+  return tex1Dfetch_double_v(_omega_rmass_tex, i);
+#endif
+#else
+  return _omega_rmass[i];
+#endif
+}
+
+#if F_PRECISION == 2
+static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
+{
+  int2 v = tex1Dfetch(t, i);
+  return __hiloint2double(v.y, v.x);
+}
+
+static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
+{
+  int4 v = tex1Dfetch(t, 2 * i);
+  int4 u = tex1Dfetch(t, 2 * i + 1);
+  F_FLOAT4 w;
+
+  w.x = __hiloint2double(v.y, v.x);
+  w.y = __hiloint2double(v.w, v.z);
+  w.z = __hiloint2double(u.y, u.x);
+  w.w = __hiloint2double(u.w, u.z);
+  return w;
+}
+#endif
+
+inline void BindQTexture(cuda_shared_data* sdata)
+{
+#ifdef CUDA_USE_TEXTURE
+  _q_tex.normalized = false;                      // do not use normalized texture coordinates
+  _q_tex.filterMode = cudaFilterModePoint;        // point sampling, no interpolation
+  _q_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+  const textureReference* q_texture_ptr = &MY_AP(q_tex);
+
+#if F_PRECISION == 1
+  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
+  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+  cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
+  cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
+#endif
+#endif
+}
+
+static __device__ inline F_FLOAT fetchQ(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if F_PRECISION == 1
+  return tex1Dfetch(_q_tex, i);
+#else
+  return tex1Dfetch_double_f(_q_tex, i);
+#endif
+#else
+  return _q[i];
+#endif
+}
+
+#endif
+
+/*
+
+inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
+{
+	#ifdef CUDA_USE_TEXTURE
+		_coeff_tex.normalized = false;                      // access with normalized texture coordinates
+		_coeff_tex.filterMode = cudaFilterModePoint;        // Point mode, so no
+		_coeff_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+		const textureReference* coeff_texture_ptr;
+		cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
+
+		#if F_PRECISION == 1
+		cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
+		cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
+		#else
+		cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
+		cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
+		#endif
+	#endif
+}
+
+static __device__ inline X_FLOAT4 fetchXType(int i)
+{
+		#ifdef CUDA_USE_TEXTURE
+		  #if X_PRECISION == 1
+		     return tex1Dfetch(_x_type_tex,i);
+		  #else
+		     return tex1Dfetch_double(_x_type_tex,i);
+		  #endif
+		#else
+		  return _x_type[i];
+		#endif
+}
+*/
+#define SBBITS 30
+
+static inline __device__ int sbmask(int j)
+{
+  return j >> SBBITS & 3;
+}
+
+static inline __device__ void minimum_image(X_FLOAT4 &delta)
+{
+  if(_triclinic == 0) {
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
+    }
+
+    if(_periodicity[1]) {
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+    }
+
+    if(_periodicity[2]) {
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+    }
+
+  } else {
+    if(_periodicity[2]) {
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
+      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
+
+    }
+
+    if(_periodicity[1]) {
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
+
+    }
+
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
+    }
+  }
+}
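+
+// For triclinic boxes the wraps are applied in z, y, x order: wrapping in z
+// also shifts y and x by the tilt factors _h[3] (yz) and _h[4] (xz), and
+// wrapping in y shifts x by _h[5] (xy), so each later wrap already sees the
+// corrected components.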
+
+static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
+{
+  ci.x = x2.x - x1.x;
+  ci.y = x2.y - x1.y;
+  ci.z = x2.z - x1.z;
+  minimum_image(ci);
+  ci.x += x1.x;
+  ci.y += x1.y;
+  ci.z += x1.z;
+}
diff --git a/lib/cuda/cuda.cu b/lib/cuda/cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b0c7a917762da994322d6aeeab4e2827af8c3229
--- /dev/null
+++ b/lib/cuda/cuda.cu
@@ -0,0 +1,22 @@
+#include "cuda_precision.h"
+#include "cuda_shared.h"
+#include "cuda_cu.h"
+
+void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
+{
+  sdata->compile_settings.prec_glob = sizeof(CUDA_FLOAT) / 4;
+  sdata->compile_settings.prec_x = sizeof(X_FLOAT) / 4;
+  sdata->compile_settings.prec_v = sizeof(V_FLOAT) / 4;
+  sdata->compile_settings.prec_f = sizeof(F_FLOAT) / 4;
+  sdata->compile_settings.prec_pppm = sizeof(PPPM_FLOAT) / 4;
+  sdata->compile_settings.prec_fft = sizeof(FFT_FLOAT) / 4;
+
+#ifdef FFT_CUFFT
+  sdata->compile_settings.cufft = 1;
+#else
+  sdata->compile_settings.cufft = 0;
+#endif
+
+  sdata->compile_settings.arch = CUDA_ARCH;
+
+}
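+
+// The prec_* fields encode the compile-time precision as sizeof(type) / 4,
+// i.e. 1 for single and 2 for double precision, so the host side can check
+// that the library and the main code were built with matching settings.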
diff --git a/lib/cuda/cuda_common.h b/lib/cuda/cuda_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..a6806bcfd854065d62f87a98b74aa9673fd5b090
--- /dev/null
+++ b/lib/cuda/cuda_common.h
@@ -0,0 +1,344 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_COMMON_H_
+#define _CUDA_COMMON_H_
+
+//#include "cutil.h"
+#include "cuda_precision.h"
+#include "cuda_wrapper_cu.h"
+
+#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
+//this cannot be arbitrarily large, since constant space is limited
+//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples encountered had a high number (20+) of atom types
+//Christian
+#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
+#define CUDA_MAX_NSPECIAL 25
+
+// define some easy-to-use debug and emulation macros
+#ifdef _DEBUG
+#define MYDBG(a) a
+#else
+#define MYDBG(a)
+#endif
+
+#if __DEVICE_EMULATION__
+#define MYEMU(a) a
+#else
+#define MYEMU(a)
+#endif
+
+#define MYEMUDBG(a) MYEMU(MYDBG(a))
+
+// Add prefix (needed as a workaround: identical constant names in different files would otherwise clash)
+#define MY_ADD_PREFIX(prefix, var) prefix##_##var
+#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
+#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
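+
+// Example: in compute_temp_partial_cuda.cu, where MY_PREFIX is defined as
+// compute_temp_partial_cuda, MY_AP(buffer) expands to
+// compute_temp_partial_cuda_buffer. The two-level macro is required so that
+// MY_PREFIX itself is expanded before the token pasting takes place.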
+
+#define MY_VAR_TO_STR(var) #var
+#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
+//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
+//#define &MY_AP(var) &(MY_AP(var))
+#define CUDA_USE_TEXTURE
+#define CUDA_USE_FLOAT4
+
+//constants used by many classes
+
+//domain
+#define _boxhi       MY_AP(boxhi)
+#define _boxlo       MY_AP(boxlo)
+#define _subhi       MY_AP(subhi)
+#define _sublo       MY_AP(sublo)
+#define _box_size    MY_AP(box_size)
+#define _prd         MY_AP(prd)
+#define _periodicity MY_AP(periodicity)
+#define _triclinic	 MY_AP(triclinic)
+#define _boxhi_lamda MY_AP(boxhi_lamda)
+#define _boxlo_lamda MY_AP(boxlo_lamda)
+#define _prd_lamda   MY_AP(prd_lamda)
+#define _h		 	 MY_AP(h)
+#define _h_inv	 	 MY_AP(h_inv)
+#define _h_rate		 MY_AP(h_rate)
+__device__ __constant__ X_FLOAT _boxhi[3];
+__device__ __constant__ X_FLOAT _boxlo[3];
+__device__ __constant__ X_FLOAT _subhi[3];
+__device__ __constant__ X_FLOAT _sublo[3];
+__device__ __constant__ X_FLOAT _box_size[3];
+__device__ __constant__ X_FLOAT _prd[3];
+__device__ __constant__ int _periodicity[3];
+__device__ __constant__ int _triclinic;
+__device__ __constant__ X_FLOAT _boxhi_lamda[3];
+__device__ __constant__ X_FLOAT _boxlo_lamda[3];
+__device__ __constant__ X_FLOAT _prd_lamda[3];
+__device__ __constant__ X_FLOAT _h[6];
+__device__ __constant__ X_FLOAT _h_inv[6];
+__device__ __constant__ V_FLOAT _h_rate[6];
+
+
+//atom properties
+#define _x           MY_AP(x)
+#define _v           MY_AP(v)
+#define _f           MY_AP(f)
+#define _tag         MY_AP(tag)
+#define _type        MY_AP(type)
+#define _mask        MY_AP(mask)
+#define _image       MY_AP(image)
+#define _q           MY_AP(q)
+#define _mass        MY_AP(mass)
+#define _rmass       MY_AP(rmass)
+#define _rmass_flag  MY_AP(rmass_flag)
+#define _eatom       MY_AP(eatom)
+#define _vatom       MY_AP(vatom)
+#define _x_type      MY_AP(x_type)
+#define _radius      MY_AP(radius)
+#define _density     MY_AP(density)
+#define _omega       MY_AP(omega)
+#define _torque      MY_AP(torque)
+#define _special     MY_AP(special)
+#define _maxspecial  MY_AP(maxspecial)
+#define _nspecial    MY_AP(nspecial)
+#define _special_flag  MY_AP(special_flag)
+#define _molecule    MY_AP(molecule)
+#define _v_radius    MY_AP(v_radius)
+#define _omega_rmass MY_AP(omega_rmass)
+#define _freeze_group_bit MY_AP(freeze_group_bit)
+#define _map_array   MY_AP(map_array)
+__device__ __constant__ X_FLOAT* _x;  //holds pointer to positions
+__device__ __constant__ V_FLOAT* _v;
+__device__ __constant__ F_FLOAT* _f;
+__device__ __constant__ int* _tag;
+__device__ __constant__ int* _type;
+__device__ __constant__ int* _mask;
+__device__ __constant__ int* _image;
+__device__ __constant__ V_FLOAT* _mass;
+__device__ __constant__ F_FLOAT* _q;
+__device__ __constant__ V_FLOAT* _rmass;
+__device__ __constant__ int _rmass_flag;
+__device__ __constant__ ENERGY_FLOAT* _eatom;
+__device__ __constant__ ENERGY_FLOAT* _vatom;
+__device__ __constant__ X_FLOAT4* _x_type;  //packed positions and type: (x, y, z, type)
+__device__ __constant__ X_FLOAT* _radius;
+__device__ __constant__ F_FLOAT* _density;
+__device__ __constant__ V_FLOAT* _omega;
+__device__ __constant__ F_FLOAT* _torque;
+__device__ __constant__ int* _special;
+__device__ __constant__ int _maxspecial;
+__device__ __constant__ int* _nspecial;
+__device__ __constant__ int _special_flag[4];
+__device__ __constant__ int* _molecule;
+__device__ __constant__ V_FLOAT4* _v_radius;  //packed velocities and radius: (vx, vy, vz, radius)
+__device__ __constant__ V_FLOAT4* _omega_rmass;  //packed angular velocities and per-atom mass: (omegax, omegay, omegaz, rmass)
+__device__ __constant__ int _freeze_group_bit;
+__device__ __constant__ int* _map_array;
+
+#ifdef CUDA_USE_TEXTURE
+
+#define _x_tex         MY_AP(x_tex)
+#if X_PRECISION == 1
+texture<float> _x_tex;
+#else
+texture<int2, 1> _x_tex;
+#endif
+
+#define _type_tex         MY_AP(type_tex)
+texture<int> _type_tex;
+
+#define _x_type_tex         MY_AP(x_type_tex)
+#if X_PRECISION == 1
+texture<float4, 1> _x_type_tex;
+#else
+texture<int4, 1> _x_type_tex;
+#endif
+
+#define _v_radius_tex         MY_AP(v_radius_tex)
+#if V_PRECISION == 1
+texture<float4, 1> _v_radius_tex;
+#else
+texture<int4, 1> _v_radius_tex;
+#endif
+
+#define _omega_rmass_tex         MY_AP(omega_rmass_tex)
+#if V_PRECISION == 1
+texture<float4, 1> _omega_rmass_tex;
+#else
+texture<int4, 1> _omega_rmass_tex;
+#endif
+
+#define _q_tex         MY_AP(q_tex)
+#if F_PRECISION == 1
+texture<float> _q_tex;
+#else
+texture<int2, 1> _q_tex;
+#endif
+
+#endif
+
+//neighbor
+#ifdef IncludeCommonNeigh
+#define _inum        	MY_AP(inum)
+#define _inum_border    MY_AP(inum_border)
+#define _ilist       	MY_AP(ilist)
+#define _ilist_border 	MY_AP(ilist_border)
+#define _numneigh    	MY_AP(numneigh)
+#define _numneigh_border 	MY_AP(numneigh_border)
+#define _numneigh_inner		MY_AP(numneigh_inner)
+#define _firstneigh  	MY_AP(firstneigh)
+#define _neighbors 	MY_AP(neighbors)
+#define _neighbors_border 	MY_AP(neighbors_border)
+#define _neighbors_inner  	MY_AP(neighbors_inner)
+#define _reneigh_flag 	MY_AP(reneigh_flag)
+#define _triggerneighsq MY_AP(triggerneighsq)
+#define _xhold       	MY_AP(xhold)
+#define _maxhold     	MY_AP(maxhold)
+#define _dist_check     MY_AP(dist_check)
+#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
+#define _maxneighbors   MY_AP(maxneighbors)
+#define _overlap_comm   MY_AP(overlap_comm)
+__device__ __constant__ int _inum;
+__device__ __constant__ int* _inum_border;
+__device__ __constant__ int* _ilist;
+__device__ __constant__ int* _ilist_border;
+__device__ __constant__ int* _numneigh;
+__device__ __constant__ int* _numneigh_border;
+__device__ __constant__ int* _numneigh_inner;
+__device__ __constant__ int** _firstneigh;
+__device__ __constant__ int* _neighbors;
+__device__ __constant__ int* _neighbors_border;
+__device__ __constant__ int* _neighbors_inner;
+__device__ __constant__ int* _reneigh_flag;
+__device__ __constant__ X_FLOAT _triggerneighsq;
+__device__ __constant__ X_FLOAT* _xhold;  //positions at the last neighbor-list build (for the distance check)
+__device__ __constant__ int _maxhold;
+__device__ __constant__ int _dist_check;
+__device__ __constant__ int _neighbor_maxlocal;
+__device__ __constant__ int _maxneighbors;
+__device__ __constant__ int _overlap_comm;
+#endif
+
+//system properties
+#define _nall        MY_AP(nall)
+#define _nghost      MY_AP(nghost)
+#define _nlocal      MY_AP(nlocal)
+#define _nmax        MY_AP(nmax)
+#define _cuda_ntypes MY_AP(cuda_ntypes)
+#define _dtf         MY_AP(dtf)
+#define _dtv         MY_AP(dtv)
+#define _factor      MY_AP(factor)
+#define _virial      MY_AP(virial)
+#define _eng_vdwl    MY_AP(eng_vdwl)
+#define _eng_coul    MY_AP(eng_coul)
+#define _molecular   MY_AP(molecular)
+__device__ __constant__ unsigned _nall;
+__device__ __constant__ unsigned _nghost;
+__device__ __constant__ unsigned _nlocal;
+__device__ __constant__ unsigned _nmax;
+__device__ __constant__ unsigned _cuda_ntypes;
+__device__ __constant__ V_FLOAT _dtf;
+__device__ __constant__ X_FLOAT _dtv;
+__device__ __constant__ V_FLOAT _factor;
+__device__ __constant__ ENERGY_FLOAT* _virial;
+__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
+__device__ __constant__ ENERGY_FLOAT* _eng_coul;
+__device__ __constant__ int _molecular;
+
+//other general constants
+#define _buffer      MY_AP(buffer)
+#define _flag		 MY_AP(flag)
+#define _debugdata   MY_AP(debugdata)
+__device__ __constant__ void* _buffer;
+__device__ __constant__ int* _flag;
+__device__ __constant__ int* _debugdata;
+
+// pointers to data fields on the GPU are held in constant space
+// -> reduces register usage and the number of parameters for kernel calls
+// they appear as file-scope variables in the .cu files
+
+
+
+
+// may be used to print a cudaError_t
+#define MY_OUTPUT_RESULT(result) \
+  switch(result) \
+  { \
+  case cudaSuccess: printf(" => cudaSuccess\n"); break; \
+  case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
+  case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
+  case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
+  case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
+  default: printf(" => unknown\n"); break; \
+  }
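+
+// usage sketch (hypothetical call site):
+//   cudaError_t res = cudaMemcpyToSymbol(MY_AP(nlocal), &nlocal, sizeof(int));
+//   printf("memcpy nlocal"); MY_OUTPUT_RESULT(res);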
+
+#ifdef _DEBUG
+#  define CUT_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+              errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+      exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+    err = cudaThreadSynchronize();                                           \
+    if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+              errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+      exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+  }
+#else
+#  define CUT_CHECK_ERROR(errorMessage) {                                    \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+              errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+      exit(EXIT_FAILURE);                                                  \
+    }                                                                        \
+  }
+#endif
+
+#  define CUDA_SAFE_CALL_NO_SYNC( call) {                                    \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+              __FILE__, __LINE__, cudaGetErrorString( err) );              \
+      exit(EXIT_FAILURE);                                                  \
+    } }
+
+#  define CUDA_SAFE_CALL( call)     CUDA_SAFE_CALL_NO_SYNC(call);
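+
+// usage sketch: wrap any cudaError_t-returning runtime call, and check the
+// most recent kernel launch afterwards (MyKernel is a hypothetical example):
+//   CUDA_SAFE_CALL(cudaMemset(dev_ptr, 0, nbytes));
+//   MyKernel <<< grid, threads >>> ();
+//   CUT_CHECK_ERROR("MyKernel failed");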
+
+#define X_MASK 1
+#define V_MASK 2
+#define F_MASK 4
+#define TAG_MASK 8
+#define TYPE_MASK 16
+#define MASK_MASK 32
+#define IMAGE_MASK 64
+#define Q_MASK 128
+#define MOLECULE_MASK 256
+#define RMASS_MASK 512
+#define RADIUS_MASK 1024
+#define DENSITY_MASK 2048
+#define OMEGA_MASK 4096
+#define TORQUE_MASK 8192
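+
+// these are bit flags, so a request for several per-atom arrays is a simple
+// OR, e.g. (illustrative):
+//   int data_mask = X_MASK | V_MASK | OMEGA_MASK;
+//   if(data_mask & V_MASK) { /* velocities are requested */ }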
+
+
+
+#endif // #ifdef _CUDA_COMMON_H_
diff --git a/lib/cuda/cuda_cu.h b/lib/cuda/cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..48498b8d0f353cc5cf7f99a4cdeb0403483322c3
--- /dev/null
+++ b/lib/cuda/cuda_cu.h
@@ -0,0 +1 @@
+extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);
diff --git a/lib/cuda/cuda_data.cu b/lib/cuda/cuda_data.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6e6669ea1196024408d4e5ecd5d3db770da19f05
--- /dev/null
+++ b/lib/cuda/cuda_data.cu
@@ -0,0 +1,220 @@
+enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx are not implemented since they have not been needed yet
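+
+// The two-letter modes name (host layout, device layout): xx, xy and xyz
+// copy element-wise, while yx and xzy transpose during the copy. For yx the
+// per-thread index math in cuda_data_kernel.cu is equivalent to this
+// host-side sketch for an nx-by-ny source array (illustrative only):
+//
+//   for(unsigned i = 0; i < nx * ny; i++) {
+//     unsigned j = i / ny, k = i % ny;     // source row j, column k
+//     dev[k * nx + j] = host[j * ny + k];  // store transposed
+//   }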
+
+#include "cuda_data_cu.h"
+#include "cuda_wrapper_cu.h"
+#include "cuda_data_kernel.cu"
+#include <cstdio>
+
+void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
+{
+  int size = n[0];
+
+  if(n[1] > 0) size *= n[1];
+
+  if(n[2] > 0) size *= n[2];
+
+  dim3 threads;
+  threads.x = 1;
+  threads.y = 1;
+  threads.z = 1;
+  dim3 grid;
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+
+  if(size <= 128 * 30)
+    threads.x = 32;
+  else if(size <= 256 * 30)
+    threads.x = 64;
+  else if(size <= 512 * 30)
+    threads.x = 128;
+  else
+    threads.x = 256;
+
+  grid.x = ((size - 1) + threads.x) / threads.x;
+
+  if(grid.x > 32000)
+    grid.x = 32000;
+
+  while(grid.x * grid.y * threads.x < size) grid.y++;
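+
+  // At this point grid.x * grid.y * threads.x >= size, so every element is
+  // covered; the kernels recover a flat index as
+  //   i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x
+  // and threads with i >= size return immediately. The same layout is used
+  // by all CudaData_Upload_* functions in this file.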
+
+  float debugdata[size];  // debug-only host buffer (one float per element)
+  //int* cu_debug=(int*) CudaWrapper_AllocCudaData(size*sizeof(FLOAT));
+  size *= sizeof(double);
+  printf("size: %i (%i %i %i) (%i %i %i) %p\n", size, grid.x, grid.y, threads.x, n[0], n[1], n[2], buffer);
+  CudaWrapper_UploadCudaData(host_data, buffer, size);
+  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
+  cudaThreadSynchronize();
+  // debug check: download the converted floats (half the byte count of the
+  // double source) and print the summed squared deviation from the host data
+  CudaWrapper_DownloadCudaData(debugdata, dev_data, size / 2);
+  double sum = 0;
+  printf("debugdata: ");
+
+  for(int i = 0; i < size / sizeof(double); i++) sum += (debugdata[i] - ((double*) host_data)[i]) * (debugdata[i] - ((double*) host_data)[i]);
+
+  printf("%lf \n", sum);
+
+}
+
+void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
+{
+  int size = n[0];
+
+  if(n[1] > 0) size *= n[1];
+
+  if(n[2] > 0) size *= n[2];
+
+  dim3 threads;
+  threads.x = 1;
+  threads.y = 1;
+  threads.z = 1;
+  dim3 grid;
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+
+  if(size <= 128 * 30)
+    threads.x = 32;
+  else if(size <= 256 * 30)
+    threads.x = 64;
+  else if(size <= 512 * 30)
+    threads.x = 128;
+  else
+    threads.x = 256;
+
+  grid.x = ((size - 1) + threads.x) / threads.x;
+
+  if(grid.x > 32000)
+    grid.x = 32000;
+
+  while(grid.x * grid.y * threads.x < size) grid.y++;
+
+  size *= sizeof(double);
+
+  CudaWrapper_UploadCudaData(host_data, buffer, size);
+  CudaData_Upload_Kernel_DoubleDouble <<< grid, threads>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
+  cudaThreadSynchronize();
+}
+
+void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
+{
+  int size = n[0];
+
+  if(n[1] > 0) size *= n[1];
+
+  if(n[2] > 0) size *= n[2];
+
+  dim3 threads;
+  threads.x = 1;
+  threads.y = 1;
+  threads.z = 1;
+  dim3 grid;
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+
+  if(size <= 128 * 30)
+    threads.x = 32;
+  else if(size <= 256 * 30)
+    threads.x = 64;
+  else if(size <= 512 * 30)
+    threads.x = 128;
+  else
+    threads.x = 256;
+
+  grid.x = ((size - 1) + threads.x) / threads.x;
+
+  if(grid.x > 32000)
+    grid.x = 32000;
+
+  while(grid.x * grid.y * threads.x < size) grid.y++;
+
+  size *= sizeof(float);
+
+  CudaWrapper_UploadCudaData(host_data, buffer, size);
+  CudaData_Upload_Kernel_FloatDouble <<< grid, threads>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
+  cudaThreadSynchronize();
+}
+
+void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
+{
+  int size = n[0];
+
+  if(n[1] > 0) size *= n[1];
+
+  if(n[2] > 0) size *= n[2];
+
+  dim3 threads;
+  threads.x = 1;
+  threads.y = 1;
+  threads.z = 1;
+  dim3 grid;
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+
+  if(size <= 128 * 30)
+    threads.x = 32;
+  else if(size <= 256 * 30)
+    threads.x = 64;
+  else if(size <= 512 * 30)
+    threads.x = 128;
+  else
+    threads.x = 256;
+
+  grid.x = ((size - 1) + threads.x) / threads.x;
+
+  if(grid.x > 32000)
+    grid.x = 32000;
+
+  while(grid.x * grid.y * threads.x < size) grid.y++;
+
+  size *= sizeof(float);
+
+  CudaWrapper_UploadCudaData(host_data, buffer, size);
+  CudaData_Upload_Kernel_FloatFloat <<< grid, threads>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
+  cudaThreadSynchronize();
+}
+
+void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
+{
+  int size = n[0];
+
+  if(n[1] > 0) size *= n[1];
+
+  if(n[2] > 0) size *= n[2];
+
+  dim3 threads;
+  threads.x = 1;
+  threads.y = 1;
+  threads.z = 1;
+  dim3 grid;
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+
+  if(size <= 128 * 30)
+    threads.x = 32;
+  else if(size <= 256 * 30)
+    threads.x = 64;
+  else if(size <= 512 * 30)
+    threads.x = 128;
+  else
+    threads.x = 256;
+
+  grid.x = ((size - 1) + threads.x) / threads.x;
+
+  if(grid.x > 32000)
+    grid.x = 32000;
+
+  while(grid.x * grid.y * threads.x < size) grid.y++;
+
+  size *= sizeof(int);
+
+  CudaWrapper_UploadCudaData(host_data, buffer, size);
+  CudaData_Upload_Kernel_IntInt <<< grid, threads>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
+  cudaThreadSynchronize();
+}
+
+void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
+{
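+  // intentionally empty: a transposing download counterpart to the upload
+  // kernels above has not been needed yet; raw downloads go through
+  // CudaWrapper_DownloadCudaData directly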
+}
diff --git a/lib/cuda/cuda_data_cu.h b/lib/cuda/cuda_data_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..90dbd141b65682fb682855088020d0d3e650f33b
--- /dev/null
+++ b/lib/cuda/cuda_data_cu.h
@@ -0,0 +1,13 @@
+#ifndef CUDA_DATA_CU_H_
+#define CUDA_DATA_CU_H_
+
+extern "C" void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
+extern "C" void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
+extern "C" void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
+extern "C" void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
+extern "C" void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
+
+extern "C" void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer);
+
+
+#endif /*CUDA_DATA_CU_H_*/
diff --git a/lib/cuda/cuda_data_kernel.cu b/lib/cuda/cuda_data_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..41eea01564fa3667937e431e8e3bf7bb932fa1a6
--- /dev/null
+++ b/lib/cuda/cuda_data_kernel.cu
@@ -0,0 +1,195 @@
+__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
+    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
+{
+  if(mode == x) mode = xx;
+
+  unsigned length = nx;
+
+  if(ny > 0) length *= ny;
+
+  if(nz > 0) length *= nz;
+
+  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;
+
+
+  if(i >= length) return;
+
+  switch(mode) {
+    case xx: {
+      dev_data[i] = buffer[i];
+      break;
+    }
+
+    case xy: {
+      dev_data[i] = buffer[i];
+      break;
+    }
+
+    case yx: {
+      j = i / ny;
+      k = i % ny;
+      dev_data[k * nx + j] = buffer[j * ny + k];
+      break;
+    }
+
+    case xyz: {
+      dev_data[i] = buffer[i];
+      break;
+    }
+
+    case xzy: {
+      j = i / (ny * nz);
+      k = (i % (ny * nz)) / nz;
+      l = i % nz;
+      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
+      break;
+    }
+  }
+}
+
+__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
+    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
+{
+  if(mode == x) mode = xx;
+
+  unsigned length = nx;
+
+  if(ny > 0) length *= ny;
+
+  if(nz > 0) length *= nz;
+
+  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;
+
+  if(i >= length) return;
+
+  switch(mode) {
+    case xx:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xy:
+      dev_data[i] = buffer[i];
+      break;
+
+    case yx:
+      j = i / ny;
+      k = i % ny;
+      dev_data[k * nx + j] = buffer[j * ny + k];
+      break;
+
+    case xyz:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xzy:
+      j = i / (ny * nz);
+      k = (i % (ny * nz)) / nz;
+      l = i % nz;
+      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
+      break;
+  }
+}
+
+__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
+    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
+{
+  if(mode == x) mode = xx;
+
+  unsigned length = nx;
+
+  if(ny > 0) length *= ny;
+
+  if(nz > 0) length *= nz;
+
+  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;
+
+  if(i >= length) return;
+
+  switch(mode) {
+    case xx:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xy:
+      dev_data[i] = buffer[i];
+      break;
+
+    case yx:
+      j = i / ny;
+      k = i % ny;
+      dev_data[k * nx + j] = buffer[j * ny + k];
+      break;
+
+    case xyz:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xzy:
+      j = i / (ny * nz);
+      k = (i % (ny * nz)) / nz;
+      l = i % nz;
+      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
+      break;
+  }
+}
+
+__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
+    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
+{
+  if(mode == x) mode = xx;
+
+  unsigned length = nx;
+
+  if(ny > 0) length *= ny;
+
+  if(nz > 0) length *= nz;
+
+  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;
+
+  if(i >= length) return;
+
+  switch(mode) {
+    case xx:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xy:
+      dev_data[i] = buffer[i];
+      break;
+
+    case yx:
+      j = i / ny;
+      k = i % ny;
+      dev_data[k * nx + j] = buffer[j * ny + k];
+      break;
+
+    case xyz:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xzy:
+      j = i / (ny * nz);
+      k = (i % (ny * nz)) / nz;
+      l = i % nz;
+      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
+      break;
+  }
+}
+
+__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
+    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
+{
+  if(mode == x) mode = xx;
+
+  unsigned length = nx;
+
+  if(ny > 0) length *= ny;
+
+  if(nz > 0) length *= nz;
+
+  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;
+
+  if(i >= length) return;
+
+  switch(mode) {
+    case xx:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xy:
+      dev_data[i] = buffer[i];
+      break;
+
+    case yx:
+      j = i / ny;
+      k = i % ny;
+      dev_data[k * nx + j] = buffer[j * ny + k];
+      break;
+
+    case xyz:
+      dev_data[i] = buffer[i];
+      break;
+
+    case xzy:
+      j = i / (ny * nz);
+      k = (i % (ny * nz)) / nz;
+      l = i % nz;
+      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
+      break;
+  }
+}
diff --git a/lib/cuda/cuda_kernel.cu b/lib/cuda/cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/cuda/cuda_pair.cu b/lib/cuda/cuda_pair.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9f9900a2d8da708b043bbb0c206094188d17f5c6
--- /dev/null
+++ b/lib/cuda/cuda_pair.cu
@@ -0,0 +1,1015 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+enum PAIR_FORCES {PAIR_NONE, PAIR_BORN, PAIR_BUCK, PAIR_CG_CMM, PAIR_LJ_CHARMM, PAIR_LJ_CLASS2, PAIR_LJ_CUT, PAIR_LJ_EXPAND, PAIR_LJ_GROMACS, PAIR_LJ_SMOOTH, PAIR_LJ96_CUT, PAIR_MORSE, PAIR_MORSE_R6};
+enum COUL_FORCES {COUL_NONE, COUL_CHARMM, COUL_CHARMM_IMPLICIT, COUL_CUT, COUL_LONG, COUL_DEBYE, COUL_GROMACS, COUL_SPECIAL};
+#define DATA_NONE 0
+#define DATA_V 1
+#define DATA_TAG 2
+#define DATA_RMASS 4
+#define DATA_MASS 8
+#define DATA_TORQUE 16
+#define DATA_OMEGA 32
+#define DATA_RADIUS 64
+#define DATA_DENSITY 128
+#define DATA_MASK 256
+#define DATA_V_RADIUS 512
+#define DATA_OMEGA_RMASS 1024
+
+#define NEIGHMASK 0x3FFFFFFF
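+
+// NEIGHMASK keeps the low 30 bits of a neighbor-list entry; the top two bits
+// carry the special-bond status of the pair (standard LAMMPS encoding).
+// Typical use when walking a neighbor list (sketch; SBBITS = 30 assumed):
+//   int j = neighbors[jj];
+//   int which = j >> SBBITS;  // special-bond flag
+//   j &= NEIGHMASK;           // plain neighbor index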
+
+#define MY_PREFIX cuda_pair
+#define IncludeCommonNeigh
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "cuda_wrapper_cu.h"
+#include "crm_cuda_utils.cu"
+
+//constants used by multiple forces
+
+//general
+#define _cutsq MY_AP(cutsq)
+#define _offset MY_AP(offset)
+#define _special_lj MY_AP(special_lj)
+#define _special_coul MY_AP(special_coul)
+#define _cutsq_global MY_AP(cutsq_global)
+#define _collect_forces_later MY_AP(collect_forces_later)
+
+__device__ __constant__ X_FLOAT _cutsq[CUDA_MAX_TYPES2];
+__device__ __constant__ ENERGY_FLOAT _offset[CUDA_MAX_TYPES2];
+__device__ __constant__ F_FLOAT _special_lj[4];
+__device__ __constant__ F_FLOAT _special_coul[4];
+__device__ __constant__ X_FLOAT _cutsq_global;
+__device__ __constant__ int _collect_forces_later;
+
+__device__ __constant__ F_FLOAT MY_AP(coeff1)[CUDA_MAX_TYPES2]; //pair force coefficients in case ntypes < CUDA_MAX_TYPES (coeffs fit into constant space)
+__device__ __constant__ F_FLOAT MY_AP(coeff2)[CUDA_MAX_TYPES2];
+__device__ __constant__ F_FLOAT MY_AP(coeff3)[CUDA_MAX_TYPES2];
+__device__ __constant__ F_FLOAT MY_AP(coeff4)[CUDA_MAX_TYPES2];
+__device__ __constant__ F_FLOAT MY_AP(coeff5)[CUDA_MAX_TYPES2];
+
+
+__device__ __constant__ F_FLOAT* MY_AP(coeff1_gm); //pair force coefficients in case ntypes > CUDA_MAX_TYPES (coeffs do not fit into constant space)
+__device__ __constant__ F_FLOAT* MY_AP(coeff2_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff3_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff4_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff5_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff6_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff7_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff8_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff9_gm);
+__device__ __constant__ F_FLOAT* MY_AP(coeff10_gm);
+
+#define _coeff1_gm_tex         MY_AP(coeff1_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff1_gm_tex;
+#else
+texture<int2, 1> _coeff1_gm_tex;
+#endif
+
+#define _coeff2_gm_tex         MY_AP(coeff2_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff2_gm_tex;
+#else
+texture<int2, 1> _coeff2_gm_tex;
+#endif
+
+#define _coeff3_gm_tex         MY_AP(coeff3_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff3_gm_tex;
+#else
+texture<int2, 1> _coeff3_gm_tex;
+#endif
+
+#define _coeff4_gm_tex         MY_AP(coeff4_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff4_gm_tex;
+#else
+texture<int2, 1> _coeff4_gm_tex;
+#endif
+
+#define _coeff5_gm_tex         MY_AP(coeff5_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff5_gm_tex;
+#else
+texture<int2, 1> _coeff5_gm_tex;
+#endif
+
+#define _coeff6_gm_tex         MY_AP(coeff6_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff6_gm_tex;
+#else
+texture<int2, 1> _coeff6_gm_tex;
+#endif
+
+#define _coeff7_gm_tex         MY_AP(coeff7_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff7_gm_tex;
+#else
+texture<int2, 1> _coeff7_gm_tex;
+#endif
+
+#define _coeff8_gm_tex         MY_AP(coeff8_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff8_gm_tex;
+#else
+texture<int2, 1> _coeff8_gm_tex;
+#endif
+
+#define _coeff9_gm_tex         MY_AP(coeff9_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff9_gm_tex;
+#else
+texture<int2, 1> _coeff9_gm_tex;
+#endif
+
+#define _coeff10_gm_tex         MY_AP(coeff10_gm_tex)
+#if F_PRECISION == 1
+texture<float> _coeff10_gm_tex;
+#else
+texture<int2, 1> _coeff10_gm_tex;
+#endif
+
+//if more than 5 coefficients are needed for a pair potential add them here
+
+
+//coulomb
+#define _cut_coulsq MY_AP(cut_coulsq)
+#define _cut_coulsq_global MY_AP(cut_coulsq_global)
+#define _g_ewald MY_AP(g_ewald)
+#define _qqrd2e MY_AP(qqrd2e)
+#define _kappa MY_AP(kappa)
+__device__ __constant__ X_FLOAT _cut_coulsq[CUDA_MAX_TYPES2];
+__device__ __constant__ X_FLOAT _cut_coulsq_global;
+__device__ __constant__ F_FLOAT _g_ewald;
+__device__ __constant__ F_FLOAT _qqrd2e;
+__device__ __constant__ F_FLOAT _kappa;
+
+//inner cutoff
+#define _cut_innersq MY_AP(cut_innersq)
+#define _cut_innersq_global MY_AP(cut_innersq_global)
+__device__ __constant__ X_FLOAT _cut_innersq[CUDA_MAX_TYPES2];
+__device__ __constant__ X_FLOAT _cut_innersq_global;
+
+
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom);
+
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom);
+
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase);
+
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase);
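+
+// naming: TpA = one thread per atom, BpA = one block per atom (see
+// pair.use_block_per_atom below); the _opt variants take a comm_phase
+// argument so the force computation can be overlapped with communication
+// (inner/border atom split, cf. overlap_comm)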
+
+#include <stdio.h>
+#include <time.h>  //timespec/clock_gettime used for pair kernel timing below
+#include "cuda_pair_cu.h"
+#include "cuda_pair_virial_kernel_nc.cu"
+
+//Functions which are shared by pair styles
+
+//Update buffer size
+void Cuda_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles: before updateBuffer failed");
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateBuffer_AllStyles failed");
+}
+
+void Cuda_Pair_UpdateNeighbor_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  //Neighbor
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal)  , & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(firstneigh)     , & sneighlist->firstneigh.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ilist)          , & sneighlist->ilist     .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum)           , & sneighlist->inum               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh)       , & sneighlist->numneigh  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors)      , & sneighlist->neighbors  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors)       , & sneighlist->maxneighbors     , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(overlap_comm)       , & sdata->overlap_comm, sizeof(int));
+
+  if(sdata->overlap_comm) {
+    cudaMemcpyToSymbol(MY_AP(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_inner)  , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(ilist_border)     , & sneighlist->ilist_border    .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(inum_border)      , & sneighlist->inum_border     .dev_data, sizeof(int*));
+  }
+
+}
+//Update constants which are generally needed by all pair styles after nmax has changed
+void Cuda_Pair_UpdateNmax_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: Begin");
+
+  //System
+  cudaMemcpyToSymbol(MY_AP(nlocal)    			, & sdata->atom.nlocal             , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)      			, & sdata->atom.nall               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)      			, & sdata->atom.nmax               , sizeof(int));
+
+  //Atom
+  cudaMemcpyToSymbol(MY_AP(x)         			, & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type)         	, & sdata->atom.x_type    .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(f)         			, & sdata->atom.f         .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)      			, & sdata->atom.type      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(q)         			, & sdata->atom.q         .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(tag)      			, & sdata->atom.tag       .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(eatom)     			, & sdata->atom.eatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom)     			, & sdata->atom.vatom     .dev_data, sizeof(ENERGY_FLOAT*));
+
+
+  //Other
+  cudaMemcpyToSymbol(MY_AP(debugdata)      , & sdata->debugdata      , sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair_UpdateNmax_AllStyles: End");
+}
+
+//Initialisation of GPU Constants which rarely change
+void Cuda_Pair_Init_AllStyles(cuda_shared_data* sdata, int ncoeff, bool need_q = false, bool use_global_params = false, bool need_innercut = false, bool need_cut = true)
+{
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
+  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
+  unsigned nx = sizeof(X_FLOAT) * cuda_ntypes2;
+
+  //check if enough constant memory is available
+  if((cuda_ntypes2 > CUDA_MAX_TYPES2) && !use_global_params) {
+    printf("# CUDA: Cuda_Pair_Init: you need %u types. This is more than %u "
+           "(assumed at compile time). Re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
+           "or adjust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
+    exit(1);
+  }
+
+  //type conversion of cutoffs and parameters
+  if(need_cut) {
+    X_FLOAT cutsq[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
+      }
+    }
+
+    int cutsqdiffer = 0;
+    X_FLOAT cutsq_global;
+    cutsq_global = (X_FLOAT)(sdata->pair.cut_global * sdata->pair.cut_global);
+
+    if(sdata->pair.cut) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut[i][j] > 1e-6) {
+            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
+            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut[i][j] * sdata->pair.cut[i][j]);
+          }
+
+          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
+
+          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    if(sdata->pair.cutsq) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut[i][j] > 1e-6) {
+            cutsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
+            cutsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cutsq[i][j]);
+          }
+
+          if(i == 1 && j == 1) cutsq_global = cutsq[i * cuda_ntypes + j];
+
+          if((cutsq_global - cutsq[i * cuda_ntypes + j]) * (cutsq_global - cutsq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    //printf("CUTSQGLOB: %i %e\n",cutsqdiffer,cutsq_global);
+    if(cutsqdiffer) {
+
+      cutsq_global = -1.0;
+      cudaMemcpyToSymbol(MY_AP(cutsq)      	, cutsq                    		, nx);
+    }
+
+    cudaMemcpyToSymbol(MY_AP(cutsq_global)	, &cutsq_global  				, sizeof(X_FLOAT));
+  }
+
+  if(need_innercut) {
+    X_FLOAT cut_innersq[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
+      }
+    }
+
+    int cutsqdiffer = 0;
+    X_FLOAT cut_innersq_global;
+    cut_innersq_global = (X_FLOAT)(sdata->pair.cut_inner_global * sdata->pair.cut_inner_global);
+
+    if(sdata->pair.cut_inner) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut_inner[i][j] > 1e-6) {
+            cut_innersq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
+            cut_innersq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_inner[i][j] * sdata->pair.cut_inner[i][j]);
+          }
+
+          if(i == 1 && j == 1) cut_innersq_global = cut_innersq[i * cuda_ntypes + j];
+
+          if((cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) * (cut_innersq_global - cut_innersq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    if(cutsqdiffer) {
+      cut_innersq_global = -1.0;
+      cudaMemcpyToSymbol(MY_AP(cut_innersq)      	, cut_innersq                    		, nx);
+    }
+
+    cudaMemcpyToSymbol(MY_AP(cut_innersq_global)	, &cut_innersq_global  				, sizeof(X_FLOAT));
+  }
+
+  if(need_q) {
+    X_FLOAT cut_coulsq[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
+      }
+    }
+
+    int cutsqdiffer = 0;
+    X_FLOAT cut_coulsq_global;
+    cut_coulsq_global = (X_FLOAT)(sdata->pair.cut_coul_global * sdata->pair.cut_coul_global);
+
+    if(sdata->pair.cut_coulsq_global > cut_coulsq_global)  cut_coulsq_global = (X_FLOAT) sdata->pair.cut_coulsq_global;
+
+    if(sdata->pair.cut_coul) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = i; j <= sdata->atom.ntypes; ++j) {
+          if(sdata->pair.cut_coul[i][j] > 1e-6) {
+            cut_coulsq[i * cuda_ntypes + j] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
+            cut_coulsq[j * cuda_ntypes + i] = (X_FLOAT)(sdata->pair.cut_coul[i][j] * sdata->pair.cut_coul[i][j]);
+          }
+
+          if(i == 1 && j == 1) cut_coulsq_global = cut_coulsq[i * cuda_ntypes + j];
+
+          if((cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) * (cut_coulsq_global - cut_coulsq[i * cuda_ntypes + j]) > 1e-6)
+            cutsqdiffer++;
+        }
+      }
+    }
+
+    if(cutsqdiffer) {
+      cut_coulsq_global = -1.0;
+      cudaMemcpyToSymbol(MY_AP(cut_coulsq)      	, cut_coulsq                    		, nx);
+    }
+
+    cudaMemcpyToSymbol(MY_AP(cut_coulsq_global), &cut_coulsq_global  					, sizeof(X_FLOAT));
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init pre Coeff failed");
+
+  if(ncoeff > 0) {
+    F_FLOAT coeff1[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff1[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff1[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff1_gm)  , &sdata->pair.coeff1_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy((sdata->pair.coeff1_gm.dev_data), coeff1, n, cudaMemcpyHostToDevice);
+
+      _coeff1_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff1_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff1_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff1_gm_texture_ptr = &MY_AP(coeff1_gm_tex);
+      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 a failed");
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b failed");
+      cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c failed");
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 b-d failed");
+      cudaBindTexture(0, coeff1_gm_texture_ptr, sdata->pair.coeff1_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+      CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 c-d failed");
+#endif
+
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff1), coeff1 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff0 failed");
+
+  if(ncoeff > 1) {
+    F_FLOAT coeff2[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff2[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff2[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff2_gm)  , &sdata->pair.coeff2_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff2_gm.dev_data, coeff2, n, cudaMemcpyHostToDevice);
+
+      _coeff2_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff2_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff2_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff2_gm_texture_ptr = &MY_AP(coeff2_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff2_gm_texture_ptr, sdata->pair.coeff2_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff2), coeff2 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff1 failed");
+
+  if(ncoeff > 2) {
+    F_FLOAT coeff3[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff3[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff3[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff3_gm)  , &sdata->pair.coeff3_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff3_gm.dev_data, coeff3, n, cudaMemcpyHostToDevice);
+      _coeff3_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff3_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff3_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff3_gm_texture_ptr = &MY_AP(coeff3_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff3_gm_texture_ptr, sdata->pair.coeff3_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff3), coeff3 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff3 failed");
+
+  if(ncoeff > 3) {
+    F_FLOAT coeff4[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff4[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff4[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff4_gm)  , &sdata->pair.coeff4_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff4_gm.dev_data, coeff4, n, cudaMemcpyHostToDevice);
+      _coeff4_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff4_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff4_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff4_gm_texture_ptr = &MY_AP(coeff4_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff4_gm_texture_ptr, sdata->pair.coeff4_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff4), coeff4 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff4 failed");
+
+  if(ncoeff > 4) {
+    F_FLOAT coeff5[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff5[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff5[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff5_gm)  , &sdata->pair.coeff5_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff5_gm.dev_data, coeff5, n, cudaMemcpyHostToDevice);
+      _coeff5_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff5_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff5_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff5_gm_texture_ptr = &MY_AP(coeff5_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff5_gm_texture_ptr, sdata->pair.coeff5_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    } else
+      cudaMemcpyToSymbol(MY_AP(coeff5), coeff5 , n);
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff5 failed");
+
+  if(ncoeff > 5) {
+    F_FLOAT coeff6[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff6[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff6[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff6_gm)  , &sdata->pair.coeff6_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff6_gm.dev_data, coeff6, n, cudaMemcpyHostToDevice);
+      _coeff6_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff6_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff6_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff6_gm_texture_ptr = &MY_AP(coeff6_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff6_gm_texture_ptr, sdata->pair.coeff6_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff6 failed");
+
+  if(ncoeff > 6) {
+    F_FLOAT coeff7[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff7[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff7[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff7_gm)  , &sdata->pair.coeff7_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff7_gm.dev_data, coeff7, n, cudaMemcpyHostToDevice);
+      _coeff7_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff7_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff7_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff7_gm_texture_ptr = &MY_AP(coeff7_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff7_gm_texture_ptr, sdata->pair.coeff7_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff7 failed");
+
+  if(ncoeff > 7) {
+    F_FLOAT coeff8[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff8[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff8[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff8_gm)  , &sdata->pair.coeff8_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff8_gm.dev_data, coeff8, n, cudaMemcpyHostToDevice);
+      _coeff8_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff8_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff8_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff8_gm_texture_ptr = &MY_AP(coeff8_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff8_gm_texture_ptr, sdata->pair.coeff8_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff8 failed");
+
+  if(ncoeff > 8) {
+    F_FLOAT coeff9[cuda_ntypes2];
+
+    for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+      for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+        coeff9[i * cuda_ntypes + j] = (F_FLOAT) sdata->pair.coeff9[i][j];
+      }
+    }
+
+    if(use_global_params) {
+      cudaMemcpyToSymbol(MY_AP(coeff9_gm)  , &sdata->pair.coeff9_gm.dev_data   , sizeof(F_FLOAT*));
+      cudaMemcpy(sdata->pair.coeff9_gm.dev_data, coeff9, n, cudaMemcpyHostToDevice);
+      _coeff9_gm_tex.normalized = false;                      // use unnormalized texture coordinates
+      _coeff9_gm_tex.filterMode = cudaFilterModePoint;        // point mode: return the raw texel, no filtering
+      _coeff9_gm_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+      const textureReference* coeff9_gm_texture_ptr = &MY_AP(coeff9_gm_tex);
+
+#if F_PRECISION == 1
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(F_FLOAT));
+#else
+      cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int2>();
+      cudaBindTexture(0, coeff9_gm_texture_ptr, sdata->pair.coeff9_gm.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int2));
+#endif
+    }
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init Coeff9 failed");
+
+  F_FLOAT special_lj[4];
+  special_lj[0] = sdata->pair.special_lj[0];
+  special_lj[1] = sdata->pair.special_lj[1];
+  special_lj[2] = sdata->pair.special_lj[2];
+  special_lj[3] = sdata->pair.special_lj[3];
+
+
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+
+  cudaMemcpyToSymbol(MY_AP(box_size)   	, box_size                 		, sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes)	, &cuda_ntypes            		, sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(special_lj) 	, special_lj               		, sizeof(F_FLOAT) * 4);
+  cudaMemcpyToSymbol(MY_AP(virial)     	, &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)     	, &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity)	, sdata->domain.periodicity		, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later  , sizeof(int));
+
+  if(need_q) {
+    F_FLOAT qqrd2e_tmp = sdata->pppm.qqrd2e;
+    F_FLOAT special_coul[4];
+    special_coul[0] = sdata->pair.special_coul[0];
+    special_coul[1] = sdata->pair.special_coul[1];
+    special_coul[2] = sdata->pair.special_coul[2];
+    special_coul[3] = sdata->pair.special_coul[3];
+
+    cudaMemcpyToSymbol(MY_AP(special_coul)	, special_coul             		, sizeof(F_FLOAT) * 4);
+    cudaMemcpyToSymbol(MY_AP(g_ewald)    	, &sdata->pair.g_ewald	   		, sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(qqrd2e)     	, &qqrd2e_tmp	   				, sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(kappa)     	, &sdata->pair.kappa				, sizeof(F_FLOAT));
+    cudaMemcpyToSymbol(MY_AP(eng_coul)     , &sdata->pair.eng_coul.dev_data , sizeof(ENERGY_FLOAT*));
+  }
+
+  CUT_CHECK_ERROR("Cuda_Pair: init failed");
+}
+timespec startpairtime, endpairtime;
+//Called prior to kernel invocation: determines the grid, binds textures, and updates constant memory if necessary
+void Cuda_Pair_PreKernel_AllStyles(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, dim3 &grid, dim3 &threads, int &sharedperproc, bool need_q = false, int maxthreads = 256)
+{
+  if(sdata->atom.nlocal == 0) return;
+
+  if(sdata->atom.update_neigh)
+    Cuda_Pair_UpdateNeighbor_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax_AllStyles(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  }
+
+
+
+  BindXTypeTexture(sdata);
+
+  if(need_q) BindQTexture(sdata);
+
+
+  sharedperproc = 0;
+
+  if(sdata->pair.use_block_per_atom) sharedperproc += 3;
+
+  if(eflag) sharedperproc += 1;
+
+  if(need_q && eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
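+
+  // sharedperproc counts the ENERGY_FLOAT accumulator slots each block needs:
+  // 3 for the per-block force reduction in block-per-atom mode, 1 for
+  // eng_vdwl, 1 more for eng_coul with charged styles, and 6 for the virial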
+
+  int threadnum = sneighlist->inum;
+
+  if(sdata->comm.comm_phase == 2) threadnum = sneighlist->inum_border2;
+
+  if(sdata->pair.use_block_per_atom) {
+    threadnum *= 64;
+    maxthreads = 64;
+  }
+
+  int3 layout = getgrid(threadnum, sharedperproc * sizeof(ENERGY_FLOAT), maxthreads, true); //need to limit to 192 threads due to register limit
+  threads.x = layout.z;
+  threads.y = 1;
+  threads.z = 1;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  grid.z = 1;
+
+  int size = (unsigned)(layout.y * layout.x) * sharedperproc * sizeof(ENERGY_FLOAT);
+
+  if(sdata->pair.collect_forces_later) size += (unsigned)(sdata->atom.nmax * 3 * sizeof(F_FLOAT));
+
+  Cuda_UpdateBuffer(sdata, size);
+
+  if(sdata->pair.use_block_per_atom)
+    cudaMemset(sdata->buffer, 0, size);
+
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  if(sdata->pair.use_block_per_atom) sdata->pair.n_energy_virial -= 3;
+
+  clock_gettime(CLOCK_REALTIME, &startpairtime);
+
+  MYDBG(printf("# CUDA: Cuda_Pair: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+}
+
+//Called after the kernel invocation; collects energy and virial
+void Cuda_Pair_PostKernel_AllStyles(cuda_shared_data* sdata, dim3 &grid, int &sharedperproc, int eflag, int vflag)
+{
+  if((not sdata->pair.collect_forces_later) && (eflag || vflag)) {
+    cudaThreadSynchronize();
+    clock_gettime(CLOCK_REALTIME, &endpairtime);
+    sdata->cuda_timings.pair_kernel +=
+      endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
+    CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
+
+    if(eflag || vflag) {
+      int n = grid.x * grid.y;
+
+      if(sdata->pair.use_block_per_atom)
+        grid.x = sharedperproc - 3;
+      else
+        grid.x = sharedperproc;
+
+      grid.y = 1;
+      dim3 threads(128, 1, 1);
+      MYDBG(printf("# CUDA: Cuda_Pair: virial compute kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+      MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+      cudaThreadSynchronize();
+      CUT_CHECK_ERROR("Cuda_Pair: virial compute Kernel execution failed");
+    }
+
+    MYDBG(printf("# CUDA: Cuda_Pair: kernel done\n");)
+  }
+}
+
+
+#include "pair_born_coul_long_cuda.cu"
+#include "pair_buck_coul_cut_cuda.cu"
+#include "pair_buck_coul_long_cuda.cu"
+#include "pair_buck_cuda.cu"
+#include "pair_lj_sdk_cuda.cu"
+#include "pair_lj_sdk_coul_cut_cuda.cu"
+#include "pair_lj_sdk_coul_debye_cuda.cu"
+#include "pair_lj_sdk_coul_long_cuda.cu"
+#include "pair_gran_hooke_cuda.cu"
+#include "pair_lj_charmm_coul_charmm_implicit_cuda.cu"
+#include "pair_lj_charmm_coul_charmm_cuda.cu"
+#include "pair_lj_charmm_coul_long_cuda.cu"
+#include "pair_lj_class2_coul_cut_cuda.cu"
+#include "pair_lj_class2_coul_long_cuda.cu"
+#include "pair_lj_class2_cuda.cu"
+#include "pair_lj_cut_coul_cut_cuda.cu"
+#include "pair_lj_cut_coul_debye_cuda.cu"
+#include "pair_lj_cut_coul_long_cuda.cu"
+#include "pair_lj_cut_cuda.cu"
+#include "pair_lj_cut_experimental_cuda.cu"
+#include "pair_lj_expand_cuda.cu"
+#include "pair_lj_gromacs_cuda.cu"
+#include "pair_lj_gromacs_coul_gromacs_cuda.cu"
+#include "pair_lj_smooth_cuda.cu"
+#include "pair_lj96_cut_cuda.cu"
+#include "pair_morse_coul_long_cuda.cu"
+#include "pair_morse_cuda.cu"
+#include "pair_eam_cuda.cu"
+
+#include "cuda_pair_kernel.cu"
+
+#include "pair_manybody_const.h"
+#include "pair_tersoff_cuda.cu"
+#include "pair_sw_cuda.cu"
+
+void Cuda_Pair_UpdateNmax(cuda_shared_data* sdata)
+{
+  CUT_CHECK_ERROR("Cuda_Pair: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(nlocal)    , & sdata->atom.nlocal             , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)      , & sdata->atom.nall               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)      , & sdata->atom.nmax               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(type)      , & sdata->atom.type       .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(x)         , & sdata->atom.x          .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type)    , & sdata->atom.x_type     .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(xhold)     , & sdata->atom.xhold      .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)         , & sdata->atom.v          .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(radius)    , & sdata->atom.radius     .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v_radius)  , & sdata->atom.v_radius   .dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(omega)     , & sdata->atom.omega      .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rmass)     , & sdata->atom.rmass      .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_Pair: updateNmax failed");
+}
+
+
+void Cuda_Pair_GenerateXType(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: GenerateXType ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal) {
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  }
+
+  MYDBG(printf(" # CUDA: GenerateXType ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateXType ... kernel start test\n");  fflush(stdout);)
+  Pair_GenerateXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateXType ... end\n");  fflush(stdout);)
+}
+
+void Cuda_Pair_RevertXType(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: RevertXType ... start\n");)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_RevertXType_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+  MYDBG(printf(" # CUDA: RevertXType ... end\n");)
+}
+
+void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: GenerateVRadius ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateVRadius ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateVRadius ... kernel start test\n");  fflush(stdout);)
+  Pair_GenerateVRadius_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateVRadius: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateVRadius ... end\n");  fflush(stdout);)
+}
+
+void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata)
+{
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... start %i %i %i %p %p %p %p\n", sdata->atom.nlocal, sdata->atom.nall, sdata->atom.nmax, sdata->atom.x.dev_data, sdata->atom.x_type.dev_data, sdata->atom.xhold.dev_data, sdata->atom.type.dev_data);)
+
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... getgrid\n"); fflush(stdout);)
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... kernel start test\n");  fflush(stdout);)
+  Pair_GenerateOmegaRmass_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateOmegaRmass: Kernel failed");
+  MYDBG(printf(" # CUDA: GenerateOmegaRmass ... end\n");  fflush(stdout);)
+}
+
+void Cuda_Pair_BuildXHold(cuda_shared_data* sdata)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_Pair_UpdateNmax(sdata);
+
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nall);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Pair_BuildXHold_Kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair GenerateXType: Kernel failed");
+}
+
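+// Host-side collection step for the "collect forces later" mode: stop the pair-kernel
+// timer, reduce the per-block energy/virial partial sums in _buffer (if requested via
+// eflag/vflag), then launch a kernel that adds the buffered per-atom forces into _f.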
+void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag)
+{
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &endpairtime);
+  sdata->cuda_timings.pair_kernel +=
+    endpairtime.tv_sec - startpairtime.tv_sec + 1.0 * (endpairtime.tv_nsec - startpairtime.tv_nsec) / 1000000000;
+  CUT_CHECK_ERROR("Cuda_Pair: Kernel execution failed");
+  dim3 threads;
+  dim3 grid;
+
+  if(eflag || vflag) {
+    int n = sdata->pair.lastgridsize;
+    grid.x = sdata->pair.n_energy_virial;
+    grid.y = 1;
+    threads.x = 128;
+    //printf("A grid.x: %i\n",grid.x);
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x * sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_Pair_CollectForces: virial compute Kernel execution failed");
+  }
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  threads.x = layout.z;
+  grid.x = layout.x;
+  grid.y = layout.y;
+  Pair_CollectForces_Kernel <<< grid, threads, 0>>>(sdata->pair.n_energy_virial, sdata->pair.lastgridsize);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Pair_CollectForces: Force Summation Kernel execution failed");
+
+}
diff --git a/lib/cuda/cuda_pair_cu.h b/lib/cuda/cuda_pair_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..bec7e82229d380b420c2e451f60097c480707971
--- /dev/null
+++ b/lib/cuda/cuda_pair_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include "cuda_shared.h"
+
+extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
+extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
+extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
+extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
+extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
+extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);
diff --git a/lib/cuda/cuda_pair_kernel.cu b/lib/cuda/cuda_pair_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c697f9c7ea278c03aa3de265c374fdf1e127cb2
--- /dev/null
+++ b/lib/cuda/cuda_pair_kernel.cu
@@ -0,0 +1,1437 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
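+// Coefficients of the erfc() approximation from Abramowitz & Stegun, Eq. 7.1.26,
+// used in the real-space part of the long-range Coulomb (COUL_LONG) interaction;
+// EWALD_F is 2/sqrt(pi).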
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+
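+// "Thread per atom" (TpA) pair kernel: each thread handles one atom i of the
+// interaction list and loops serially over its neighbors. The pair and Coulomb styles
+// are compile-time template parameters, so each instantiation reduces the switch
+// statements below to a single inlined evaluation.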
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+  ENERGY_FLOAT ecoul = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE;
+  ENERGY_FLOAT* sharedECoul;
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
+  if(eflag || eflag_atom) {
+    sharedE = &sharedmem[threadIdx.x];
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+
+    if(coul_type != COUL_NONE) {
+      sharedECoul = sharedE + blockDim.x;
+      sharedECoul[0] = ENERGY_F(0.0);
+      sharedV += blockDim.x;
+    }
+  }
+
+  if(vflag || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT fxtmp, fytmp, fztmp, fpair;
+  F_FLOAT delx, dely, delz;
+  F_FLOAT factor_lj, factor_coul;
+  F_FLOAT qtmp;
+  int itype, i, j;
+  int jnum = 0;
+  int* jlist;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    myxtype = fetchXType(i);
+    xtmp = myxtype.x;
+    ytmp = myxtype.y;
+    ztmp = myxtype.z;
+    itype = static_cast <int>(myxtype.w);
+
+
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+
+    if(coul_type != COUL_NONE)
+      qtmp = fetchQ(i);
+
+    jnum = _numneigh[i];
+    jlist = &_neighbors[i];
+  }
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    if(ii < _inum)
+      if(jj < jnum) {
+        fpair = F_F(0.0);
+        j = jlist[jj * _nlocal];
+        factor_lj =  _special_lj[sbmask(j)];
+
+        if(coul_type != COUL_NONE)
+          factor_coul = _special_coul[sbmask(j)];
+
+        j &= NEIGHMASK;
+
+        myxtype = fetchXType(j);
+        delx = xtmp - myxtype.x;
+        dely = ytmp - myxtype.y;
+        delz = ztmp - myxtype.z;
+        int jtype = static_cast <int>(myxtype.w);
+
+
+        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
+
+        if(in_cutoff) {
+          switch(pair_type) {
+            case PAIR_BORN:
+              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_BUCK:
+              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_CG_CMM:
+              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CHARMM:
+              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CLASS2:
+              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CUT:
+              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_EXPAND:
+              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_GROMACS:
+              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_SMOOTH:
+              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ96_CUT:
+              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_MORSE_R6:
+              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_MORSE:
+              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+          }
+        }
+
+        if(coul_type != COUL_NONE) {
+          const F_FLOAT qiqj = qtmp * fetchQ(j);
+
+          if(qiqj * qiqj > 1e-8) {
+            const bool in_coul_cutoff =
+              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
+
+            if(in_coul_cutoff) {
+              switch(coul_type) {
+                case COUL_CHARMM:
+                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_CHARMM_IMPLICIT:
+                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_CUT: {
+                  const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
+
+                  if(eflag) {
+                    ecoul += forcecoul;
+                  }
+
+                  fpair += forcecoul * (F_F(1.0) / rsq);
+                }
+                break;
+
+                case COUL_DEBYE: {
+                  const F_FLOAT r2inv = F_F(1.0) / rsq;
+                  const X_FLOAT r = _RSQRT_(r2inv);
+                  const X_FLOAT rinv = F_F(1.0) / r;
+                  const F_FLOAT screening = _EXP_(-_kappa * r);
+                  F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
+
+                  if(eflag) {
+                    ecoul += forcecoul * rinv;
+                  }
+
+                  forcecoul *= (_kappa + rinv);
+                  fpair += forcecoul * r2inv;
+                }
+                break;
+
+                case COUL_GROMACS:
+                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_LONG: {
+                  const F_FLOAT r2inv = F_F(1.0) / rsq;
+                  const F_FLOAT r = _RSQRT_(r2inv);
+                  const F_FLOAT grij = _g_ewald * r;
+                  const F_FLOAT expm2 = _EXP_(-grij * grij);
+                  const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
+                  const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
+                  const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
+                  F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+
+                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
+
+                  if(eflag) {
+                    ecoul += prefactor * erfc;
+
+                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
+                  }
+
+                  fpair += forcecoul * r2inv;
+                }
+                break;
+              }
+            }
+
+            in_cutoff = in_cutoff || in_coul_cutoff;
+          }
+        }
+
+
+        if(in_cutoff) {
+          F_FLOAT dxfp, dyfp, dzfp;
+          fxtmp += dxfp = delx * fpair;
+          fytmp += dyfp = dely * fpair;
+          fztmp += dzfp = delz * fpair;
+
+          if(vflag) {
+            sharedV[0 * blockDim.x] += delx * dxfp;
+            sharedV[1 * blockDim.x] += dely * dyfp;
+            sharedV[2 * blockDim.x] += delz * dzfp;
+            sharedV[3 * blockDim.x] += delx * dyfp;
+            sharedV[4 * blockDim.x] += delx * dzfp;
+            sharedV[5 * blockDim.x] += dely * dzfp;
+          }
+        }
+      }
+  }
+
+  __syncthreads();
+
+  if(ii < _inum) {
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+      if(eflag) {
+        buffer = &buffer[1 * gridDim.x * gridDim.y];
+
+        if(coul_type != COUL_NONE)
+          buffer = &buffer[1 * gridDim.x * gridDim.y];
+      }
+
+      if(vflag) {
+        buffer = &buffer[6 * gridDim.x * gridDim.y];
+      }
+
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = fxtmp;
+      my_f += _nmax;
+      *my_f = fytmp;
+      my_f += _nmax;
+      *my_f = fztmp;
+    } else {
+      my_f = _f + i;
+      *my_f += fxtmp;
+      my_f += _nmax;
+      *my_f += fytmp;
+      my_f += _nmax;
+      *my_f += fztmp;
+    }
+  }
+
+  __syncthreads();
+
+  if(eflag) {
+    sharedE[0] = evdwl;
+
+    if(coul_type != COUL_NONE)
+      sharedECoul[0] = ecoul;
+  }
+
+  if(eflag_atom && i < _nlocal) {
+    if(coul_type != COUL_NONE)
+      _eatom[i] += evdwl + ecoul;
+    else
+      _eatom[i] += evdwl;
+  }
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
+}
+
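+// "Block per atom" (BpA) pair kernel: a whole thread block cooperates on one atom i,
+// striding over its neighbor list; per-thread partial forces, energies and virials
+// are combined afterwards by the shared-memory tree reduction at the end of the kernel.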
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y);
+
+  if(ii >= _inum)
+    return;
+
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+  ENERGY_FLOAT ecoul = ENERGY_F(0.0);
+  F_FLOAT3* sharedVirial1;
+  F_FLOAT3* sharedVirial2;
+  F_FLOAT* sharedEnergy;
+  F_FLOAT* sharedEnergyCoul;
+
+  F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
+
+  if(vflag) {
+    sharedVirial1 = &sharedForce[64];
+    sharedVirial2 = &sharedVirial1[64];
+  } else {
+    sharedVirial1 = &sharedForce[0];
+    sharedVirial2 = &sharedVirial1[0];
+  }
+
+  if(eflag) {
+    if(vflag || vflag_atom)
+      sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
+    else
+      sharedEnergy = (F_FLOAT*) &sharedForce[64];
+
+    if(coul_type != COUL_NONE)
+      sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
+
+  }
+
+  F_FLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
+  F_FLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
+  F_FLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT delx, dely, delz;
+  F_FLOAT factor_lj, factor_coul;
+  F_FLOAT fpair;
+  F_FLOAT qtmp;
+  int itype, jnum, i, j;
+  int* jlist;
+
+  i = _ilist[ii];
+
+  myxtype = fetchXType(i);
+
+  xtmp = myxtype.x;
+  ytmp = myxtype.y;
+  ztmp = myxtype.z;
+  itype = static_cast <int>(myxtype.w);
+
+  if(coul_type != COUL_NONE)
+    qtmp = fetchQ(i);
+
+  jnum = _numneigh[i];
+
+  jlist = &_neighbors[i * _maxneighbors];
+  __syncthreads();
+
+  for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
+    if(jj < jnum) {
+      fpair = F_F(0.0);
+      j = jlist[jj];
+      factor_lj =  _special_lj[sbmask(j)];
+
+      if(coul_type != COUL_NONE)
+        factor_coul = _special_coul[sbmask(j)];
+
+      j &= NEIGHMASK;
+
+      myxtype = fetchXType(j);
+
+      delx = xtmp - myxtype.x;
+      dely = ytmp - myxtype.y;
+      delz = ztmp - myxtype.z;
+      int jtype = static_cast <int>(myxtype.w);
+
+      const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+      bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
+      bool in_coul_cutoff = false;  // initialize: only set inside the Coulomb branch below
+
+      if(in_cutoff) {
+        switch(pair_type) {
+          case PAIR_BORN:
+            fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_BUCK:
+            fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_CG_CMM:
+            fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CHARMM:
+            fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CLASS2:
+            fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CUT:
+            fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_EXPAND:
+            fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_GROMACS:
+            fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_SMOOTH:
+            fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ96_CUT:
+            fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_MORSE_R6:
+            fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_MORSE:
+            fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+        }
+      }
+
+      if(coul_type != COUL_NONE) {
+        const F_FLOAT qiqj = qtmp * fetchQ(j);
+
+        if(qiqj * qiqj > (1e-8f)) {
+          in_coul_cutoff =
+            rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
+
+          if(in_coul_cutoff) {
+            switch(coul_type) {
+              case COUL_CHARMM:
+                fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_CHARMM_IMPLICIT:
+                fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_GROMACS:
+                fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_LONG: {
+                const F_FLOAT r2inv = F_F(1.0) / rsq;
+                const F_FLOAT r = _RSQRT_(r2inv);
+                const F_FLOAT grij = _g_ewald * r;
+                const F_FLOAT expm2 = _EXP_(-grij * grij);
+                const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
+                const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
+                const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
+                F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+
+                if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
+
+                if(eflag) {
+                  ecoul += prefactor * erfc;
+
+                  if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
+                }
+
+                fpair += forcecoul * r2inv;
+              }
+              break;
+
+              case COUL_DEBYE: {
+                const F_FLOAT r2inv = F_F(1.0) / rsq;
+                const X_FLOAT r = _RSQRT_(r2inv);
+                const X_FLOAT rinv = F_F(1.0) / r;
+                const F_FLOAT screening = _EXP_(-_kappa * r);
+                F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
+
+                if(eflag) {
+                  ecoul += forcecoul * rinv;
+                }
+
+                forcecoul *= (_kappa + rinv);
+                fpair += forcecoul * r2inv;
+              }
+              break;
+
+              case COUL_CUT: {
+                const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
+
+                if(eflag) {
+                  ecoul += forcecoul;
+                }
+
+                fpair += forcecoul * (F_F(1.0) / rsq);
+              }
+              break;
+
+
+            }
+          }
+        }
+      }
+
+
+
+      if(in_cutoff || in_coul_cutoff) {
+        F_FLOAT dxfp, dyfp, dzfp;
+        partialForce.x += dxfp = delx * fpair;
+        partialForce.y += dyfp = dely * fpair;
+        partialForce.z += dzfp = delz * fpair;
+
+        if(vflag) {
+          partialVirial1.x += delx * dxfp;
+          partialVirial1.y += dely * dyfp;
+          partialVirial1.z += delz * dzfp;
+          partialVirial2.x += delx * dyfp;
+          partialVirial2.y += delx * dzfp;
+          partialVirial2.z += dely * dzfp;
+        }
+      }
+    }
+  }
+
+  if(eflag) {
+    sharedEnergy[threadIdx.x] = evdwl;
+
+    if(coul_type != COUL_NONE)
+      sharedEnergyCoul[threadIdx.x] = ecoul;
+  }
+
+  sharedForce[threadIdx.x] = partialForce;
+
+  if(vflag) {
+    sharedVirial1[threadIdx.x] = partialVirial1;
+    sharedVirial2[threadIdx.x] = partialVirial2;
+  }
+
+  __syncthreads();
+
+
+  for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
+
+    if(threadIdx.x < s) {
+      sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
+      sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
+      sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
+
+      if(vflag) {
+        sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
+        sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
+        sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
+
+        sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
+        sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
+        sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
+      }
+
+      if(eflag) {
+        sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
+
+        if(coul_type != COUL_NONE)
+          sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
+      }
+    }
+
+    __syncthreads();
+  }
+
+  if(threadIdx.x == 0) {
+
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+    if(eflag) {
+      ENERGY_FLOAT tmp_evdwl;
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
+
+      if(eflag_atom)
+        _eatom[i] = tmp_evdwl;
+
+      buffer = &buffer[gridDim.x * gridDim.y];
+
+      if(coul_type != COUL_NONE) {
+        buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
+
+        if(eflag_atom)
+          _eatom[i] += tmp_evdwl;
+
+        buffer = &buffer[gridDim.x * gridDim.y];
+      }
+    }
+
+    if(vflag) {
+      ENERGY_FLOAT tmp;
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
+
+      if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
+
+      if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
+
+      if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
+
+      if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
+
+      if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
+
+      if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
+
+      buffer = &buffer[6 * gridDim.x * gridDim.y];
+    }
+
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = sharedForce[0].x;
+      my_f += _nmax;
+      *my_f = sharedForce[0].y;
+      my_f += _nmax;
+      *my_f = sharedForce[0].z;
+    } else {
+      my_f = _f + i;
+      *my_f += sharedForce[0].x;
+      my_f += _nmax;
+      *my_f += sharedForce[0].y;
+      my_f += _nmax;
+      *my_f += sharedForce[0].z;
+    }
+  }
+}
+
+
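+// Variant of Pair_Kernel_TpA used when communication and computation are overlapped.
+// comm_phase selects the work set: 0 = full neighbor lists, 1 = inner neighbors only,
+// 2 = atoms in the border region (_ilist_border/_numneigh_border).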
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+  ENERGY_FLOAT ecoul = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE;
+  ENERGY_FLOAT* sharedECoul;
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
+  if(eflag || eflag_atom) {
+    sharedE = &sharedmem[threadIdx.x];
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+
+    if(coul_type != COUL_NONE) {
+      sharedECoul = sharedE + blockDim.x;
+      sharedECoul[0] = ENERGY_F(0.0);
+      sharedV += blockDim.x;
+    }
+  }
+
+  if(vflag || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT fxtmp, fytmp, fztmp, fpair;
+  F_FLOAT delx, dely, delz;
+  F_FLOAT factor_lj, factor_coul;
+  F_FLOAT qtmp;
+  int itype, i, j;
+  int jnum = 0;
+  int* jlist;
+
+  if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
+    i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;
+
+    myxtype = fetchXType(i);
+    xtmp = myxtype.x;
+    ytmp = myxtype.y;
+    ztmp = myxtype.z;
+    itype = static_cast <int>(myxtype.w);
+
+
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+
+    if(coul_type != COUL_NONE)
+      qtmp = fetchQ(i);
+
+    jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
+
+
+    jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
+  }
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    if(ii < (comm_phase < 2 ? _inum : _inum_border[0]))
+      if(jj < jnum) {
+        fpair = F_F(0.0);
+        j = jlist[jj * _nlocal];
+
+        factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
+
+        if(coul_type != COUL_NONE)
+          factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
+
+        j = j < _nall ? j : j % _nall;
+
+        myxtype = fetchXType(j);
+        delx = xtmp - myxtype.x;
+        dely = ytmp - myxtype.y;
+        delz = ztmp - myxtype.z;
+        int jtype = static_cast <int>(myxtype.w);
+
+
+        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
+
+        if(in_cutoff) {
+          switch(pair_type) {
+            case PAIR_BORN:
+              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_BUCK:
+              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_CG_CMM:
+              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CHARMM:
+              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CLASS2:
+              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_CUT:
+              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_EXPAND:
+              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_GROMACS:
+              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ_SMOOTH:
+              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_LJ96_CUT:
+              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_MORSE_R6:
+              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+
+            case PAIR_MORSE:
+              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+              break;
+          }
+        }
+
+        if(coul_type != COUL_NONE) {
+          const F_FLOAT qiqj = qtmp * fetchQ(j);
+
+          if(qiqj * qiqj > 1e-8) {
+            const bool in_coul_cutoff =
+              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
+
+            if(in_coul_cutoff) {
+              switch(coul_type) {
+                case COUL_CHARMM:
+                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_CHARMM_IMPLICIT:
+                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_CUT: {
+                  const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
+
+                  if(eflag) {
+                    ecoul += forcecoul;
+                  }
+
+                  fpair += forcecoul * (F_F(1.0) / rsq);
+                }
+                break;
+
+                case COUL_DEBYE: {
+                  const F_FLOAT r2inv = F_F(1.0) / rsq;
+                  const X_FLOAT r = _RSQRT_(r2inv);
+                  const X_FLOAT rinv = F_F(1.0) / r;
+                  const F_FLOAT screening = _EXP_(-_kappa * r);
+                  F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
+
+                  if(eflag) {
+                    ecoul += forcecoul * rinv;
+                  }
+
+                  forcecoul *= (_kappa + rinv);
+                  fpair += forcecoul * r2inv;
+                }
+                break;
+
+                case COUL_GROMACS:
+                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
+                  break;
+
+                case COUL_LONG: {
+                  const F_FLOAT r2inv = F_F(1.0) / rsq;
+                  const F_FLOAT r = _RSQRT_(r2inv);
+                  const F_FLOAT grij = _g_ewald * r;
+                  const F_FLOAT expm2 = _EXP_(-grij * grij);
+                  const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
+                  const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
+                  const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
+                  F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+
+                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
+
+                  if(eflag) {
+                    ecoul += prefactor * erfc;
+
+                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
+                  }
+
+                  fpair += forcecoul * r2inv;
+                }
+                break;
+
+              }
+            }
+
+            in_cutoff = in_cutoff || in_coul_cutoff;
+          }
+        }
+
+
+        if(in_cutoff) {
+          F_FLOAT dxfp, dyfp, dzfp;
+          fxtmp += dxfp = delx * fpair;
+          fytmp += dyfp = dely * fpair;
+          fztmp += dzfp = delz * fpair;
+
+          if(vflag) {
+            sharedV[0 * blockDim.x] += delx * dxfp;
+            sharedV[1 * blockDim.x] += dely * dyfp;
+            sharedV[2 * blockDim.x] += delz * dzfp;
+            sharedV[3 * blockDim.x] += delx * dyfp;
+            sharedV[4 * blockDim.x] += delx * dzfp;
+            sharedV[5 * blockDim.x] += dely * dzfp;
+          }
+        }
+      }
+  }
+
+  __syncthreads();
+
+  if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+      if(eflag) {
+        buffer = &buffer[1 * gridDim.x * gridDim.y];
+
+        if(coul_type != COUL_NONE)
+          buffer = &buffer[1 * gridDim.x * gridDim.y];
+      }
+
+      if(vflag) {
+        buffer = &buffer[6 * gridDim.x * gridDim.y];
+      }
+
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = fxtmp;
+      my_f += _nmax;
+      *my_f = fytmp;
+      my_f += _nmax;
+      *my_f = fztmp;
+    } else {
+      my_f = _f + i;
+      *my_f += fxtmp;
+      my_f += _nmax;
+      *my_f += fytmp;
+      my_f += _nmax;
+      *my_f += fztmp;
+    }
+  }
+
+  __syncthreads();
+
+  if(eflag) {
+    sharedE[0] = evdwl;
+
+    if(coul_type != COUL_NONE)
+      sharedECoul[0] = ecoul;
+  }
+
+  if(eflag_atom && i < _nlocal) {
+    if(coul_type != COUL_NONE)
+      _eatom[i] += evdwl + ecoul;
+    else
+      _eatom[i] += evdwl;
+  }
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
+}
+
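+// Block-per-atom counterpart of Pair_Kernel_TpA_opt; see the comm_phase notes above.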
+template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
+__global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
+{
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y);
+
+  if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
+    return;
+
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+  ENERGY_FLOAT ecoul = ENERGY_F(0.0);
+  F_FLOAT3* sharedVirial1;
+  F_FLOAT3* sharedVirial2;
+  F_FLOAT* sharedEnergy;
+  F_FLOAT* sharedEnergyCoul;
+
+  F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
+
+  if(vflag) {
+    sharedVirial1 = &sharedForce[64];
+    sharedVirial2 = &sharedVirial1[64];
+  } else {
+    sharedVirial1 = &sharedForce[0];
+    sharedVirial2 = &sharedVirial1[0];
+  }
+
+  if(eflag) {
+    if(vflag || vflag_atom)
+      sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
+    else
+      sharedEnergy = (F_FLOAT*) &sharedForce[64];
+
+    if(coul_type != COUL_NONE)
+      sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
+
+  }
+
+  F_FLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
+  F_FLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
+  F_FLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT delx, dely, delz;
+  F_FLOAT factor_lj, factor_coul;
+  F_FLOAT fpair;
+  F_FLOAT qtmp;
+  int itype, jnum, i, j;
+  int* jlist;
+
+  i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];
+
+  myxtype = fetchXType(i);
+
+  xtmp = myxtype.x;
+  ytmp = myxtype.y;
+  ztmp = myxtype.z;
+  itype = static_cast <int>(myxtype.w);
+
+  if(coul_type != COUL_NONE)
+    qtmp = fetchQ(i);
+
+  jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
+
+  jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
+  __syncthreads();
+
+  for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
+    if(jj < jnum) {
+      fpair = F_F(0.0);
+      j = jlist[jj];
+      factor_lj   = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
+
+      if(coul_type != COUL_NONE)
+        factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
+
+      j = j < _nall ? j : j % _nall;
+
+      myxtype = fetchXType(j);
+
+      delx = xtmp - myxtype.x;
+      dely = ytmp - myxtype.y;
+      delz = ztmp - myxtype.z;
+      int jtype = static_cast <int>(myxtype.w);
+
+      const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+      bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
+      bool in_coul_cutoff = false;  // initialize: only set inside the Coulomb branch below
+
+      if(in_cutoff) {
+        switch(pair_type) {
+          case PAIR_BORN:
+            fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_BUCK:
+            fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_CG_CMM:
+            fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CHARMM:
+            fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CLASS2:
+            fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_CUT:
+            fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_EXPAND:
+            fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_GROMACS:
+            fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ_SMOOTH:
+            fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_LJ96_CUT:
+            fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_MORSE_R6:
+            fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+
+          case PAIR_MORSE:
+            fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
+            break;
+        }
+      }
+
+      if(coul_type != COUL_NONE) {
+        const F_FLOAT qiqj = qtmp * fetchQ(j);
+
+        if(qiqj * qiqj > (1e-8f)) {
+          in_coul_cutoff =
+            rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
+
+          if(in_coul_cutoff) {
+            switch(coul_type) {
+              case COUL_CHARMM:
+                fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_CHARMM_IMPLICIT:
+                fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_GROMACS:
+                fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
+                break;
+
+              case COUL_LONG: {
+                const F_FLOAT r2inv = F_F(1.0) / rsq;
+                const F_FLOAT r = _RSQRT_(r2inv);
+                const F_FLOAT grij = _g_ewald * r;
+                const F_FLOAT expm2 = _EXP_(-grij * grij);
+                const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
+                const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
+                const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
+                F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+
+                if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
+
+                if(eflag) {
+                  ecoul += prefactor * erfc;
+
+                  if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
+                }
+
+                fpair += forcecoul * r2inv;
+              }
+              break;
+
+              case COUL_DEBYE: {
+                const F_FLOAT r2inv = F_F(1.0) / rsq;
+                const X_FLOAT r = _RSQRT_(r2inv);
+                const X_FLOAT rinv = F_F(1.0) / r;
+                const F_FLOAT screening = _EXP_(-_kappa * r);
+                F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
+
+                if(eflag) {
+                  ecoul += forcecoul * rinv;
+                }
+
+                forcecoul *= (_kappa + rinv);
+                fpair += forcecoul * r2inv;
+              }
+              break;
+
+              case COUL_CUT: {
+                const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
+
+                if(eflag) {
+                  ecoul += forcecoul;
+                }
+
+                fpair += forcecoul * (F_F(1.0) / rsq);
+              }
+              break;
+
+
+            }
+          }
+        }
+      }
+
+
+
+      if(in_cutoff || in_coul_cutoff) {
+        F_FLOAT dxfp, dyfp, dzfp;
+        partialForce.x += dxfp = delx * fpair;
+        partialForce.y += dyfp = dely * fpair;
+        partialForce.z += dzfp = delz * fpair;
+
+        if(vflag) {
+          partialVirial1.x += delx * dxfp;
+          partialVirial1.y += dely * dyfp;
+          partialVirial1.z += delz * dzfp;
+          partialVirial2.x += delx * dyfp;
+          partialVirial2.y += delx * dzfp;
+          partialVirial2.z += dely * dzfp;
+        }
+      }
+    }
+  }
+
+  if(eflag) {
+    sharedEnergy[threadIdx.x] = evdwl;
+
+    if(coul_type != COUL_NONE)
+      sharedEnergyCoul[threadIdx.x] = ecoul;
+  }
+
+  sharedForce[threadIdx.x] = partialForce;
+
+  if(vflag) {
+    sharedVirial1[threadIdx.x] = partialVirial1;
+    sharedVirial2[threadIdx.x] = partialVirial2;
+  }
+
+  __syncthreads();
+
+
+  for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
+
+    if(threadIdx.x < s) {
+      sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
+      sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
+      sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
+
+      if(vflag) {
+        sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
+        sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
+        sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
+
+        sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
+        sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
+        sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
+      }
+
+      if(eflag) {
+        sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
+
+        if(coul_type != COUL_NONE)
+          sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
+      }
+    }
+
+    __syncthreads();
+  }
+
+  if(threadIdx.x == 0) {
+
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+    if(eflag) {
+      ENERGY_FLOAT tmp_evdwl;
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
+
+      if(eflag_atom)
+        _eatom[i] = tmp_evdwl;
+
+      buffer = &buffer[gridDim.x * gridDim.y];
+
+      if(coul_type != COUL_NONE) {
+        buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
+
+        if(eflag_atom)
+          _eatom[i] += tmp_evdwl;
+
+        buffer = &buffer[gridDim.x * gridDim.y];
+      }
+    }
+
+    if(vflag) {
+      ENERGY_FLOAT tmp;
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
+
+      if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
+
+      if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
+
+      if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
+
+      if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
+
+      if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
+
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
+
+      if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
+
+      buffer = &buffer[6 * gridDim.x * gridDim.y];
+    }
+
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = sharedForce[0].x;
+      my_f += _nmax;
+      *my_f = sharedForce[0].y;
+      my_f += _nmax;
+      *my_f = sharedForce[0].z;
+    } else {
+      my_f = _f + i;
+      *my_f += sharedForce[0].x;
+      my_f += _nmax;
+      *my_f += sharedForce[0].y;
+      my_f += _nmax;
+      *my_f += sharedForce[0].z;
+    }
+  }
+}
+
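+// Helper kernels: pack per-atom data into float4-style structs (position+type,
+// velocity+radius, omega+rmass) so the pair kernels can fetch them with a single load,
+// unpack them again (RevertXType), snapshot positions (BuildXHold), and add the
+// buffered per-atom forces into _f (CollectForces).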
+__global__ void Pair_GenerateXType_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nall) {
+    X_FLOAT4 xtype;
+    xtype.x = _x[i];
+    xtype.y = _x[i + _nmax];
+    xtype.z = _x[i + 2 * _nmax];
+    xtype.w = _type[i];
+    _x_type[i] = xtype;
+  }
+
+}
+
+__global__ void Pair_GenerateVRadius_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nall) {
+    V_FLOAT4 vradius;
+    vradius.x = _v[i];
+    vradius.y = _v[i + _nmax];
+    vradius.z = _v[i + 2 * _nmax];
+    vradius.w = _radius[i];
+    _v_radius[i] = vradius;
+  }
+}
+
+__global__ void Pair_GenerateOmegaRmass_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nall) {
+    V_FLOAT4 omegarmass;
+    omegarmass.x = _omega[i];
+    omegarmass.y = _omega[i + _nmax];
+    omegarmass.z = _omega[i + 2 * _nmax];
+    omegarmass.w = _rmass[i];
+    _omega_rmass[i] = omegarmass;
+  }
+}
+
+__global__ void Pair_RevertXType_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nall) {
+    X_FLOAT4 xtype = _x_type[i];
+    _x[i] = xtype.x;
+    _x[i + _nmax] = xtype.y;
+    _x[i + 2 * _nmax] = xtype.z;
+    _type[i] = static_cast <int>(xtype.w);
+  }
+
+}
+
+__global__ void Pair_BuildXHold_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nall) {
+    X_FLOAT4 xtype = _x_type[i];
+    _xhold[i] = xtype.x;
+    _xhold[i + _nmax] = xtype.y;
+    _xhold[i + 2 * _nmax] = xtype.z;
+  }
+
+}
+
+__global__ void Pair_CollectForces_Kernel(int nperblock, int n)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i >= _nlocal) return;
+
+  ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
+
+  F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
+  F_FLOAT* my_f = _f + i;
+  buf_f += i;
+  *my_f += *buf_f;
+  my_f += _nmax;
+  buf_f += _nmax;
+  *my_f += *buf_f;
+  my_f += _nmax;
+  buf_f += _nmax;
+  *my_f += *buf_f;
+}
diff --git a/lib/cuda/cuda_pair_virial_kernel_nc.cu b/lib/cuda/cuda_pair_virial_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3987bde43e3ccd9b1d2e8bc2343d8a1f9d6c01bc
--- /dev/null
+++ b/lib/cuda/cuda_pair_virial_kernel_nc.cu
@@ -0,0 +1,126 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ ENERGY_FLOAT sharedmem[];
+
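+// Block-wide reduction of the per-thread energy/virial accumulators in shared memory;
+// thread 0 then writes one partial sum per block into _buffer, laid out as
+// [evdwl][ecoul][6 x virial] with gridDim.x*gridDim.y entries per quantity. The final
+// sum over blocks is performed by PairVirialCompute_reduce below.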
+static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
+{
+  __syncthreads();
+  ENERGY_FLOAT* shared = sharedmem;
+
+  if(eflag) {
+    reduceBlock(shared);
+    shared += blockDim.x;
+
+    if(coulflag) {
+      reduceBlock(shared);
+      shared += blockDim.x;
+    }
+  }
+
+  if(vflag) {
+    reduceBlock(shared + 0 * blockDim.x);
+    reduceBlock(shared + 1 * blockDim.x);
+    reduceBlock(shared + 2 * blockDim.x);
+    reduceBlock(shared + 3 * blockDim.x);
+    reduceBlock(shared + 4 * blockDim.x);
+    reduceBlock(shared + 5 * blockDim.x);
+  }
+
+  if(threadIdx.x == 0) {
+    shared = sharedmem;
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+    if(eflag) {
+      buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
+      shared += blockDim.x;
+      buffer += gridDim.x * gridDim.y;
+
+      if(coulflag) {
+        buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
+        shared += blockDim.x;
+        buffer += gridDim.x * gridDim.y;
+      }
+    }
+
+    if(vflag) {
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
+    }
+  }
+
+  __syncthreads();
+}
+
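+// Final reduction over the per-block partial sums in _buffer. gridDim.x encodes which
+// quantities are present: 1 = evdwl; 2 = evdwl+ecoul; 6 = virial only; 7 = evdwl+virial;
+// 8 = evdwl+ecoul+virial (cf. the launch in Cuda_Pair_CollectForces).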
+__global__ void MY_AP(PairVirialCompute_reduce)(int n)
+{
+  sharedmem[threadIdx.x] = ENERGY_F(0.0);
+  ENERGY_FLOAT sum = ENERGY_F(0.0);
+  ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
+  buf = &buf[blockIdx.x * n];
+  //if(blockIdx.x==2) buf=&buf[n];
+
+  for(int i = 0; i < n; i += blockDim.x) {
+    sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
+    __syncthreads();
+    reduceBlock(sharedmem);
+
+    if(threadIdx.x == 0) sum += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0) {
+    if(gridDim.x == 1) { //evdwl
+      _eng_vdwl[0] += sum;
+    }
+
+    if(gridDim.x == 2) { //evdwl + ecoul only
+      if(blockIdx.x == 0)
+        _eng_vdwl[0] += sum;
+      else
+        _eng_coul[0] += sum;
+    }
+
+    if(gridDim.x == 6) { //virial
+      _virial[blockIdx.x] += sum;
+    }
+
+    if(gridDim.x == 7) { //evdwl+virial
+      if(blockIdx.x == 0)
+        _eng_vdwl[0] += sum;
+      else _virial[blockIdx.x - 1] += sum;
+    }
+
+    if(gridDim.x == 8) { //evdwl+ecoul+virial
+      if(blockIdx.x == 0)
+        _eng_vdwl[0] += sum;
+      else if(blockIdx.x == 1)
+        _eng_coul[0] += sum;
+      else
+        _virial[blockIdx.x - 2] += sum;
+    }
+  }
+}
diff --git a/lib/cuda/cuda_precision.h b/lib/cuda/cuda_precision.h
new file mode 100644
index 0000000000000000000000000000000000000000..2dc4ab5607cc42ced1d3a1ae88005730a7b7dde3
--- /dev/null
+++ b/lib/cuda/cuda_precision.h
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef CUDA_PRECISION_H_
+#define CUDA_PRECISION_H_
+/* This file gives type definitions for mixed-precision calculation in the CUDA part of LAMMPS-CUDA.
+ * The default behaviour is set by the global CUDA_PRECISION (can be overridden at compile time).
+ * ***_FLOAT: floating-point type used for the given property
+ * ***_F: literal-suffix macro (a bare 1.0 is a double, 1.0f a float; write CUDA_F(1.0) to get
+ *        the literal matching the configured precision)
+ */
+
+#ifdef CUDA_USE_BINNING
+#define CUDA_IF_BINNING(a) a
+#else
+#define CUDA_IF_BINNING(a)
+#endif
+
+//GLOBAL
+
+#ifdef CUDA_PRECISION
+#if CUDA_PRECISION == 1
+#define CUDA_FLOAT float
+#define CUDA_F(x) x##f
+#endif
+#if CUDA_PRECISION == 2
+#define CUDA_FLOAT double
+#define CUDA_F(x) x
+#endif
+#endif
+
+#ifndef CUDA_PRECISION
+#define CUDA_FLOAT double
+#define CUDA_F(x) x
+#define CUDA_PRECISION 2
+#endif
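+/* Illustrative example: with CUDA_PRECISION == 1,
+ *   CUDA_FLOAT half = CUDA_F(0.5);
+ * expands to "float half = 0.5f;", while with CUDA_PRECISION == 2 it stays a double 0.5. */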
+//--------------------------------
+//-----------FFT-----------------
+//--------------------------------
+
+#ifdef FFT_PRECISION_CU
+#if FFT_PRECISION_CU == 1
+#define FFT_FLOAT float
+#define FFT_F(x) x##f
+#endif
+#if FFT_PRECISION_CU == 2
+#define FFT_FLOAT double
+#define FFT_F(x) x
+#endif
+#endif
+
+#ifndef FFT_PRECISION_CU
+#define FFT_FLOAT CUDA_FLOAT
+#define FFT_F(x) CUDA_F(x)
+#define FFT_PRECISION_CU CUDA_PRECISION
+#endif
+
+//--------------------------------
+//-----------PPPM-----------------
+//--------------------------------
+
+#ifndef PPPM_PRECISION
+#define PPPM_PRECISION CUDA_PRECISION
+#endif
+
+#ifdef PPPM_PRECISION
+#if PPPM_PRECISION == 1
+#define PPPM_FLOAT float
+#ifdef float3
+#define PPPM_FLOAT3 float3
+#else
+struct PPPM_FLOAT3 {
+  PPPM_FLOAT x;
+  PPPM_FLOAT y;
+  PPPM_FLOAT z;
+};
+#endif
+#define PPPM_F(x) x##f
+#endif
+#if PPPM_PRECISION == 2
+#define PPPM_FLOAT double
+struct PPPM_FLOAT3 {
+  PPPM_FLOAT x;
+  PPPM_FLOAT y;
+  PPPM_FLOAT z;
+};
+#define PPPM_F(x) x
+#endif
+#endif
+
+
+//--------------------------------
+//-----------FORCE-----------------
+//--------------------------------
+
+
+#ifdef F_PRECISION
+#if F_PRECISION == 1
+#define F_FLOAT float
+#define F_F(x) x##f
+#endif
+#if F_PRECISION == 2
+#define F_FLOAT double
+#define F_F(x) x
+#endif
+#endif
+
+#ifndef F_PRECISION
+#define F_FLOAT CUDA_FLOAT
+#define F_F(x) CUDA_F(x)
+#define F_PRECISION CUDA_PRECISION
+#endif
+
+#if F_PRECISION == 1
+#define _SQRT_ sqrtf
+#define _RSQRT_ rsqrtf
+#define _EXP_ expf
+#else
+#define _SQRT_ sqrt
+#define _RSQRT_ rsqrt
+#define _EXP_ exp
+#endif
+
+#if F_PRECISION == 2
+struct F_FLOAT2 {
+  F_FLOAT x;
+  F_FLOAT y;
+};
+struct F_FLOAT3 {
+  F_FLOAT x;
+  F_FLOAT y;
+  F_FLOAT z;
+};
+struct F_FLOAT4 {
+  F_FLOAT x;
+  F_FLOAT y;
+  F_FLOAT z;
+  F_FLOAT w;
+};
+#else
+#define F_FLOAT2 float2
+#define F_FLOAT3 float3
+#define F_FLOAT4 float4
+#endif
+//--------------------------------
+//-----------ENERGY-----------------
+//--------------------------------
+
+#ifdef ENERGY_PRECISION
+#if ENERGY_PRECISION == 1
+#define ENERGY_FLOAT float
+#define ENERGY_F(x) x##f
+#endif
+#if ENERGY_PRECISION == 2
+#define ENERGY_FLOAT double
+#define ENERGY_F(x) x
+#endif
+#endif
+
+#ifndef ENERGY_PRECISION
+#define ENERGY_FLOAT CUDA_FLOAT
+#define ENERGY_F(x) CUDA_F(x)
+#define ENERGY_PRECISION CUDA_PRECISION
+#endif
+
+//--------------------------------
+//-----------POSITIONS------------
+//--------------------------------
+
+#ifdef X_PRECISION
+#if X_PRECISION == 1
+#define X_FLOAT float
+#define X_F(x) x##f
+#endif
+#if X_PRECISION == 2
+#define X_FLOAT double
+#define X_F(x) x
+#endif
+#endif
+
+#ifndef X_PRECISION
+#define X_FLOAT CUDA_FLOAT
+#define X_F(x) CUDA_F(x)
+#define X_PRECISION CUDA_PRECISION
+#endif
+
+#if X_PRECISION == 2
+struct X_FLOAT2 {
+  X_FLOAT x;
+  X_FLOAT y;
+};
+struct X_FLOAT3 {
+  X_FLOAT x;
+  X_FLOAT y;
+  X_FLOAT z;
+};
+struct X_FLOAT4 {
+  X_FLOAT x;
+  X_FLOAT y;
+  X_FLOAT z;
+  X_FLOAT w;
+};
+#else
+#define X_FLOAT2 float2
+#define X_FLOAT3 float3
+#define X_FLOAT4 float4
+#endif
+
+//--------------------------------
+//-----------velocities-----------
+//--------------------------------
+
+#ifdef V_PRECISION
+#if V_PRECISION == 1
+#define V_FLOAT float
+#define V_F(x) x##f
+#endif
+#if V_PRECISION == 2
+#define V_FLOAT double
+#define V_F(x) x
+#endif
+#endif
+
+#ifndef V_PRECISION
+#define V_FLOAT CUDA_FLOAT
+#define V_F(x) CUDA_F(x)
+#define V_PRECISION CUDA_PRECISION
+#endif
+
+#if V_PRECISION == 2
+struct V_FLOAT4 {
+  V_FLOAT x;
+  V_FLOAT y;
+  V_FLOAT z;
+  V_FLOAT w;
+};
+#else
+#define V_FLOAT4 float4
+#endif
+
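+// when NO_PREC_TIMING is defined (precise timing disabled at build time),
+// provide a dummy timespec and turn clock_gettime calls into no-ops so the
+// timing code still compiles on platforms without POSIX clocks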
+#ifdef NO_PREC_TIMING
+struct timespec_2 {
+  unsigned int tv_sec;
+  unsigned int tv_nsec;
+};
+
+#define timespec timespec_2
+#define clock_gettime(a,b)
+#endif
+#endif /*CUDA_PRECISION_H_*/
diff --git a/lib/cuda/cuda_shared.h b/lib/cuda/cuda_shared.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d29336b003665155a1e221217a2568158de9836
--- /dev/null
+++ b/lib/cuda/cuda_shared.h
@@ -0,0 +1,370 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_SHARED_H_
+#define _CUDA_SHARED_H_
+#include "cuda_precision.h"
+
+#define CUDA_MAX_DEBUG_SIZE 1000 // size of the debugdata array (holds this many doubles, or twice as many ints)
+
+struct dev_array {
+  void* dev_data;			// pointer to memory address on cuda device
+  unsigned dim[3];		// array dimensions
+};
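+
+// Per-atom vector data in a dev_array is laid out component-major with stride
+// nmax: the x/y/z components of atom i live at dev_data[i], dev_data[i + nmax]
+// and dev_data[i + 2*nmax] (see e.g. the kernels in domain_kernel.cu).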
+
+struct cuda_shared_atom {	// relevant data from atom class
+  dev_array dx; 			// accumulated displacement since the last binning
+  dev_array x;			// position
+  dev_array v;			// velocity
+  dev_array f;			// force
+  dev_array tag;			// global atom ID
+  dev_array type; 		// atom type; ghosttype = ntypes (ntypescuda = ntypes+1)
+  dev_array mask;
+  dev_array image;
+  dev_array q;			// charges
+  dev_array mass;			// per-type masses
+  dev_array rmass;		// per-atom masses
+  dev_array radius;		// per-atom radius
+  dev_array density;
+  dev_array omega;
+  dev_array torque;
+  dev_array molecule;
+
+  dev_array special;
+  int maxspecial;
+  dev_array nspecial;
+  int* special_flag;
+  int molecular;
+
+  dev_array eatom;		// per-atom energy
+  dev_array vatom;		// per-atom virial
+  int need_eatom;
+  int need_vatom;
+
+  dev_array x_type;		// position + type in X_FLOAT4 struct
+  dev_array v_radius;		// velocity + radius in V_FLOAT4 struct; currently only used for granular atom_style
+  dev_array omega_rmass;		// omega + rmass in V_FLOAT4 struct; currently only used for granular atom_style
+
+  double* mass_host;		// host pointer to the per-type masses
+  //int natoms;				// total # of atoms in system, could be 0
+  int nghost;				// # of ghost atoms on this proc
+  int nlocal;				// # of owned atoms on this proc
+  int nall;			    // total # of owned + ghost atoms on this proc
+  int nmax;				// max # of owned+ghost in arrays on this proc
+  int ntypes;
+  int q_flag;				// do we have charges?
+  int rmass_flag;			// do we have per-atom masses?
+  int firstgroup;
+  int nfirst;
+
+  int update_nlocal;
+  int update_nmax;
+  int update_neigh;
+
+  dev_array xhold;	    // position at last neighbor build
+  X_FLOAT triggerneighsq;		// maximum squared displacement before reneighboring
+  int reneigh_flag;		// is reneighboring necessary
+  int maxhold;			// size of xhold
+  int dist_check; 		//perform distance check for reneighboring
+  dev_array binned_id;    // id of each binned atom (not its tag!)
+  dev_array binned_idnew; // new id of each binned atom for sorting, i.e. atom[binned_id[k]] moves to atom[binned_idnew[k]]
+  float bin_extraspace;
+  int bin_dim[3];
+  int bin_nmax;
+  dev_array map_array;
+};
+
+struct cuda_shared_pair {	// relevant data from pair class
+  char cudable_force;		// check for (cudable_force!=0)
+  X_FLOAT cut_global;
+  X_FLOAT cut_inner_global;
+  X_FLOAT cut_coul_global;
+  double** cut;			// type-type cutoff
+  double** cutsq;			// type-type cutoff
+  double** cut_inner;			// type-type inner cutoff
+  double** cut_coul;			// type-type cutoff for coul
+  double** coeff1;		// type-type pair parameters
+  double** coeff2;
+  double** coeff3;
+  double** coeff4;
+  double** coeff5;
+  double** coeff6;
+  double** coeff7;
+  double** coeff8;
+  double** coeff9;
+  double** coeff10;
+  double** offset;
+  double* special_lj;
+  double* special_coul;
+  dev_array virial; // ENERGY_FLOAT
+  dev_array eng_vdwl; // ENERGY_FLOAT
+  dev_array eng_coul; // ENERGY_FLOAT
+  X_FLOAT cut_coulsq_global;
+  F_FLOAT g_ewald, kappa;
+  int freeze_group_bit;
+
+  dev_array coeff1_gm;
+  dev_array coeff2_gm;
+  dev_array coeff3_gm;
+  dev_array coeff4_gm;
+  dev_array coeff5_gm;
+  dev_array coeff6_gm;
+  dev_array coeff7_gm;
+  dev_array coeff8_gm;
+  dev_array coeff9_gm;
+  dev_array coeff10_gm;
+
+  int lastgridsize;
+  int n_energy_virial;
+  int collect_forces_later;
+  int use_block_per_atom;
+  int override_block_per_atom;
+  bool neighall;
+
+};
+
+struct cuda_shared_domain {	// relevant data from domain class
+  X_FLOAT sublo[3];			// orthogonal box -> sub-box bounds on this proc
+  X_FLOAT subhi[3];
+  X_FLOAT boxlo[3];
+  X_FLOAT boxhi[3];
+  X_FLOAT prd[3];
+  int periodicity[3];		// xyz periodicity as array
+
+  int triclinic;
+  X_FLOAT xy;
+  X_FLOAT xz;
+  X_FLOAT yz;
+  X_FLOAT boxlo_lamda[3];
+  X_FLOAT boxhi_lamda[3];
+  X_FLOAT prd_lamda[3];
+  X_FLOAT h[6];
+  X_FLOAT h_inv[6];
+  V_FLOAT h_rate[6];
+  int update;
+};
+
+struct cuda_shared_pppm {
+  char cudable_force;
+#ifdef FFT_CUFFT
+  FFT_FLOAT* work1;
+  FFT_FLOAT* work2;
+  FFT_FLOAT* work3;
+  PPPM_FLOAT* greensfn;
+  PPPM_FLOAT* fkx;
+  PPPM_FLOAT* fky;
+  PPPM_FLOAT* fkz;
+  PPPM_FLOAT* vg;
+#endif
+  int* part2grid;
+  PPPM_FLOAT* density_brick;
+  int* density_brick_int;
+  PPPM_FLOAT density_intScale;
+  PPPM_FLOAT* vdx_brick;
+  PPPM_FLOAT* vdy_brick;
+  PPPM_FLOAT* vdz_brick;
+  PPPM_FLOAT* density_fft;
+  ENERGY_FLOAT* energy;
+  ENERGY_FLOAT* virial;
+  int nxlo_in;
+  int nxhi_in;
+  int nxlo_out;
+  int nxhi_out;
+  int nylo_in;
+  int nyhi_in;
+  int nylo_out;
+  int nyhi_out;
+  int nzlo_in;
+  int nzhi_in;
+  int nzlo_out;
+  int nzhi_out;
+  int nx_pppm;
+  int ny_pppm;
+  int nz_pppm;
+  PPPM_FLOAT qqrd2e;
+  int order;
+  // float3 sublo;
+  PPPM_FLOAT* rho_coeff;
+  int nmax;
+  int nlocal;
+  PPPM_FLOAT* debugdata;
+  PPPM_FLOAT delxinv;
+  PPPM_FLOAT delyinv;
+  PPPM_FLOAT delzinv;
+  int nlower;
+  int nupper;
+  PPPM_FLOAT shiftone;
+  PPPM_FLOAT3* fH;
+};
+
+struct cuda_shared_comm {
+  int maxswap;
+  int maxlistlength;
+  dev_array pbc;
+  dev_array slablo;
+  dev_array slabhi;
+  dev_array multilo;
+  dev_array multihi;
+  dev_array sendlist;
+  int grow_flag;
+  int comm_phase;
+
+  int nsend;
+  int* nsend_swap;
+  int* send_size;
+  int* recv_size;
+  double** buf_send;
+  void** buf_send_dev;
+  double** buf_recv;
+  void** buf_recv_dev;
+  void* buffer;
+  int buffer_size;
+  double overlap_split_ratio;
+};
+
+struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data
+  int maxlocal;
+  int inum;                // # of I atoms neighbors are stored for
+  int inum_border2;
+  dev_array inum_border;         // # of atoms which interact with border atoms
+  dev_array ilist;               // local indices of I atoms
+  dev_array ilist_border;
+  dev_array numneigh;
+  dev_array numneigh_inner;
+  dev_array numneigh_border;
+  dev_array firstneigh;
+  dev_array neighbors;
+  dev_array neighbors_border;
+  dev_array neighbors_inner;
+  int maxpage;
+  dev_array page_pointers;
+  dev_array* pages;
+  int maxneighbors;
+  int neigh_lists_per_page;
+  double** cutneighsq;
+  CUDA_FLOAT* cu_cutneighsq;
+  int* binned_id;
+  int* bin_dim;
+  int bin_nmax;
+  float bin_extraspace;
+  double maxcut;
+  dev_array ex_type;
+  int nex_type;
+  dev_array ex1_bit;
+  dev_array ex2_bit;
+  int nex_group;
+  dev_array ex_mol_bit;
+  int nex_mol;
+
+};
+
+struct cuda_compile_settings {	// used to compare the compile settings (e.g. precision) of the .cu files against the .cpp files
+  int prec_glob;
+  int prec_x;
+  int prec_v;
+  int prec_f;
+  int prec_pppm;
+  int prec_fft;
+  int cufft;
+  int arch;
+};
+
+struct cuda_timings_struct {
+  //Debug:
+  double test1;
+  double test2;
+  //transfers
+  double transfer_upload_tmp_constr;
+  double transfer_download_tmp_deconstr;
+
+  //communication
+  double comm_forward_total;
+  double comm_forward_mpi_upper;
+  double comm_forward_mpi_lower;
+  double comm_forward_kernel_pack;
+  double comm_forward_kernel_unpack;
+  double comm_forward_kernel_self;
+  double comm_forward_upload;
+  double comm_forward_download;
+
+  double comm_exchange_total;
+  double comm_exchange_mpi;
+  double comm_exchange_kernel_pack;
+  double comm_exchange_kernel_unpack;
+  double comm_exchange_kernel_fill;
+  double comm_exchange_cpu_pack;
+  double comm_exchange_upload;
+  double comm_exchange_download;
+
+  double comm_border_total;
+  double comm_border_mpi;
+  double comm_border_kernel_pack;
+  double comm_border_kernel_unpack;
+  double comm_border_kernel_self;
+  double comm_border_kernel_buildlist;
+  double comm_border_upload;
+  double comm_border_download;
+
+  //pair forces
+  double pair_xtype_conversion;
+  double pair_kernel;
+  double pair_virial;
+  double pair_force_collection;
+
+  //neighbor
+  double neigh_bin;
+  double neigh_build;
+  double neigh_special;
+
+  //PPPM
+  double pppm_particle_map;
+  double pppm_make_rho;
+  double pppm_brick2fft;
+  double pppm_poisson;
+  double pppm_fillbrick;
+  double pppm_fieldforce;
+  double pppm_compute;
+
+};
+
+struct cuda_shared_data {	// holds space for all relevant data from the different classes
+  void* buffer; // holds temporary GPU data [data used in subroutines, which need not stay consistent outside of that routine]
+  int buffersize; // maximum size of buffer
+  int buffer_new; // should be 1 if the pointer to buffer has changed
+  void* flag;
+  void* debugdata;  // array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and host array
+  cuda_shared_atom atom;
+  cuda_shared_pair pair;
+  cuda_shared_domain domain;
+  cuda_shared_pppm pppm;
+  cuda_shared_comm comm;
+  cuda_compile_settings compile_settings;
+  cuda_timings_struct cuda_timings;
+  int exchange_dim;
+  int me; //mpi rank
+  unsigned int datamask;
+  int overlap_comm;
+};
+
+
+#endif // #ifndef _CUDA_SHARED_H_
diff --git a/lib/cuda/cuda_wrapper.cu b/lib/cuda/cuda_wrapper.cu
new file mode 100644
index 0000000000000000000000000000000000000000..50366a87da1f9d739d3585a52935560d3ba1a247
--- /dev/null
+++ b/lib/cuda/cuda_wrapper.cu
@@ -0,0 +1,337 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "cuda_wrapper_cu.h"
+#include "cuda_wrapper_kernel.cu"
+
+static int CudaWrapper_total_gpu_mem = 0;
+static double CudaWrapper_total_upload_time = 0;
+static double CudaWrapper_total_download_time = 0;
+static double CudaWrapper_cpubuffer_upload_time = 0;
+static double CudaWrapper_cpubuffer_download_time = 0;
+static cudaStream_t* streams;
+static int nstreams = 0;
+
+void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
+{
+  MYDBG(printf("# CUDA: debug mode on\n");)
+
+#if __DEVICE_EMULATION__
+
+  printf("# CUDA: emulation mode on\n");
+
+#else
+
+  // modified from cutil.h
+  static int deviceCount = 0;
+  static bool sharedmode = false;
+
+  if(deviceCount && !sharedmode) return;
+
+  if(deviceCount && sharedmode) cudaThreadExit();
+
+  CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
+
+  if(deviceCount == 0) {
+    fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)
+
+  cudaDeviceProp deviceProp[deviceCount];
+
+  for(int i = 0; i < deviceCount; i++)
+    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));
+
+
+  int dev_list[deviceCount];
+
+  for(int i = 0; i < deviceCount; i++) dev_list[i] = i;
+
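+  // sort the device ids by multiprocessor count (descending, simple bubble
+  // sort) so that the most capable GPUs are assigned first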
+  for(int i = 0; i < deviceCount; i++) {
+    for(int j = 0; j < deviceCount - 1 - i; j++)
+      if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
+        int k = dev_list[j];
+        dev_list[j] = dev_list[j + 1];
+        dev_list[j + 1] = k;
+      }
+  }
+
+  for(int i = 0; i < deviceCount; i++) {
+    if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true;
+
+    cudaSetDevice(i);
+    cudaSetDeviceFlags(cudaDeviceMapHost);
+  }
+
+  if(sharedmode) {
+    if(ppn && (me % ppn + 1) > deviceCount) {
+      printf("Asking for more GPUs per node than there are. Reduce the gpu/node setting.\n");
+      exit(0);
+    }
+
+    int devicea = me % ppn;
+
+    if(devicelist) devicea = devicelist[devicea];
+    else
+      devicea = dev_list[devicea];
+
+    if(devicea >= deviceCount)  {
+      printf("Asking for non-existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
+      exit(0);
+    }
+
+    MYDBG(
+      printf(" # CUDA  myid: %i take device: %i\n", me, devicea);
+    )
+    CUDA_SAFE_CALL(cudaSetDevice(devicea));
+  } else {
+    CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
+  }
+
+  cudaThreadSynchronize();
+
+  int dev;
+  CUDA_SAFE_CALL(cudaGetDevice(&dev));
+
+  if(deviceProp[dev].major < 1) {
+    fprintf(stderr, "CUDA error: device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  } else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
+    fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
+    exit(EXIT_FAILURE);
+  }
+
+  if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
+    fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor);
+  }
+
+  if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
+    fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
+    exit(EXIT_FAILURE);
+  }
+
+
+  fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
+
+  MYDBG
+  (
+    printf("name = %s\n", deviceProp[dev].name);
+    printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
+    printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
+    printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
+    printf("warpSize = %i\n", deviceProp[dev].warpSize);
+    printf("memPitch = %i\n", deviceProp[dev].memPitch);
+    printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
+    printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
+    printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
+    printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
+    printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
+    printf("clockRate = %i\n", deviceProp[dev].clockRate);
+    printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
+    printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
+    printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
+    printf("computeMode = %i\n", deviceProp[dev].computeMode);
+  )
+
+#endif
+}
+
+void* CudaWrapper_AllocCudaData(unsigned nbytes)
+{
+  void* dev_data;
+  CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
+  MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
+  CudaWrapper_total_gpu_mem += nbytes;
+  return dev_data;
+}
+
+void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
+{
+  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
+  cudaThreadSynchronize();
+  timespec time1, time2;
+  clock_gettime(CLOCK_REALTIME, &time1);
+  CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
+  clock_gettime(CLOCK_REALTIME, &time2);
+  CudaWrapper_total_upload_time +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+}
+
+void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
+{
+  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
+  cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
+}
+
+void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
+{
+  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
+  cudaThreadSynchronize();
+  timespec time1, time2;
+  clock_gettime(CLOCK_REALTIME, &time1);
+  CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
+  clock_gettime(CLOCK_REALTIME, &time2);
+  CudaWrapper_total_download_time +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+}
+
+void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
+{
+  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
+  cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
+}
+
+void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
+{
+  MYDBG(printf("# CUDA: freeing memory at dev%p with %u bytes (last address: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
+  CUDA_SAFE_CALL(cudaFree(dev_data));
+  CudaWrapper_total_gpu_mem -= nbytes;
+}
+
+void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
+{
+  MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
+  CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
+}
+
+void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
+{
+  MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
+  CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
+}
+
+void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
+{
+  void* host_data;
+  int flags = 0;
+
+  if(mapped) flags = flags | cudaHostAllocMapped;
+
+  if(writeCombined) flags = flags | cudaHostAllocWriteCombined;
+
+  CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags));
+  //	CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
+  MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);)
+  return host_data;
+}
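+
+/* Usage sketch (illustrative): allocate a mapped, write-combined pinned host
+ * buffer and release it again; the two flags map to cudaHostAllocMapped and
+ * cudaHostAllocWriteCombined as above:
+ *
+ *   double* buf = (double*) CudaWrapper_AllocPinnedHostData(n * sizeof(double), true, true);
+ *   ... fill buf, upload with CudaWrapper_UploadCudaData ...
+ *   CudaWrapper_FreePinnedHostData(buf);
+ */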
+
+void CudaWrapper_FreePinnedHostData(void* host_data)
+{
+  MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);)
+
+  if(host_data)
+    CUDA_SAFE_CALL(cudaFreeHost(host_data));
+}
+
+void cuda_check_error(char* comment)
+{
+  printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
+}
+
+int CudaWrapper_CheckMemUseage()
+{
+  size_t free, total;
+  cudaMemGetInfo(&free, &total);
+  return total - free; // as reported by cudaMemGetInfo (requires CUDA 3.0 or later?)
+  //return CudaWrapper_total_gpu_mem;
+}
+
+double CudaWrapper_CheckUploadTime(bool reset)
+{
+  if(reset) CudaWrapper_total_upload_time = 0.0;
+
+  return CudaWrapper_total_upload_time;
+}
+
+double CudaWrapper_CheckDownloadTime(bool reset)
+{
+  if(reset) CudaWrapper_total_download_time = 0.0;
+
+  return CudaWrapper_total_download_time;
+}
+
+double CudaWrapper_CheckCPUBufUploadTime(bool reset)
+{
+  if(reset) CudaWrapper_cpubuffer_upload_time = 0.0;
+
+  return CudaWrapper_cpubuffer_upload_time;
+}
+
+double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
+{
+  if(reset) CudaWrapper_cpubuffer_download_time = 0.0;
+
+  return CudaWrapper_cpubuffer_download_time;
+}
+
+void CudaWrapper_AddCPUBufUploadTime(double dt)
+{
+  CudaWrapper_cpubuffer_upload_time += dt;
+}
+
+void CudaWrapper_AddCPUBufDownloadTime(double dt)
+{
+  CudaWrapper_cpubuffer_download_time += dt;
+}
+
+void CudaWrapper_Sync()
+{
+  cudaThreadSynchronize();
+}
+
+void CudaWrapper_SyncStream(int stream)
+{
+  cudaStreamSynchronize(streams[stream]);
+}
+
+void CudaWrapper_AddStreams(int n)
+{
+  cudaStream_t* new_streams = new cudaStream_t[nstreams + n];
+
+  for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i];
+
+  for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]);
+
+  if(nstreams > 0)
+    delete [] streams;
+
+  streams = new_streams;
+  nstreams += n;
+}
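+
+/* Usage sketch (illustrative): create two streams once, overlap an upload with
+ * host work, then synchronize on the stream that carried the copy:
+ *
+ *   CudaWrapper_AddStreams(2);
+ *   CudaWrapper_UploadCudaDataAsync(host_buf, dev_buf, nbytes, 0);
+ *   ... do independent host work ...
+ *   CudaWrapper_SyncStream(0);
+ */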
+
+void* CudaWrapper_returnStreams()
+{
+  return (void*) streams;
+}
+
+int CudaWrapper_returnNStreams()
+{
+  return nstreams;
+}
+
diff --git a/lib/cuda/cuda_wrapper_cu.h b/lib/cuda/cuda_wrapper_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bcfaffd4466f0dc6ddc4883e7d16f0c45785d3a
--- /dev/null
+++ b/lib/cuda/cuda_wrapper_cu.h
@@ -0,0 +1,52 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef _CUDA_DATA_WRAPPER_H_
+#define _CUDA_DATA_WRAPPER_H_
+
+extern "C" void  CudaWrapper_Init(int argc, char** argv, int me = 0, int ppn = 2, int* devicelist = NULL);
+extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
+extern "C" void  CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
+extern "C" void  CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
+extern "C" void  CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
+extern "C" void  CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
+extern "C" void  CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes = 0);
+extern "C" void  CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
+extern "C" void  CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
+extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombind = false);
+extern "C" void  CudaWrapper_FreePinnedHostData(void* dev_data);
+extern "C" void  cuda_check_error(char* comment);
+extern "C" int   CudaWrapper_CheckMemUseage();
+extern "C" double CudaWrapper_CheckUploadTime(bool reset = false);
+extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false);
+extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false);
+extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset = false);
+extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt);
+extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt);
+extern "C" void CudaWrapper_Sync();
+extern "C" void CudaWrapper_SyncStream(int n);
+extern "C" void CudaWrapper_AddStreams(int n);
+extern "C" void* CudaWrapper_returnStreams();
+extern "C" int CudaWrapper_returnNStreams();
+
+#endif // _CUDA_DATA_WRAPPER_H_
diff --git a/lib/cuda/cuda_wrapper_kernel.cu b/lib/cuda/cuda_wrapper_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9e0369856ba913c2d88af2fb81b6b73cca194db3
--- /dev/null
+++ b/lib/cuda/cuda_wrapper_kernel.cu
@@ -0,0 +1,24 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+// empty file to obey the common make rule
diff --git a/lib/cuda/domain.cu b/lib/cuda/domain.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9dddbf65fd294b0a0c63fd668f82ed4637935af8
--- /dev/null
+++ b/lib/cuda/domain.cu
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX domain
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "domain_cu.h"
+#include "domain_kernel.cu"
+
+void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_Domain: resizing buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)    , & sdata->atom.tag .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(image)   , & sdata->atom.image.dev_data, sizeof(int*));
+}
+
+void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(boxlo)   ,  sdata->domain.boxlo       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(boxhi)   ,  sdata->domain.boxhi       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(sublo)   ,  sdata->domain.sublo       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(subhi)   ,  sdata->domain.subhi       , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(prd)     ,  sdata->domain.prd         , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(periodicity)   ,   sdata->domain.periodicity , 3 * sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(triclinic)     , & sdata->domain.triclinic   , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(boxlo_lamda)   ,   sdata->domain.boxlo_lamda , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(boxhi_lamda)   ,   sdata->domain.boxhi_lamda , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(prd_lamda)	   ,   sdata->domain.prd_lamda   , 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(h)	   	 ,   sdata->domain.h   		  , 6 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(h_inv)	 ,   sdata->domain.h_inv   	  , 6 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(h_rate)	 ,   sdata->domain.h_rate     , 6 * sizeof(V_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(flag)	 ,   &sdata->flag     , sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(debugdata)	 ,   &sdata->debugdata     , sizeof(int*));
+}
+
+void Cuda_Domain_Init(cuda_shared_data* sdata)
+{
+  Cuda_Domain_UpdateNmax(sdata);
+  Cuda_Domain_UpdateDomain(sdata);
+}
+
+void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent)
+{
+  Cuda_Domain_UpdateNmax(sdata);
+  //if(sdata->domain.update)
+  Cuda_Domain_UpdateDomain(sdata);
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int box_change = 0;
+
+  if(extent) box_change = 1;
+
+  int sharedmem = 0;
+
+  if(box_change) sharedmem = 6 * sizeof(X_FLOAT);
+
+  int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  sharedmem *= threads.x;
+
+  if((box_change) && (sdata->buffer_new or (6 * sizeof(X_FLOAT)*grid.x * grid.y > sdata->buffersize)))
+    Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_FLOAT));
+
+
+  Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
+  cudaThreadSynchronize();
+
+  CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
+
+  if(box_change) {
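+    // the buffer now holds six consecutive grid.x*grid.y arrays of per-block
+    // extrema (min_x, max_x, min_y, max_y, min_z, max_z); reduce them to the
+    // global box extent on the host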
+    X_FLOAT buf2[6 * layout.x * layout.y];
+    X_FLOAT* buf = buf2;
+    int flag;
+    cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+    //printf("Flag: %i\n",flag);
+    X_FLOAT min, max;
+    min = 1.0 * BIG;
+    max = -1.0 * BIG;
+
+    for(int i = 0; i < layout.x * layout.y; i++) {
+      if(buf[i] < min) min = buf[i];
+
+      if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
+    }
+
+    extent[0] = min;
+    extent[1] = max;
+
+    buf += 2 * layout.x * layout.y;
+    min = 1.0 * BIG;
+    max = -1.0 * BIG;
+
+    for(int i = 0; i < layout.x * layout.y; i++) {
+      if(buf[i] < min) min = buf[i];
+
+      if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
+    }
+
+    extent[2] = min;
+    extent[3] = max;
+
+    buf += 2 * layout.x * layout.y;
+    min = 1.0 * BIG;
+    max = -1.0 * BIG;
+
+    for(int i = 0; i < layout.x * layout.y; i++) {
+      if(buf[i] < min) min = buf[i];
+
+      if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
+    }
+
+    extent[4] = min;
+    extent[5] = max;
+    //printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]);
+    /*	   int n=grid.x*grid.y;
+    	   if(n<128) threads.x=32;
+    	   else if(n<256) threads.x=64;
+    	   else threads.x=128;
+    	   sharedmem=n*sizeof(X_FLOAT);
+    	   grid.x=6;
+    	   grid.y=1;
+    	   Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
+    	   cudaThreadSynchronize();
+    	   CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/
+  }
+}
+
+void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n)
+{
+  Cuda_Domain_UpdateNmax(sdata);
+  //if(sdata->domain.update)
+  Cuda_Domain_UpdateDomain(sdata);
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Domain_lamda2x_Kernel <<< grid, threads, 0>>>(n);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
+}
+
+void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n)
+{
+  Cuda_Domain_UpdateNmax(sdata);
+  //if(sdata->domain.update)
+  Cuda_Domain_UpdateDomain(sdata);
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Domain_x2lamda_Kernel <<< grid, threads, 0>>>(n);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
+}
diff --git a/lib/cuda/domain_cu.h b/lib/cuda/domain_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..38eb78e8568acf5e6451cd67924493735c20d533
--- /dev/null
+++ b/lib/cuda/domain_cu.h
@@ -0,0 +1,29 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent = NULL);
+extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n);
+extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n);
diff --git a/lib/cuda/domain_kernel.cu b/lib/cuda/domain_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fedb7807a80ee659108c9a805f488b47ec4c3a6f
--- /dev/null
+++ b/lib/cuda/domain_kernel.cu
@@ -0,0 +1,293 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ X_FLOAT sharedmem[];
+
+#define BIG 1e10
+__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
+{
+  int idim, otherdims;
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT lo[3];
+  X_FLOAT hi[3];
+  X_FLOAT* period;
+
+  if(_triclinic == 0) {
+    lo[0] = _boxlo[0];
+    lo[1] = _boxlo[1];
+    lo[2] = _boxlo[2];
+
+    hi[0] = _boxhi[0];
+    hi[1] = _boxhi[1];
+    hi[2] = _boxhi[2];
+    period = _prd;
+  } else {
+    lo[0] = _boxlo_lamda[0];
+    lo[1] = _boxlo_lamda[1];
+    lo[2] = _boxlo_lamda[2];
+
+    hi[0] = _boxhi_lamda[0];
+    hi[1] = _boxhi_lamda[1];
+    hi[2] = _boxhi_lamda[2];
+    period = _prd_lamda;
+  }
+
+
+  X_FLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
+  X_FLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
+  X_FLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);
+
+  X_FLOAT* buf = (X_FLOAT*) _buffer;
+  buf += blockIdx.x * gridDim.y + blockIdx.y;
+  buf[0] = tmpx;
+  buf += gridDim.x * gridDim.y;
+  buf[0] = tmpx;
+  buf += gridDim.x * gridDim.y;
+  buf[0] = tmpy;
+  buf += gridDim.x * gridDim.y;
+  buf[0] = tmpy;
+  buf += gridDim.x * gridDim.y;
+  buf[0] = tmpz;
+  buf += gridDim.x * gridDim.y;
+  buf[0] = tmpz;
+
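+  // the image flag packs three 10-bit counters (x: bits 0-9, y: bits 10-19,
+  // z: bits 20-29); wrapping an atom across a periodic boundary decrements or
+  // increments the corresponding counter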
+  if(i < _nlocal) {
+
+    if(_periodicity[0]) {
+      if(_x[i] < lo[0]) {
+        _x[i] += period[0];
+
+        if(deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];
+
+        idim = _image[i] & 1023;
+        otherdims = _image[i] ^ idim;
+        idim--;
+        idim &= 1023;
+        _image[i] = otherdims | idim;
+      }
+
+      if(_x[i] >= hi[0]) {
+        _x[i] -= period[0];
+        _x[i] = MAX(_x[i], lo[0]);
+
+        if(deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];
+
+        idim = _image[i] & 1023;
+        otherdims = _image[i] ^ idim;
+        idim++;
+        idim &= 1023;
+        _image[i] = otherdims | idim;
+      }
+    }
+
+    if(_periodicity[1]) {
+      if(_x[i + _nmax] < lo[1]) {
+        _x[i + _nmax] += period[1];
+
+        if(deform_remap && _mask[i] & deform_groupbit) {
+          _v[i] += _h_rate[5];
+          _v[i + _nmax] += _h_rate[1];
+        }
+
+        idim = (_image[i] >> 10) & 1023;
+        otherdims = _image[i] ^ (idim << 10);
+        idim--;
+        idim &= 1023;
+        _image[i] = otherdims | (idim << 10);
+      }
+
+      if(_x[i + _nmax] >= hi[1]) {
+        _x[i + _nmax] -= period[1];
+        _x[i + _nmax] = MAX(_x[i + _nmax], lo[1]);
+
+        if(deform_remap && _mask[i] & deform_groupbit) {
+          _v[i] -= _h_rate[5];
+          _v[i + _nmax] -= _h_rate[1];
+        }
+
+        idim = (_image[i] >> 10) & 1023;
+        otherdims = _image[i] ^ (idim << 10);
+        idim++;
+        idim &= 1023;
+        _image[i] = otherdims | (idim << 10);
+      }
+    }
+
+    if(_periodicity[2]) {
+      if(_x[i + 2 * _nmax] < lo[2]) {
+        _x[i + 2 * _nmax] += period[2];
+
+        if(deform_remap && _mask[i] & deform_groupbit) {
+          _v[i] += _h_rate[4];
+          _v[i + _nmax] += _h_rate[3];
+          _v[i + 2 * _nmax] += _h_rate[2];
+        }
+
+        idim = _image[i] >> 20;
+        otherdims = _image[i] ^ (idim << 20);
+        idim--;
+        idim &= 1023;
+        _image[i] = otherdims | (idim << 20);
+      }
+
+      if(_x[i + 2 * _nmax] >= hi[2]) {
+        _x[i + 2 * _nmax] -= period[2];
+        _x[i + 2 * _nmax] = MAX(_x[i + 2 * _nmax], lo[2]);
+
+        if(deform_remap && _mask[i] & deform_groupbit) {
+          _v[i] -= _h_rate[4];
+          _v[i + _nmax] -= _h_rate[3];
+          _v[i + 2 * _nmax] -= _h_rate[2];
+        }
+
+        idim = _image[i] >> 20;
+        otherdims = _image[i] ^ (idim << 20);
+        idim++;
+        idim &= 1023;
+        _image[i] = otherdims | (idim << 20);
+      }
+    }
+
+    if(box_change) {
+      tmpx = _x[i];
+      tmpy = _x[i + _nmax];
+      tmpz = _x[i + 2 * _nmax];
+
+
+    }
+  }
+
+  __syncthreads();
+
+  if(box_change) {
+    X_FLOAT minx = BIG;
+    X_FLOAT maxx = -BIG;
+    X_FLOAT miny = BIG;
+    X_FLOAT maxy = -BIG;
+    X_FLOAT minz = BIG;
+    X_FLOAT maxz = -BIG;
+
+    if(not _periodicity[0]) {
+      sharedmem[threadIdx.x] = tmpx;
+      minOfBlock(sharedmem);
+      minx = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpx;
+      maxOfBlock(sharedmem);
+      maxx = sharedmem[0];
+      __syncthreads();
+    } else {
+      minx = lo[0];
+      maxx = hi[0];
+    }
+
+    if(not _periodicity[1]) {
+      sharedmem[threadIdx.x] = tmpy;
+      minOfBlock(sharedmem);
+      miny = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpy;
+      maxOfBlock(sharedmem);
+      maxy = sharedmem[0];
+      __syncthreads();
+    } else {
+      miny = lo[1];
+      maxy = hi[1];
+    }
+
+    if(not _periodicity[2]) {
+      sharedmem[threadIdx.x] = tmpz;
+      minOfBlock(sharedmem);
+      minz = sharedmem[0];
+      __syncthreads();
+      sharedmem[threadIdx.x] = tmpz;
+      maxOfBlock(sharedmem);
+      maxz = sharedmem[0];
+      __syncthreads();
+    } else {
+      minz = lo[2];
+      maxz = hi[2];
+    }
+
+    if(threadIdx.x == 0) {
+      buf = (X_FLOAT*) _buffer;
+      buf += blockIdx.x * gridDim.y + blockIdx.y;
+      buf[0] = minx;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxx;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = miny;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxy;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = minz;
+      buf += gridDim.x * gridDim.y;
+      buf[0] = maxz;
+    }
+  }
+}
+
+__global__ void Domain_reduceBoxExtent(double* extent, int n)
+{
+  X_FLOAT* buf = (X_FLOAT*) _buffer;
+  buf += blockIdx.x * n;
+  copyGlobToShared(buf, sharedmem, n);
+
+  if(blockIdx.x % 2 == 0)
+    minOfData(sharedmem, n);
+  else
+    maxOfData(sharedmem, n);
+
+  extent[blockIdx.x] = sharedmem[0];
+}
+
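+// Convert lamda (fractional) coordinates into box coordinates: x = H*lamda + boxlo.
+// The upper-triangular box matrix H is stored in Voigt order:
+// h[0]=xx, h[1]=yy, h[2]=zz, h[3]=yz, h[4]=xz, h[5]=xy.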
+__global__ void Domain_lamda2x_Kernel(int n)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    X_FLOAT ytmp = _x[i + _nmax];
+    X_FLOAT ztmp = _x[i + 2 * _nmax];
+    _x[i] = _h[0] * _x[i] + _h[5] * ytmp + _h[4] * ztmp + _boxlo[0];
+    _x[i + _nmax] = _h[1] * ytmp + _h[3] * ztmp + _boxlo[1];
+    _x[i + 2 * _nmax] = _h[2] * ztmp + _boxlo[2];
+  }
+}
+
+__global__ void Domain_x2lamda_Kernel(int n)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT delta[3];
+
+  if(i < n) {
+    delta[0] = _x[i] - _boxlo[0];
+    delta[1] = _x[i + _nmax] - _boxlo[1];
+    delta[2] = _x[i + 2 * _nmax] - _boxlo[2];
+
+    _x[i] = _h_inv[0] * delta[0] + _h_inv[5] * delta[1] + _h_inv[4] * delta[2];
+    _x[i + _nmax] = _h_inv[1] * delta[1] + _h_inv[3] * delta[2];
+    _x[i + 2 * _nmax] = _h_inv[2] * delta[2];
+  }
+}
diff --git a/lib/cuda/fft3d_cuda.cu b/lib/cuda/fft3d_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d5ac077f9d0784758b946c25b9dcf276cad6677b
--- /dev/null
+++ b/lib/cuda/fft3d_cuda.cu
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+//#define CUDA_PRECISION 1
+#include "cuda_precision.h"
+#include "cuda_common.h"
+struct  FFT_DATA {
+  FFT_FLOAT re;
+  FFT_FLOAT im;
+};
+
+#include "fft3d_cuda_cu.h"
+#include "fft3d_cuda_kernel.cu"
+#include <stdio.h>
+
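+// Pack a real-valued double array into an interleaved complex FFT_FLOAT array
+// (imaginary parts zeroed), converting precision if necessary; one thread per
+// element along the fast dimension.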
+void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow)
+{
+
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast;
+  threads.y = 1;
+  threads.z = 1;
+  cudaThreadSynchronize();
+  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
+}
+
+
+void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
+{
+
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast * 2;
+  threads.y = 1;
+  threads.z = 1;
+  permute_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
+  cudaThreadSynchronize();
+  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
+}
+
+void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
+{
+
+  dim3 grid;
+  grid.x = nslow;
+  grid.y = nmid;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast * 2;
+  threads.y = 1;
+  threads.z = 1;
+  permute_scale_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out);
+  cudaThreadSynchronize();
+}
+void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
+{
+
+  dim3 grid;
+  grid.x = (ihi - ilo + 1);
+  grid.y = (jhi - jlo + 1);
+  grid.z = 1;
+  dim3 threads;
+  threads.x = (khi - klo + 1) * 2;
+  threads.y = 1;
+  threads.z = 1;
+  permute_part_kernel <<< grid, threads, 0>>>((FFT_FLOAT*)in, (FFT_FLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
+  cudaThreadSynchronize();
+}
+
+void FFTsyncthreads()
+{
+  cudaThreadSynchronize();
+}
+
diff --git a/lib/cuda/fft3d_cuda_cu.h b/lib/cuda/fft3d_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..6447d8e125463da7a5d485bf3010434c9e4a1222
--- /dev/null
+++ b/lib/cuda/fft3d_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void initfftdata(double* in, FFT_FLOAT* out, int nfast, int nmid, int nslow);
+extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
+extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
+extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);
+extern "C" void FFTsyncthreads();
diff --git a/lib/cuda/fft3d_cuda_kernel.cu b/lib/cuda/fft3d_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a1be74bb1eff790c9f4eb1b654171493c0c70de
--- /dev/null
+++ b/lib/cuda/fft3d_cuda_kernel.cu
@@ -0,0 +1,46 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__global__ void initfftdata_kernel(double* in, FFT_FLOAT* out)
+{
+  out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
+  out[2 * (((blockIdx.x * gridDim.y + blockIdx.y)*blockDim.x) + threadIdx.x) + 1] = 0;
+}
+
+
+__global__ void permute_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
+{
+  out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x];
+}
+
+__global__ void permute_scale_kernel(FFT_FLOAT* in, FFT_FLOAT* out)
+{
+  out[2 * (((threadIdx.x / 2)*gridDim.x + blockIdx.x)*gridDim.y + blockIdx.y) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x] * gridDim.x * gridDim.y * blockDim.x * 0.5;
+}
+
+__global__ void permute_part_kernel(FFT_FLOAT* in, FFT_FLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
+{
+  {
+    out[2 * ((threadIdx.x / 2) * (ihi - ilo + 1) * (jhi - jlo + 1) + (blockIdx.x) * (jhi - jlo + 1) + blockIdx.y - jlo) + threadIdx.x - 2 * (threadIdx.x / 2)] = in[2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo];
+  }
+}
diff --git a/lib/cuda/fix_addforce_cuda.cu b/lib/cuda/fix_addforce_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c1ecefba45f59d252ea60374a3cc622d3332e354
--- /dev/null
+++ b/lib/cuda/fix_addforce_cuda.cu
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_add_force_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "fix_addforce_cuda_cu.h"
+#include "fix_addforce_cuda_kernel.cu"
+
+void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixAddForceCuda: resizing buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+}
+
+void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixAddForceCuda_UpdateNmax(sdata);
+}
+
+void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixAddForceCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixAddForceCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_FLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");
+
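+  // second pass: reduce the per-block partial sums in the buffer down to the
+  // four foriginal components (energy, fx, fy, fz), one block per component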
+  int oldgrid = grid.x;
+  grid.x = 4;
+  threads.x = 512;
+  reduce_foriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_addforce_cuda_cu.h b/lib/cuda/fix_addforce_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bf59300c928d0384709f89f257f0d15e00d2c1c
--- /dev/null
+++ b/lib/cuda/fix_addforce_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue, F_FLOAT* aforiginal);
diff --git a/lib/cuda/fix_addforce_cuda_kernel.cu b/lib/cuda/fix_addforce_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e0265f3797570fe361c4b3ed87607f00f1a0eff8
--- /dev/null
+++ b/lib/cuda/fix_addforce_cuda_kernel.cu
@@ -0,0 +1,90 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ F_FLOAT sharedmem[];
+
+
+__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
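+  // Four per-thread accumulators in dynamic shared memory: the potential
+  // energy term -(xvalue*x + yvalue*y + zvalue*z) of the added force, plus
+  // the original fx, fy, fz. Zeroed so atoms outside the group add nothing.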
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit)
+      //if (iregion >= 0 &&
+      //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
+    {
+      sharedmem[threadIdx.x] = -xvalue * _x[i] - yvalue * _x[i + 1 * _nmax] - zvalue * _x[i + 2 * _nmax];
+      sharedmem[threadIdx.x + blockDim.x] = _f[i];
+      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 1 * _nmax];
+      sharedmem[threadIdx.x + 3 * blockDim.x] = _f[i + 2 * _nmax];
+      _f[i] += xvalue;
+      _f[i + 1 * _nmax] += yvalue;
+      _f[i + 2 * _nmax] += zvalue;
+    }
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  reduceBlock(&sharedmem[3 * blockDim.x]);
+  F_FLOAT* buffer = (F_FLOAT*) _buffer;
+
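+  // Thread 0 writes this block's four partial sums; component c of block b
+  // lands at buffer[b + c * gridDim.x * gridDim.y].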
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
+  }
+
+}
+
+
+__global__ void reduce_foriginal(int n, F_FLOAT* foriginal)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  F_FLOAT myforig = 0.0;
+  F_FLOAT* buf = (F_FLOAT*) _buffer;
+  buf = &buf[blockIdx.x * n];
+
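+  // Each of the four blocks reduces one component across the n per-block
+  // partial sums, consuming them in chunks of blockDim.x.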
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    foriginal[blockIdx.x] = myforig;
+}
diff --git a/lib/cuda/fix_aveforce_cuda.cu b/lib/cuda/fix_aveforce_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..600f1d95e08e3856c34c1578604a065e1e82e455
--- /dev/null
+++ b/lib/cuda/fix_aveforce_cuda.cu
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_ave_force_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "fix_aveforce_cuda_cu.h"
+#include "fix_aveforce_cuda_kernel.cu"
+
+void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+}
+
+void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixAveForceCuda_UpdateNmax(sdata);
+}
+
+void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixAveForceCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixAveForceCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+
+  Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x * 4 * sizeof(F_FLOAT)>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");
+
+  int oldgrid = grid.x;
+  grid.x = 4;
+  threads.x = 512;
+  Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x * sizeof(F_FLOAT)>>> (oldgrid, aforiginal);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
+
+}
+
+void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue)
+{
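+  /* Unlike PostForce_FOrg, this entry point skips the update_nmax /
+     update_nlocal / buffer checks; presumably it is only ever called right
+     after PostForce_FOrg in the same timestep, when the device symbols are
+     already current. */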
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+
+  Cuda_FixAveForceCuda_PostForce_Set_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, axvalue, ayvalue, azvalue);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_aveforce_cuda_cu.h b/lib/cuda/fix_aveforce_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d58a472e0c184638ffabc8c052d75aebc212083
--- /dev/null
+++ b/lib/cuda/fix_aveforce_cuda_cu.h
@@ -0,0 +1,28 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_FLOAT* aforiginal);
+extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_FLOAT axvalue, F_FLOAT ayvalue, F_FLOAT azvalue);
diff --git a/lib/cuda/fix_aveforce_cuda_kernel.cu b/lib/cuda/fix_aveforce_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..37d80d92e8616b043c56343afe165b6a3d682ff6
--- /dev/null
+++ b/lib/cuda/fix_aveforce_cuda_kernel.cu
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ F_FLOAT sharedmem[];
+
+
+__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
+
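+  // Sum the current force components plus a count of atoms in the group,
+  // so the caller can divide the reduced force by the count to obtain the
+  // average that PostForce_Set then imposes.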
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      sharedmem[threadIdx.x] = _f[i];
+      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
+      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
+      sharedmem[threadIdx.x + 3 * blockDim.x] = 1;
+    }
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  reduceBlock(&sharedmem[3 * blockDim.x]);
+  F_FLOAT* buffer = (F_FLOAT*) _buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
+  }
+}
+
+
+__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_FLOAT* foriginal)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  F_FLOAT myforig = 0.0;
+  F_FLOAT* buf = (F_FLOAT*) _buffer;
+  buf = &buf[blockIdx.x * n];
+
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    foriginal[blockIdx.x] = myforig;
+}
+
+__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      if(xflag) _f[i] = xvalue;
+
+      if(yflag) _f[i + 1 * _nmax] = yvalue;
+
+      if(zflag) _f[i + 2 * _nmax] = zvalue;
+    }
+}
diff --git a/lib/cuda/fix_enforce2d_cuda.cu b/lib/cuda/fix_enforce2d_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cc48ed070d3ca07648ce3749c779aeb7aa02ecc4
--- /dev/null
+++ b/lib/cuda/fix_enforce2d_cuda.cu
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_enforce2d_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_enforce2d_cuda_cu.h"
+#include "fix_enforce2d_cuda_kernel.cu"
+
+void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+}
+
+void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixEnforce2dCuda_Init(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  Cuda_FixEnforce2dCuda_PostForce_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
+}
diff --git a/lib/cuda/fix_enforce2d_cuda_cu.h b/lib/cuda/fix_enforce2d_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..7a1cfadef402e7d10dcf3bc5bf2a0cb9feaafb48
--- /dev/null
+++ b/lib/cuda/fix_enforce2d_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);
diff --git a/lib/cuda/fix_enforce2d_cuda_kernel.cu b/lib/cuda/fix_enforce2d_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5950df2e54e96ed9151cb41ad93d46a5016bdf04
--- /dev/null
+++ b/lib/cuda/fix_enforce2d_cuda_kernel.cu
@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
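+  // Constrain the dynamics to the xy plane by zeroing the z components of
+  // velocity and force.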
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      _v[i + 2 * _nmax] = V_F(0.0);
+      _f[i + 2 * _nmax] = F_F(0.0);
+    }
+}
diff --git a/lib/cuda/fix_freeze_cuda.cu b/lib/cuda/fix_freeze_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..613c76bbde002a02ca2d50d589f782c21ce51f5e
--- /dev/null
+++ b/lib/cuda/fix_freeze_cuda.cu
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_freeze_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_freeze_cuda_cu.h"
+#include "fix_freeze_cuda_kernel.cu"
+
+void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(torque)  , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*));
+}
+
+
+void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixFreezeCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixFreezeCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixFreezeCuda_UpdateBuffer(sdata);
+
+
+  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x * 3 * sizeof(F_FLOAT)>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force compute Kernel execution failed");
+
+  int oldgrid = grid.x;
+  grid.x = 3;
+  threads.x = 512;
+  Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x * sizeof(F_FLOAT)>>> (oldgrid, foriginal);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force reduce Kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_freeze_cuda_cu.h b/lib/cuda/fix_freeze_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8b21a9558acf78324f676f6eee4889206740aa2
--- /dev/null
+++ b/lib/cuda/fix_freeze_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT* foriginal);
diff --git a/lib/cuda/fix_freeze_cuda_kernel.cu b/lib/cuda/fix_freeze_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5f5057c87d82d5c32ecf7630d62245bfa7c0e5c6
--- /dev/null
+++ b/lib/cuda/fix_freeze_cuda_kernel.cu
@@ -0,0 +1,87 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ F_FLOAT sharedmem[];
+
+
+__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+
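+  // Record the force acting on frozen atoms (reduced below for the fix's
+  // output), then zero both force and torque so they stay immobile.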
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      sharedmem[threadIdx.x] = _f[i];
+      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
+      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
+
+      _f[i] = F_F(0.0);
+      _f[i + 1 * _nmax] = F_F(0.0);
+      _f[i + 2 * _nmax] = F_F(0.0);
+      _torque[i] = F_F(0.0);
+      _torque[i + 1 * _nmax] = F_F(0.0);
+      _torque[i + 2 * _nmax] = F_F(0.0);
+    }
+
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  F_FLOAT* buffer = (F_FLOAT*)_buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+  }
+}
+
+
+__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  F_FLOAT myforig = 0.0;
+  F_FLOAT* buf = (F_FLOAT*)_buffer;
+  buf = &buf[blockIdx.x * n];
+
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    foriginal[blockIdx.x] = myforig;
+}
+
diff --git a/lib/cuda/fix_gravity_cuda.cu b/lib/cuda/fix_gravity_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0fc7051b86556e280ea60be491651be76cb9821e
--- /dev/null
+++ b/lib/cuda/fix_gravity_cuda.cu
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_gravity_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_gravity_cuda_cu.h"
+#include "fix_gravity_cuda_kernel.cu"
+
+void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type    .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(rmass_flag)       , & sdata->atom.rmass_flag, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass)       , & sdata->atom.rmass    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mass)       , & sdata->atom.mass    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixGravityCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixGravityCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixGravityCuda_UpdateBuffer(sdata);
+
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixGravityCuda_PostForce_Kernel <<< grid, threads>>> (groupbit, xacc, yacc, zacc);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed");
+}
diff --git a/lib/cuda/fix_gravity_cuda_cu.h b/lib/cuda/fix_gravity_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..014b71f0114131682c92314c2a285191a915fdb3
--- /dev/null
+++ b/lib/cuda/fix_gravity_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc);
diff --git a/lib/cuda/fix_gravity_cuda_kernel.cu b/lib/cuda/fix_gravity_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba58d39bc863e90192c56f815cd6414ee7ea68e5
--- /dev/null
+++ b/lib/cuda/fix_gravity_cuda_kernel.cu
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_FLOAT xacc, F_FLOAT yacc, F_FLOAT zacc)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
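+  // Use the per-atom mass if rmass is allocated (e.g. granular styles),
+  // otherwise the per-type mass, and apply f += m * acc per component.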
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      F_FLOAT mass = _rmass_flag ? _rmass[i] : _mass[_type[i]];
+      _f[i] += mass * xacc;
+      _f[i + 1 * _nmax] += mass * yacc;
+      _f[i + 2 * _nmax] += mass * zacc;
+    }
+}
+
diff --git a/lib/cuda/fix_nh_cuda.cu b/lib/cuda/fix_nh_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1a9d84061f05dc0aaa0fe32e8f1d0cf03af4be66
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda.cu
@@ -0,0 +1,255 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_nh_cuda
+#define IncludeCommonNeigh
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_nh_cuda_cu.h"
+#include "fix_nh_cuda_kernel.cu"
+
+void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(debugdata)     , & sdata->debugdata, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(xhold)   , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(maxhold)   , & sdata->atom.maxhold, sizeof(int));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
+}
+
+void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
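+  // Here the buffer only holds the reneighboring flag; 10 ints is a small
+  // fixed size with some headroom (presumably for future flags).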
+  int size = (unsigned)10 * sizeof(int);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*));  //might be moved to a neighbor record in sdata
+}
+
+void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
+{
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(dtf)     , & dtf                            , sizeof(V_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(dtv)     , & dtv                            , sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check       , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag       , sizeof(int));
+  Cuda_FixNHCuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
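+  // Host-side timing of the symbol updates below; accumulated in
+  // sdata->cuda_timings.test1.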
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
+  F_FLOAT3 factor2 = {F_F(0.0), F_F(0.0), F_F(0.0)}; // zero-init so no indeterminate struct is passed when p_triclinic == 0
+
+  if(p_triclinic) {
+    factor2.x = factor_h[3];
+    factor2.y = factor_h[4];
+    factor2.z = factor_h[5];
+  }
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  FixNHCuda_nh_v_press_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
+
+}
+
+void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
+  F_FLOAT3 factor2 = {F_F(0.0), F_F(0.0), F_F(0.0)}; // zero-init so no indeterminate struct is passed when p_triclinic == 0
+
+  if(p_triclinic) {
+    factor2.x = factor_h[3];
+    factor2.y = factor_h[4];
+    factor2.z = factor_h[5];
+  }
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
+  FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
+
+}
+
+void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  FixNHCuda_nh_v_temp_Kernel <<< grid, threads>>> (groupbit, factor_eta);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
+
+}
+
+void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) // mynlocal can be nfirst if firstgroup == igroup; see the cpp file
+{
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  FixNHCuda_nve_v_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
+}
+
+
+void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
+  timespec atime1, atime2;
+  clock_gettime(CLOCK_REALTIME, &atime1);
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME, &atime2);
+  sdata->cuda_timings.test1 +=
+    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
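+  // The first int of the buffer acts as the reneighboring flag: clear it,
+  // let the kernel raise it via check_distance(), then read it back and
+  // accumulate on the host.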
+  cudaMemset(sdata->buffer, 0, sizeof(int));
+  FixNHCuda_nve_x_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  int reneigh_flag;
+  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
+  sdata->atom.reneigh_flag += reneigh_flag;
+  CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
+}
+
+void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixNHCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixNHCuda_UpdateBuffer(sdata);
+
+  F_FLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
+  F_FLOAT3 factor2 = {F_F(0.0), F_F(0.0), F_F(0.0)}; // zero-init so no indeterminate struct is passed when p_triclinic == 0
+
+  if(p_triclinic) {
+    factor2.x = factor_h[3];
+    factor2.y = factor_h[4];
+    factor2.z = factor_h[5];
+  }
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias:   Kernel execution failed");
+}
+
diff --git a/lib/cuda/fix_nh_cuda_cu.h b/lib/cuda/fix_nh_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba6203cfd077abb65602230f1287a022e187ae4e
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_cu.h
@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
+extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup  see cpp
+extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup  see cpp
+extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup  see cpp
+extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup  see cpp
+extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup  see cpp
+extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup  see cpp
diff --git a/lib/cuda/fix_nh_cuda_kernel.cu b/lib/cuda/fix_nh_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8e14fa7d8747226b62b2d2a43633da61f25bc5b7
--- /dev/null
+++ b/lib/cuda/fix_nh_cuda_kernel.cu
@@ -0,0 +1,205 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
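+/* Compares each atom's squared displacement since the last neighbor-list
+   build against _triggerneighsq (presumably the squared skin trigger
+   distance). __all() is a warp vote, so every thread of a warp containing
+   at least one triggering atom writes the flag; the writes are redundant
+   but harmless. */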
+static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
+{
+  if(_dist_check) {
+
+    X_FLOAT d = X_F(0.0);
+
+    if(i < _nlocal) {
+      X_FLOAT tmp = xtmp - _xhold[i];
+      d = tmp * tmp;
+      tmp = ytmp - _xhold[i + _maxhold];
+      d += tmp * tmp;
+      tmp = ztmp - _xhold[i + 2 * _maxhold];
+      d += tmp * tmp;
+
+      d = ((_mask[i] & groupbit)) ? d : X_F(0.0);
+    }
+
+    if(not __all(d <= _triggerneighsq))
+      _reneigh_flag[0] = 1;
+  }
+}
+
+__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    V_FLOAT* my_v = _v + i;
+    V_FLOAT vx = my_v[0];
+    V_FLOAT vy = my_v[_nmax];
+    V_FLOAT vz = my_v[2 * _nmax];
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+
+    if(p_triclinic) {
+      vx += vy * factor2.z + vz * factor2.y;
+      vy += vz * factor2.x;
+    }
+
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+    my_v[0]       = vx;
+    my_v[_nmax]   = vy;
+    my_v[2 * _nmax] = vz;
+  }
+
+}
+
+__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    V_FLOAT* my_v = _v + i;
+    my_v[0] *= factor_eta;
+    my_v[_nmax] *= factor_eta;
+    my_v[2 * _nmax] *= factor_eta;
+  }
+
+}
+
+__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    V_FLOAT vx = my_v[0];
+    V_FLOAT vy = my_v[_nmax];
+    V_FLOAT vz = my_v[2 * _nmax];
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+
+    if(p_triclinic) {
+      vx += vy * factor2.z + vz * factor2.y;
+      vy += vz * factor2.x;
+    }
+
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+    my_v[0]       = vx + dtfm * my_f[0];
+    my_v[_nmax]   = vy + dtfm * my_f[_nmax];
+    my_v[2 * _nmax] = vz + dtfm * my_f[_nmax * 2];
+  }
+
+}
+
+__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
+{
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    *my_v = (*my_v + dtfm * (*my_f));
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v = (*my_v + dtfm * (*my_f));
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v = (*my_v + dtfm * (*my_f));
+  }
+}
+
+__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
+{
+  X_FLOAT xtmp = X_F(0.0), ytmp = X_F(0.0), ztmp = X_F(0.0); // initialized so check_distance sees defined values for atoms outside the group
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    V_FLOAT* my_v = _v + i;
+    X_FLOAT* my_x = _x + i;
+
+    xtmp = *my_x += _dtv * *my_v;
+    my_v += _nmax;
+    my_x += _nmax;
+    ytmp = *my_x += _dtv * *my_v;
+    my_v += _nmax;
+    my_x += _nmax;
+    ztmp = *my_x += _dtv * *my_v;
+  }
+
+  check_distance(xtmp, ytmp, ztmp, i, groupbit);
+}
+
+
+__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor, int p_triclinic, F_FLOAT3 factor2)
+{
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    V_FLOAT vx = my_v[0] + dtfm * my_f[0];
+    V_FLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
+    V_FLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
+
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+
+    if(p_triclinic) {
+      vx += vy * factor2.z + vz * factor2.y;
+      vy += vz * factor2.x;
+    }
+
+    vx *= factor.x;
+    vy *= factor.y;
+    vz *= factor.z;
+    my_v[0]       = vx;
+    my_v[_nmax]   = vy;
+    my_v[2 * _nmax] = vz;
+
+  }
+}
+
diff --git a/lib/cuda/fix_nve_cuda.cu b/lib/cuda/fix_nve_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8acddcd6f189af2df642b44676919ebeefce51a0
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda.cu
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_nve_cuda
+#define IncludeCommonNeigh
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_nve_cuda_cu.h"
+#include "fix_nve_cuda_kernel.cu"
+
+void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(xhold)   , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(maxhold)   , & sdata->atom.maxhold, sizeof(int));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*));  //might be moved to a neighbor record in sdata
+  cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
+}
+
+void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int size = (unsigned)10 * sizeof(int);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*));  //might be moved to a neighbor record in sdata
+}
+
+void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
+{
+  cudaMemcpyToSymbol(MY_AP(mass)    , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(dtf)     , & dtf                            , sizeof(V_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(dtv)     , & dtv                            , sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check       , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag       , sizeof(int));
+  Cuda_FixNVECuda_UpdateNmax(sdata);
+}
+
+
+void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup  see cpp
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixNVECuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixNVECuda_UpdateBuffer(sdata);
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  cudaMemset(sdata->buffer, 0, sizeof(int));
+  FixNVECuda_InitialIntegrate_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  int reneigh_flag;
+  cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
+  sdata->atom.reneigh_flag += reneigh_flag;
+  CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
+
+}
+
+void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal) // mynlocal can be nfirst if firstgroup == igroup; see the .cpp file
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixNVECuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixNVECuda_UpdateBuffer(sdata);
+
+#ifdef CUDA_USE_BINNING
+
+  dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
+  dim3 threads(sdata->domain.bin_nmax, 1, 1);
+  FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
+
+#else
+
+  int3 layout = getgrid(mynlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
+
+#endif
+}
+
diff --git a/lib/cuda/fix_nve_cuda_cu.h b/lib/cuda/fix_nve_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..90b393c9ec48086b27d80bb89e256b3262faa396
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_cu.h
@@ -0,0 +1,28 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
+extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
+extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
diff --git a/lib/cuda/fix_nve_cuda_kernel.cu b/lib/cuda/fix_nve_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c99439adb4b5605213affedfbeb153e99e0c6682
--- /dev/null
+++ b/lib/cuda/fix_nve_cuda_kernel.cu
@@ -0,0 +1,166 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
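+// Compares each atom's displacement since the last reneighboring against the
+// squared skin trigger. __all() is a warp vote: the whole warp raises the
+// reneighbor flag unless every lane is still inside the trigger distance.
+// Lanes that are out of range (i >= nlocal) or outside the group vote with
+// d = 0, so they never force a reneighboring.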
+static inline __device__ void check_distance(X_FLOAT &xtmp, X_FLOAT &ytmp, X_FLOAT &ztmp, int &i, int groupbit)
+{
+  if(_dist_check) {
+    X_FLOAT tmp = xtmp - _xhold[i];
+    X_FLOAT d = tmp * tmp;
+    tmp = ytmp - _xhold[i + _maxhold];
+    d += tmp * tmp;
+    tmp = ztmp - _xhold[i + 2 * _maxhold];
+    d += tmp * tmp;
+
+    d = ((i < _nlocal) && (_mask[i] & groupbit)) ? d : X_F(0.0);
+
+    if(not __all(d <= _triggerneighsq))
+      _reneigh_flag[0] = 1;
+  }
+}
+
+
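+// First half step of velocity Verlet:
+//   v(t + dt/2) = v(t) + (dtf/m) * f(t)
+//   x(t + dt)   = x(t) + dtv * v(t + dt/2)
+// Arrays are laid out structure-of-arrays: the y and z components of a
+// per-atom vector live at offsets _nmax and 2*_nmax from its x component.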
+__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
+{
+  X_FLOAT xtmp, ytmp, ztmp;
+#ifdef CUDA_USE_BINNING
+
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(threadIdx.x < _bin_count_local[bin]) {
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+
+    if(_mask[i] & groupbit) {
+      F_FLOAT* my_f = _binned_f + i;
+      V_FLOAT* my_v = _binned_v + i;
+      X_FLOAT* my_x = _binned_x + i;
+
+      V_FLOAT dtfm = _dtf;
+
+      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
+      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
+
+      V_FLOAT v_mem;
+      v_mem = *my_v += dtfm * (*my_f);
+      xtmp = *my_x += _dtv * v_mem;
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      my_x += blockDim.x;
+      v_mem = *my_v += dtfm * (*my_f);
+      ytmp = *my_x += _dtv * v_mem;
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      my_x += blockDim.x;
+      v_mem = *my_v += dtfm * (*my_f);
+      ztmp = *my_x += _dtv * v_mem;
+    }
+  }
+
+#else
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+    X_FLOAT* my_x = _x + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else            dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    V_FLOAT v_mem;
+    v_mem = *my_v += dtfm * (*my_f);
+    xtmp = *my_x += _dtv * v_mem;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    v_mem = *my_v += dtfm * (*my_f);
+    ytmp = *my_x += _dtv * v_mem;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    v_mem = *my_v += dtfm * (*my_f);
+    ztmp = *my_x += _dtv * v_mem;
+  }
+
+#endif
+
+  check_distance(xtmp, ytmp, ztmp, i, groupbit);
+}
+
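+// Second half step of velocity Verlet, using the newly computed forces:
+//   v(t + dt) = v(t + dt/2) + (dtf/m) * f(t + dt)
+// Positions are left untouched.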
+__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
+{
+#ifdef CUDA_USE_BINNING
+
+  const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
+
+  if(threadIdx.x < _bin_count_local[bin]) {
+    const int i = 3 * blockDim.x * bin + threadIdx.x;
+
+    if(_mask[i] & groupbit) {
+      F_FLOAT* my_f = _binned_f + i;
+      V_FLOAT* my_v = _binned_v + i;
+
+      V_FLOAT dtfm = _dtf;
+
+      if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
+      else            dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
+
+      *my_v += dtfm * (*my_f);
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      *my_v += dtfm * (*my_f);
+      my_f += blockDim.x;
+      my_v += blockDim.x;
+      *my_v += dtfm * (*my_f);
+    }
+  }
+
+#else
+
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal && _mask[i] & groupbit) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+
+    V_FLOAT dtfm = _dtf;
+
+    if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
+    else            dtfm *= V_F(1.0) / _mass[_type[i]];
+
+    *my_v += dtfm * (*my_f);
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v += dtfm * (*my_f);
+    my_f += _nmax;
+    my_v += _nmax;
+    *my_v += dtfm * (*my_f);
+  }
+
+#endif
+}
+
+
+
diff --git a/lib/cuda/fix_set_force_cuda.cu b/lib/cuda/fix_set_force_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..afa1a4789cd5bb311ec458facbd54f13aa9f7466
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda.cu
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_set_force_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_set_force_cuda_cu.h"
+#include "fix_set_force_cuda_kernel.cu"
+
+void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
+{
+  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+}
+
+void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixSetForceCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixSetForceCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixSetForceCuda_UpdateBuffer(sdata);
+
+
+  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_FLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
+
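+  // Second pass: one block per force component sums the per-block partial
+  // sums that the kernel above left in the buffer.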
+  int oldgrid = grid.x * grid.y;  // total number of per-block partial sums per component
+  grid.x = 3;
+  threads.x = 512;
+  Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_FLOAT)>>> (oldgrid, foriginal);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_set_force_cuda_cu.h b/lib/cuda/fix_set_force_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..63e528acce1aa50c8601863115bcad21febc4bf3
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, F_FLOAT* foriginal, int flagx, int flagy, int flagz);
diff --git a/lib/cuda/fix_set_force_cuda_kernel.cu b/lib/cuda/fix_set_force_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee1a59062299831d49ff722b9256658186216680
--- /dev/null
+++ b/lib/cuda/fix_set_force_cuda_kernel.cu
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+extern __shared__ F_FLOAT sharedmem[];
+
+
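+// Two-pass reduction of the original forces: this kernel saves each atom's
+// force into shared memory (three planes for x, y, z), overwrites the flagged
+// components with the prescribed values, and reduces each plane to one
+// partial sum per block, stored in the global buffer. A second kernel then
+// sums the per-block partials.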
+__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_FLOAT xvalue, F_FLOAT yvalue, F_FLOAT zvalue, int flagx, int flagy, int flagz)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  sharedmem[threadIdx.x] = 0;
+  sharedmem[threadIdx.x + blockDim.x] = 0;
+  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      sharedmem[threadIdx.x] = _f[i];
+      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
+      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
+
+      if(flagx) _f[i] = xvalue;
+
+      if(flagy) _f[i + 1 * _nmax] = yvalue;
+
+      if(flagz) _f[i + 2 * _nmax] = zvalue;
+    }
+
+
+  reduceBlock(sharedmem);
+  reduceBlock(&sharedmem[blockDim.x]);
+  reduceBlock(&sharedmem[2 * blockDim.x]);
+  F_FLOAT* buffer = (F_FLOAT*)_buffer;
+
+  if(threadIdx.x == 0) {
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
+  }
+}
+
+
+__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_FLOAT* foriginal)
+{
+  int i = 0;
+  sharedmem[threadIdx.x] = 0;
+  F_FLOAT myforig = 0.0;
+  F_FLOAT* buf = (F_FLOAT*)_buffer;
+  buf = &buf[blockIdx.x * n];
+
+  while(i < n) {
+    sharedmem[threadIdx.x] = 0;
+
+    if(i + threadIdx.x < n)
+      sharedmem[threadIdx.x] = buf[i + threadIdx.x];
+
+    __syncthreads();
+    reduceBlock(sharedmem);
+    i += blockDim.x;
+
+    if(threadIdx.x == 0)
+      myforig += sharedmem[0];
+  }
+
+  if(threadIdx.x == 0)
+    foriginal[blockIdx.x] = myforig;
+}
+
diff --git a/lib/cuda/fix_shake_cuda.cu b/lib/cuda/fix_shake_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e27f54e9686517f01cc6b73c691ece40dc41169c
--- /dev/null
+++ b/lib/cuda/fix_shake_cuda.cu
@@ -0,0 +1,297 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_shake_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "fix_shake_cuda_cu.h"
+#include "cuda_pair_virial_kernel_nc.cu"
+
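+// Module-local device constants. MY_AP() prefixes each symbol with MY_PREFIX
+// (fix_shake_cuda here), so every .cu module gets its own copy of these names
+// without clashing at link time.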
+#define _shake_atom           MY_AP(shake_atom)
+#define _shake_type           MY_AP(shake_type)
+#define _shake_flag           MY_AP(shake_flag)
+#define _xshake               MY_AP(xshake)
+#define _dtfsq                MY_AP(dtfsq)
+#define _bond_distance        MY_AP(bond_distance)
+#define _angle_distance       MY_AP(angle_distance)
+#define _max_iter			  MY_AP(max_iter)
+#define _tolerance			  MY_AP(tolerance)
+__device__ __constant__ int* _shake_atom;
+__device__ __constant__ int* _shake_type;
+__device__ __constant__ int* _shake_flag;
+__device__ __constant__ X_FLOAT3* _xshake;
+__device__ __constant__ F_FLOAT _dtfsq;
+__device__ __constant__ X_FLOAT* _bond_distance;
+__device__ __constant__ X_FLOAT* _angle_distance;
+__device__ __constant__ int _max_iter;
+__device__ __constant__ X_FLOAT _tolerance;
+
+#include "fix_shake_cuda_kernel.cu"
+
+void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rmass)   , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(vatom)   , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata         , sizeof(int*));
+}
+
+void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity		, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(prd)		, sdata->domain.prd				, sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(triclinic)  , &sdata->domain.triclinic		, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(h)			, sdata->domain.h				, sizeof(X_FLOAT) * 6);
+}
+
+void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
+{
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
+}
+
+void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
+                            void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
+                            void* bond_distance, void* angle_distance, void* virial,
+                            int max_iter, X_FLOAT tolerance)
+{
+  Cuda_FixShakeCuda_UpdateNmax(sdata);
+  Cuda_FixShakeCuda_UpdateDomain(sdata);
+  cudaMemcpyToSymbol(MY_AP(shake_atom)        , & shake_atom 	  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(shake_type)        , & shake_type 	  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(shake_flag)        , & shake_flag 	  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(xshake)            , & xshake     	  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(dtv)               , & dtv        	  , sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(dtfsq)             , & dtfsq      	  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(bond_distance)     , & bond_distance  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(angle_distance)    , & angle_distance , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(virial)     	   , & virial  		  , sizeof(void*));
+  cudaMemcpyToSymbol(MY_AP(flag)  			   , &sdata->flag	  , sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(max_iter)  		   , &max_iter  	  , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(tolerance)  	   , &tolerance  	  , sizeof(X_FLOAT));
+
+  if(sdata->atom.mass_host)
+    cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_FLOAT*));
+
+  cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag       , sizeof(int));
+
+}
+
+void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixShakeCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_FixShakeCuda_UpdateBuffer(sdata, 10 * sizeof(double));
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  FixShakeCuda_UnconstrainedUpdate_Kernel <<< grid, threads>>> ();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
+}
+
+void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixShakeCuda_UpdateNmax(sdata);
+
+  if(sdata->domain.update)
+    Cuda_FixShakeCuda_UpdateDomain(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_FLOAT), 64);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->buffer_new)
+    Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_FLOAT));
+
+  BindXTypeTexture(sdata);
+
+  FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, list, nlist);
+  cudaThreadSynchronize();
+
+  CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
+
+  if(vflag) {
+    int n = grid.x * grid.y;
+    grid.x = 6;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
+  }
+
+}
+
+int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixShakeCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemset(sdata->flag, 0, sizeof(int));
+    FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
+    cudaThreadSynchronize();
+    cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+    int aflag;
+    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+
+    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
+    CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
+
+  }
+
+  return 3 * n;
+}
+
+int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixShakeCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
+
+  X_FLOAT dx = 0.0;
+  X_FLOAT dy = 0.0;
+  X_FLOAT dz = 0.0;
+
+  if(pbc_flag != 0) {
+    if(sdata->domain.triclinic == 0) {
+      dx = pbc[0] * sdata->domain.prd[0];
+      dy = pbc[1] * sdata->domain.prd[1];
+      dz = pbc[2] * sdata->domain.prd[2];
+    } else {
+      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
+      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
+      dz = pbc[2] * sdata->domain.prd[2];
+    }
+  }
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    FixShakeCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+  }
+
+  return 3 * n;
+}
+
+void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixShakeCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int size = n * 3 * sizeof(X_FLOAT);
+
+  if(sdata->buffer_new or (size > sdata->buffersize))
+    Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal > 0) {
+    cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+    FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
+
+  }
+}
diff --git a/lib/cuda/fix_shake_cuda_cu.h b/lib/cuda/fix_shake_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..9b808a7216e6d96bc3635bc24d86bcf2a69c9819
--- /dev/null
+++ b/lib/cuda/fix_shake_cuda_cu.h
@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, F_FLOAT dtfsq,
+                                       void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
+                                       void* bond_distance, void* angle_distance, void* virial,
+                                       int max_iter, X_FLOAT tolerance);
+extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
+extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
+extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
+extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
+extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv);
+
diff --git a/lib/cuda/fix_shake_cuda_kernel.cu b/lib/cuda/fix_shake_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..da176d0770af48241cafe074d147b0f49fd75479
--- /dev/null
+++ b/lib/cuda/fix_shake_cuda_kernel.cu
@@ -0,0 +1,1060 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
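+// Adds this cluster's virial contribution v[0..5] to the per-atom virial of
+// the n owned atoms in list, giving each atom the fraction 1/total of the
+// cluster virial (total = number of atoms in the shake cluster).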
+__device__ void v_tally(int &vflag_global, int &vflag_atom, int &n, int* list, ENERGY_FLOAT total, ENERGY_FLOAT* v)
+{
+  /*if(vflag_global)
+  {
+    ENERGY_FLOAT fraction = n/total;
+  ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    *shared += fraction*v[0]; shared+=blockDim.x;
+    *shared += fraction*v[1]; shared+=blockDim.x;
+    *shared += fraction*v[2]; shared+=blockDim.x;
+    *shared += fraction*v[3]; shared+=blockDim.x;
+    *shared += fraction*v[4]; shared+=blockDim.x;
+    *shared += fraction*v[5];
+  }*/
+  if(vflag_atom) {
+    ENERGY_FLOAT fraction = ENERGY_F(1.0) / total;
+
+    for(int i = 0; i < n; i++) {
+      int m = list[i];
+      ENERGY_FLOAT* myvatom = &_vatom[m];
+
+      *myvatom += fraction * v[0];
+      myvatom += _nmax;
+      *myvatom += fraction * v[1];
+      myvatom += _nmax;
+      *myvatom += fraction * v[2];
+      myvatom += _nmax;
+      *myvatom += fraction * v[3];
+      myvatom += _nmax;
+      *myvatom += fraction * v[4];
+      myvatom += _nmax;
+      *myvatom += fraction * v[5];
+    }
+  }
+}
+
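+// Nearest-image convention for a distance vector. For orthorhombic boxes each
+// component is wrapped independently by +/- prd; for triclinic boxes the z
+// wrap also shifts y and x by the tilt factors h[3] and h[4], and the y wrap
+// shifts x by h[5].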
+inline __device__ void minimum_image(X_FLOAT3 &delta)
+{
+  if(_triclinic == 0) {
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
+    }
+
+    if(_periodicity[1]) {
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+    }
+
+    if(_periodicity[2]) {
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+    }
+
+  } else {
+    if(_periodicity[2]) {
+      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
+      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
+      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
+                 (delta.z >  X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
+
+    }
+
+    if(_periodicity[1]) {
+      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
+      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
+                 (delta.y >  X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
+
+    }
+
+    if(_periodicity[0]) {
+      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
+                 (delta.x >  X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
+    }
+  }
+}
+
+__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i >= _nlocal) return;
+
+  X_FLOAT3 my_xshake = {X_F(0.0), X_F(0.0), X_F(0.0)};
+
+  if(_shake_flag[i]) {
+    F_FLOAT* my_f = _f + i;
+    V_FLOAT* my_v = _v + i;
+    X_FLOAT* my_x = _x + i;
+
+    V_FLOAT dtfmsq = _dtfsq;
+
+    if(_rmass_flag) dtfmsq *= V_F(1.0) / _rmass[i];
+    else            dtfmsq *= V_F(1.0) / _mass[_type[i]];
+
+    my_xshake.x =  *my_x + _dtv* *my_v + dtfmsq* *my_f;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    my_xshake.y =  *my_x + _dtv* *my_v + dtfmsq* *my_f;
+    my_f += _nmax;
+    my_v += _nmax;
+    my_x += _nmax;
+    my_xshake.z =  *my_x + _dtv* *my_v + dtfmsq* *my_f;
+  }
+
+  _xshake[i] = my_xshake;
+}
+
+
+
+
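+// SHAKE for a two-atom cluster: find lamda such that the corrected update
+//   s01' = s01 + lamda * (1/m0 + 1/m1) * r01
+// restores |s01'| = bond1. Squaring gives the quadratic
+//   a*lamda^2 + b*lamda + c = 0
+// with the coefficients computed below; the root of smaller magnitude is the
+// physical one.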
+__device__ void FixShakeCuda_Shake2(int &vflag, int &vflag_atom, int &m)
+{
+  int nlist, list[2];
+  ENERGY_FLOAT v[6];
+  X_FLOAT invmass0, invmass1;
+
+  // local atom IDs and constraint distances
+
+  int i0 = _map_array[_shake_atom[m]];
+  int i1 = _map_array[_shake_atom[m + _nmax]];
+  X_FLOAT bond1 = _bond_distance[_shake_type[m]];
+
+  // r01 = distance vec between atoms, with PBC
+
+  X_FLOAT3 r01;
+
+  X_FLOAT4 x_i0, x_i1;
+  x_i0 = fetchXType(i0);
+  x_i1 = fetchXType(i1);
+
+  r01.x = x_i0.x - x_i1.x;
+  r01.y = x_i0.y - x_i1.y;
+  r01.z = x_i0.z - x_i1.z;
+  minimum_image(r01);
+
+  // s01 = distance vec after unconstrained update, with PBC
+
+  X_FLOAT3 s01;
+  X_FLOAT3 xs_i0 = _xshake[i0];
+  X_FLOAT3 xs_i1 = _xshake[i1];
+
+  s01.x = xs_i0.x - xs_i1.x;
+  s01.y = xs_i0.y - xs_i1.y;
+  s01.z = xs_i0.z - xs_i1.z;
+  minimum_image(s01);
+
+  // scalar distances between atoms
+
+  X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
+  X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
+
+  // a,b,c = coeffs in quadratic equation for lamda
+
+  if(_rmass_flag) {
+    invmass0 = X_F(1.0) / _rmass[i0];
+    invmass1 = X_F(1.0) / _rmass[i1];
+  } else {
+    invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)];
+    invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
+  }
+
+  X_FLOAT a = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT b = X_F(2.0) * (invmass0 + invmass1) *
+              (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
+  X_FLOAT c = s01sq - bond1 * bond1;
+
+  // error check
+
+  X_FLOAT determ = b * b - X_F(4.0) * a * c;
+
+  if(determ < X_F(0.0)) {
+    _flag[0]++;
+    determ = X_F(0.0);
+  }
+
+  // exact quadratic solution for lamda
+
+  X_FLOAT lamda, lamda1, lamda2;
+  lamda1 = -b + _SQRT_(determ);
+  lamda2 = -lamda1 - X_F(2.0) * b;
+  lamda1 *= X_F(1.0) / (X_F(2.0) * a);
+  lamda2 *= X_F(1.0) / (X_F(2.0) * a);
+
+  lamda = (fabs(lamda1) <= fabs(lamda2)) ? lamda1 : lamda2;
+
+  // update forces if atom is owned by this processor
+
+  lamda *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
+  nlist = 0;
+
+  if(i0 < _nlocal) {
+    _f[i0]         += lamda * r01.x;
+    _f[i0 + _nmax]   += lamda * r01.y;
+    _f[i0 + 2 * _nmax] += lamda * r01.z;
+    list[nlist++] = i0;
+  }
+
+  if(i1 < _nlocal) {
+    _f[i1]         -= lamda * r01.x;
+    _f[i1 + _nmax]   -= lamda * r01.y;
+    _f[i1 + 2 * _nmax] -= lamda * r01.z;
+    list[nlist++] = i1;
+  }
+
+  if(vflag || vflag_atom) {
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    X_FLOAT factor = nlist;
+    v[0] = lamda * r01.x * r01.x;
+    *shared = factor * v[0];
+    shared += blockDim.x; // factor carries a 2.0 because the shared reduction routine, reused from the force computation, applies a factor of 0.5
+    v[1] = lamda * r01.y * r01.y;
+    *shared = factor * v[1];
+    shared += blockDim.x;
+    v[2] = lamda * r01.z * r01.z;
+    *shared = factor * v[2];
+    shared += blockDim.x;
+    v[3] = lamda * r01.x * r01.y;
+    *shared = factor * v[3];
+    shared += blockDim.x;
+    v[4] = lamda * r01.x * r01.z;
+    *shared = factor * v[4];
+    shared += blockDim.x;
+    v[5] = lamda * r01.y * r01.z;
+    *shared = factor * v[5];
+    shared += blockDim.x;
+
+    v_tally(vflag, vflag_atom, nlist, list, 2.0, v);
+  }
+}
+
+
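+// SHAKE for a three-atom cluster with two bond constraints (0-1 and 0-2).
+// The coupled constraints are linearized into a 2x2 system for (lamda01,
+// lamda02); the quadratic terms are folded back in by fixed-point iteration
+// until both multipliers change by less than the tolerance.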
+__device__ void FixShakeCuda_Shake3(int &vflag, int &vflag_atom, int &m)
+{
+  int nlist, list[3];
+  ENERGY_FLOAT v[6];
+  X_FLOAT invmass0, invmass1, invmass2;
+
+  // local atom IDs and constraint distances
+
+  int i0 = _map_array[_shake_atom[m]];
+  int i1 = _map_array[_shake_atom[m + _nmax]];
+  int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
+  X_FLOAT bond1 = _bond_distance[_shake_type[m]];
+  X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
+
+  // r01 = distance vec between atoms, with PBC
+
+  X_FLOAT3 r01, r02;
+
+  X_FLOAT4 x_i0, x_i1, x_i2;
+  x_i0 = fetchXType(i0);
+  x_i1 = fetchXType(i1);
+  x_i2 = fetchXType(i2);
+
+  r01.x = x_i0.x - x_i1.x;
+  r01.y = x_i0.y - x_i1.y;
+  r01.z = x_i0.z - x_i1.z;
+  minimum_image(r01);
+
+  r02.x = x_i0.x - x_i2.x;
+  r02.y = x_i0.y - x_i2.y;
+  r02.z = x_i0.z - x_i2.z;
+  minimum_image(r02);
+
+  // s01 = distance vec after unconstrained update, with PBC
+
+  X_FLOAT3 s01, s02;
+  X_FLOAT3 xs_i0 = _xshake[i0];
+  X_FLOAT3 xs_i1 = _xshake[i1];
+  X_FLOAT3 xs_i2 = _xshake[i2];
+
+  s01.x = xs_i0.x - xs_i1.x;
+  s01.y = xs_i0.y - xs_i1.y;
+  s01.z = xs_i0.z - xs_i1.z;
+  minimum_image(s01);
+
+  s02.x = xs_i0.x - xs_i2.x;
+  s02.y = xs_i0.y - xs_i2.y;
+  s02.z = xs_i0.z - xs_i2.z;
+  minimum_image(s02);
+
+  // scalar distances between atoms
+
+  X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
+  X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
+  X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
+  X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
+
+  // a,b,c = coeffs in quadratic equation for lamda
+
+  if(_rmass_flag) {
+    invmass0 = X_F(1.0) / _rmass[i0];
+    invmass1 = X_F(1.0) / _rmass[i1];
+    invmass2 = X_F(1.0) / _rmass[i2];
+  } else {
+    invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)];
+    invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
+    invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
+  }
+
+  X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
+                (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
+  X_FLOAT a12 = X_F(2.0) * invmass0 *
+                (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
+  X_FLOAT a21 = X_F(2.0) * invmass0 *
+                (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
+  X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
+                (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
+
+  // error check
+
+  X_FLOAT determ = a11 * a22 - a12 * a21;
+
+  if(determ == X_F(0.0)) _flag[0]++;
+
+  X_FLOAT determinv = X_F(1.0) / determ;
+
+  X_FLOAT a11inv = a22 * determinv;
+  X_FLOAT a12inv = -a12 * determinv;
+  X_FLOAT a21inv = -a21 * determinv;
+  X_FLOAT a22inv = a11 * determinv;
+
+  // quadratic correction coeffs
+
+  X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
+
+  X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
+
+  X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
+  X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
+
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, b1, b2, lamda01_new, lamda02_new;
+
+  // Iterate as a whole warp: __any() keeps every lane looping until all
+  // lanes have converged or max_iter is reached. done counts consecutive
+  // converged iterations; the lamdas freeze once done reaches 2.
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 + quad1_0202 * lamda02 * lamda02 +
+            quad1_0102 * lamda01 * lamda02;
+    quad2 = quad2_0101 * lamda01 * lamda01 + quad2_0202 * lamda02 * lamda02 +
+            quad2_0102 * lamda01 * lamda02;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+
+    lamda01_new = a11inv * b1 + a12inv * b2;
+    lamda02_new = a21inv * b1 + a22inv * b2;
+
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+
+
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
+  nlist = 0;
+
+  if(i0 < _nlocal) {
+    _f[i0] += lamda01 * r01.x + lamda02 * r02.x;
+    _f[i0 + _nmax] += lamda01 * r01.y + lamda02 * r02.y;
+    _f[i0 + 2 * _nmax] += lamda01 * r01.z + lamda02 * r02.z;
+    list[nlist++] = i0;
+  }
+
+  if(i1 < _nlocal) {
+    _f[i1] -= lamda01 * r01.x;
+    _f[i1 + _nmax] -= lamda01 * r01.y;
+    _f[i1 + 2 * _nmax] -= lamda01 * r01.z;
+    list[nlist++] = i1;
+  }
+
+  if(i2 < _nlocal) {
+    _f[i2] -= lamda02 * r02.x;
+    _f[i2 + _nmax] -= lamda02 * r02.y;
+    _f[i2 + 2 * _nmax] -= lamda02 * r02.z;
+    list[nlist++] = i2;
+  }
+
+  if(vflag || vflag_atom) {
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
+    v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x;
+    *shared = factor * v[0];
+    shared += blockDim.x; // factor carries a 2.0 because the shared reduction routine, reused from the force computation, applies a factor of 0.5
+    v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y;
+    *shared = factor * v[1];
+    shared += blockDim.x;
+    v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z;
+    *shared = factor * v[2];
+    shared += blockDim.x;
+    v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y;
+    *shared = factor * v[3];
+    shared += blockDim.x;
+    v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z;
+    *shared = factor * v[4];
+    shared += blockDim.x;
+    v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z;
+    *shared = factor * v[5];
+    shared += blockDim.x;
+
+    v_tally(vflag, vflag_atom, nlist, list, 3.0, v);
+  }
+}
+
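+// SHAKE for a four-atom cluster with three bond constraints (0-1, 0-2, 0-3):
+// same scheme as Shake3, with the 3x3 linear system inverted via its
+// adjugate.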
+__device__ void FixShakeCuda_Shake4(int &vflag, int &vflag_atom, int &m)
+{
+  int nlist, list[4];
+  ENERGY_FLOAT v[6];
+  X_FLOAT invmass0, invmass1, invmass2, invmass3;
+
+  // local atom IDs and constraint distances
+
+  int i0 = _map_array[_shake_atom[m]];
+  int i1 = _map_array[_shake_atom[m + _nmax]];
+  int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
+  int i3 = _map_array[_shake_atom[m + 3 * _nmax]];
+  X_FLOAT bond1 = _bond_distance[_shake_type[m]];
+  X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
+  X_FLOAT bond3 = _bond_distance[_shake_type[m + 2 * _nmax]];
+
+  // r01 = distance vec between atoms, with PBC
+
+  X_FLOAT3 r01, r02, r03;
+
+  X_FLOAT4 x_i0, x_i1, x_i2, x_i3;
+  x_i0 = fetchXType(i0);
+  x_i1 = fetchXType(i1);
+  x_i2 = fetchXType(i2);
+  x_i3 = fetchXType(i3);
+
+  r01.x = x_i0.x - x_i1.x;
+  r01.y = x_i0.y - x_i1.y;
+  r01.z = x_i0.z - x_i1.z;
+  minimum_image(r01);
+
+  r02.x = x_i0.x - x_i2.x;
+  r02.y = x_i0.y - x_i2.y;
+  r02.z = x_i0.z - x_i2.z;
+  minimum_image(r02);
+
+  r03.x = x_i0.x - x_i3.x;
+  r03.y = x_i0.y - x_i3.y;
+  r03.z = x_i0.z - x_i3.z;
+  minimum_image(r03);
+
+  // s01 = distance vec after unconstrained update, with PBC
+
+  X_FLOAT3 s01, s02, s03;
+  X_FLOAT3 xs_i0 = _xshake[i0];
+  X_FLOAT3 xs_i1 = _xshake[i1];
+  X_FLOAT3 xs_i2 = _xshake[i2];
+  X_FLOAT3 xs_i3 = _xshake[i3];
+
+  s01.x = xs_i0.x - xs_i1.x;
+  s01.y = xs_i0.y - xs_i1.y;
+  s01.z = xs_i0.z - xs_i1.z;
+  minimum_image(s01);
+
+  s02.x = xs_i0.x - xs_i2.x;
+  s02.y = xs_i0.y - xs_i2.y;
+  s02.z = xs_i0.z - xs_i2.z;
+  minimum_image(s02);
+
+  s03.x = xs_i0.x - xs_i3.x;
+  s03.y = xs_i0.y - xs_i3.y;
+  s03.z = xs_i0.z - xs_i3.z;
+  minimum_image(s03);
+
+  // scalar distances between atoms
+
+  X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
+  X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
+  X_FLOAT r03sq = r03.x * r03.x + r03.y * r03.y + r03.z * r03.z;
+  X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
+  X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
+  X_FLOAT s03sq = s03.x * s03.x + s03.y * s03.y + s03.z * s03.z;
+
+  // a,b,c = coeffs in quadratic equation for lamda
+
+  if(_rmass_flag) {
+    invmass0 = X_F(1.0) / _rmass[i0];
+    invmass1 = X_F(1.0) / _rmass[i1];
+    invmass2 = X_F(1.0) / _rmass[i2];
+    invmass3 = X_F(1.0) / _rmass[i3];
+  } else {
+    invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)];
+    invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
+    invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
+    invmass3 = X_F(1.0) / _mass[static_cast <int>(x_i3.w)];
+  }
+
+  X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
+                (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
+  X_FLOAT a12 = X_F(2.0) * invmass0 *
+                (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
+  X_FLOAT a13 = X_F(2.0) * invmass0 *
+                (s01.x * r03.x + s01.y * r03.y + s01.z * r03.z);
+  X_FLOAT a21 = X_F(2.0) * invmass0 *
+                (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
+  X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
+                (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
+  X_FLOAT a23 = X_F(2.0) * (invmass0) *
+                (s02.x * r03.x + s02.y * r03.y + s02.z * r03.z);
+  X_FLOAT a31 = X_F(2.0) * (invmass0) *
+                (s03.x * r01.x + s03.y * r01.y + s03.z * r01.z);
+  X_FLOAT a32 = X_F(2.0) * (invmass0) *
+                (s03.x * r02.x + s03.y * r02.y + s03.z * r02.z);
+  X_FLOAT a33 = X_F(2.0) * (invmass0 + invmass3) *
+                (s03.x * r03.x + s03.y * r03.y + s03.z * r03.z);
+
+  // error check
+
+  X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
+                   a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
+
+  if(determ == X_F(0.0)) _flag[0]++;
+
+  X_FLOAT determinv = X_F(1.0) / determ;
+
+  X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
+  X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
+  X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
+  X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
+  X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
+  X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
+  X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
+  X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
+  X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
+
+  // quadratic correction coeffs
+
+  X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
+  X_FLOAT r0103 = (r01.x * r03.x + r01.y * r03.y + r01.z * r03.z);
+  X_FLOAT r0203 = (r02.x * r03.x + r02.y * r03.y + r02.z * r03.z);
+
+  X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad1_0303 = invmass0 * invmass0 * r03sq;
+  X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
+  X_FLOAT quad1_0103 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0103;
+  X_FLOAT quad1_0203 = X_F(2.0) * invmass0 * invmass0 * r0203;
+
+  X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
+  X_FLOAT quad2_0303 = invmass0 * invmass0 * r03sq;
+  X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
+  X_FLOAT quad2_0103 = X_F(2.0) * invmass0 * invmass0 * r0103;
+  X_FLOAT quad2_0203 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0203;
+
+  X_FLOAT quad3_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad3_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad3_0303 = (invmass0 + invmass3) * (invmass0 + invmass3) * r03sq;
+  X_FLOAT quad3_0102 = X_F(2.0) * invmass0 * invmass0 * r0102;
+  X_FLOAT quad3_0103 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0103;
+  X_FLOAT quad3_0203 = X_F(2.0) * (invmass0 + invmass3) * invmass0 * r0203;
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  X_FLOAT lamda03 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda03_new;
+
+  // warp-synchronous iteration; see the comment in FixShakeCuda_Shake3
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 +
+            quad1_0202 * lamda02 * lamda02 +
+            quad1_0303 * lamda03 * lamda03 +
+            quad1_0102 * lamda01 * lamda02 +
+            quad1_0103 * lamda01 * lamda03 +
+            quad1_0203 * lamda02 * lamda03;
+
+    quad2 = quad2_0101 * lamda01 * lamda01 +
+            quad2_0202 * lamda02 * lamda02 +
+            quad2_0303 * lamda03 * lamda03 +
+            quad2_0102 * lamda01 * lamda02 +
+            quad2_0103 * lamda01 * lamda03 +
+            quad2_0203 * lamda02 * lamda03;
+
+    quad3 = quad3_0101 * lamda01 * lamda01 +
+            quad3_0202 * lamda02 * lamda02 +
+            quad3_0303 * lamda03 * lamda03 +
+            quad3_0102 * lamda01 * lamda02 +
+            quad3_0103 * lamda01 * lamda03 +
+            quad3_0203 * lamda02 * lamda03;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+    b3 = bond3 * bond3 - s03sq - quad3;
+
+    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
+    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
+    lamda03_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
+
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+    done = (fabs(lamda03_new - lamda03) > _tolerance) ? 0 : done;
+
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    lamda03 = done < 2 ? lamda03_new : lamda03;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+  lamda03 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: is the shake cluster <-> atom mapping unique?
+  nlist = 0;
+
+  if(i0 < _nlocal) {
+    _f[i0] 			+= lamda01 * r01.x + lamda02 * r02.x + lamda03 * r03.x;
+    _f[i0 + _nmax] 	+= lamda01 * r01.y + lamda02 * r02.y + lamda03 * r03.y;
+    _f[i0 + 2 * _nmax] 	+= lamda01 * r01.z + lamda02 * r02.z + lamda03 * r03.z;
+    list[nlist++] = i0;
+  }
+
+  if(i1 < _nlocal) {
+    _f[i1] -= lamda01 * r01.x;
+    _f[i1 + _nmax] -= lamda01 * r01.y;
+    _f[i1 + 2 * _nmax] -= lamda01 * r01.z;
+    list[nlist++] = i1;
+  }
+
+  if(i2 < _nlocal) {
+    _f[i2] -= lamda02 * r02.x;
+    _f[i2 + _nmax] -= lamda02 * r02.y;
+    _f[i2 + 2 * _nmax] -= lamda02 * r02.z;
+    list[nlist++] = i2;
+  }
+
+  if(i3 < _nlocal) {
+    _f[i3] -= lamda03 * r03.x;
+    _f[i3 + _nmax] -= lamda03 * r03.y;
+    _f[i3 + 2 * _nmax] -= lamda03 * r03.z;
+    list[nlist++] = i3;
+  }
+
+  if(vflag || vflag_atom) {
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    X_FLOAT factor = X_F(2.0) / X_F(4.0) * nlist;
+    v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda03 * r03.x * r03.x;
+    *shared = factor * v[0];
+    shared += blockDim.x; // factor carries a 2.0 because the shared reduction routine, reused from the force computation, applies a factor of 0.5
+    v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y + lamda03 * r03.y * r03.y;
+    *shared = factor * v[1];
+    shared += blockDim.x;
+    v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z + lamda03 * r03.z * r03.z;
+    *shared = factor * v[2];
+    shared += blockDim.x;
+    v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y + lamda03 * r03.x * r03.y;
+    *shared = factor * v[3];
+    shared += blockDim.x;
+    v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z + lamda03 * r03.x * r03.z;
+    *shared = factor * v[4];
+    shared += blockDim.x;
+    v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z + lamda03 * r03.y * r03.z;
+    *shared = factor * v[5];
+    shared += blockDim.x;
+
+    v_tally(vflag, vflag_atom, nlist, list, 4.0, v);
+  }
+}
+
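+// SHAKE for a three-atom cluster with an additional angle constraint: the
+// 1-2 angle is enforced as a third pseudo-bond of length angle_distance
+// between atoms 1 and 2.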
+__device__ void FixShakeCuda_Shake3Angle(int &vflag, int &vflag_atom, int &m)
+{
+  int nlist, list[3];
+  ENERGY_FLOAT v[6];
+  X_FLOAT invmass0, invmass1, invmass2;
+
+  // local atom IDs and constraint distances
+
+  int i0 = _map_array[_shake_atom[m]];
+  int i1 = _map_array[_shake_atom[m + _nmax]];
+  int i2 = _map_array[_shake_atom[m + 2 * _nmax]];
+  X_FLOAT bond1 = _bond_distance[_shake_type[m]];
+  X_FLOAT bond2 = _bond_distance[_shake_type[m + _nmax]];
+  X_FLOAT bond12 = _angle_distance[_shake_type[m + 2 * _nmax]];
+
+  // r01 = distance vec between atoms, with PBC
+
+  X_FLOAT3 r01, r02, r12;
+
+  X_FLOAT4 x_i0, x_i1, x_i2;
+  x_i0 = fetchXType(i0);
+  x_i1 = fetchXType(i1);
+  x_i2 = fetchXType(i2);
+
+  r01.x = x_i0.x - x_i1.x;
+  r01.y = x_i0.y - x_i1.y;
+  r01.z = x_i0.z - x_i1.z;
+  minimum_image(r01);
+
+  r02.x = x_i0.x - x_i2.x;
+  r02.y = x_i0.y - x_i2.y;
+  r02.z = x_i0.z - x_i2.z;
+  minimum_image(r02);
+
+  r12.x = x_i1.x - x_i2.x;
+  r12.y = x_i1.y - x_i2.y;
+  r12.z = x_i1.z - x_i2.z;
+  minimum_image(r12);
+
+  // s01 = distance vec after unconstrained update, with PBC
+
+  X_FLOAT3 s01, s02, s12;
+  X_FLOAT3 xs_i0 = _xshake[i0];
+  X_FLOAT3 xs_i1 = _xshake[i1];
+  X_FLOAT3 xs_i2 = _xshake[i2];
+
+  s01.x = xs_i0.x - xs_i1.x;
+  s01.y = xs_i0.y - xs_i1.y;
+  s01.z = xs_i0.z - xs_i1.z;
+  minimum_image(s01);
+
+  s02.x = xs_i0.x - xs_i2.x;
+  s02.y = xs_i0.y - xs_i2.y;
+  s02.z = xs_i0.z - xs_i2.z;
+  minimum_image(s02);
+
+  s12.x = xs_i1.x - xs_i2.x;
+  s12.y = xs_i1.y - xs_i2.y;
+  s12.z = xs_i1.z - xs_i2.z;
+  minimum_image(s12);
+
+  // scalar distances between atoms
+
+  X_FLOAT r01sq = r01.x * r01.x + r01.y * r01.y + r01.z * r01.z;
+  X_FLOAT r02sq = r02.x * r02.x + r02.y * r02.y + r02.z * r02.z;
+  X_FLOAT r12sq = r12.x * r12.x + r12.y * r12.y + r12.z * r12.z;
+  X_FLOAT s01sq = s01.x * s01.x + s01.y * s01.y + s01.z * s01.z;
+  X_FLOAT s02sq = s02.x * s02.x + s02.y * s02.y + s02.z * s02.z;
+  X_FLOAT s12sq = s12.x * s12.x + s12.y * s12.y + s12.z * s12.z;
+
+  // a,b,c = coeffs in quadratic equation for lamda
+
+  if(_rmass_flag) {
+    invmass0 = X_F(1.0) / _rmass[i0];
+    invmass1 = X_F(1.0) / _rmass[i1];
+    invmass2 = X_F(1.0) / _rmass[i2];
+  } else {
+    invmass0 = X_F(1.0) / _mass[static_cast <int>(x_i0.w)];
+    invmass1 = X_F(1.0) / _mass[static_cast <int>(x_i1.w)];
+    invmass2 = X_F(1.0) / _mass[static_cast <int>(x_i2.w)];
+  }
+
+  X_FLOAT a11 = X_F(2.0) * (invmass0 + invmass1) *
+                (s01.x * r01.x + s01.y * r01.y + s01.z * r01.z);
+  X_FLOAT a12 = X_F(2.0) * invmass0 *
+                (s01.x * r02.x + s01.y * r02.y + s01.z * r02.z);
+  X_FLOAT a13 = - X_F(2.0) * invmass1 *
+                (s01.x * r12.x + s01.y * r12.y + s01.z * r12.z);
+  X_FLOAT a21 = X_F(2.0) * invmass0 *
+                (s02.x * r01.x + s02.y * r01.y + s02.z * r01.z);
+  X_FLOAT a22 = X_F(2.0) * (invmass0 + invmass2) *
+                (s02.x * r02.x + s02.y * r02.y + s02.z * r02.z);
+  X_FLOAT a23 = X_F(2.0) * invmass2 *
+                (s02.x * r12.x + s02.y * r12.y + s02.z * r12.z);
+  X_FLOAT a31 = - X_F(2.0) * invmass1 *
+                (s12.x * r01.x + s12.y * r01.y + s12.z * r01.z);
+  X_FLOAT a32 = X_F(2.0) * invmass2 *
+                (s12.x * r02.x + s12.y * r02.y + s12.z * r02.z);
+  X_FLOAT a33 = X_F(2.0) * (invmass1 + invmass2) *
+                (s12.x * r12.x + s12.y * r12.y + s12.z * r12.z);
+
+  // inverse of matrix
+
+  X_FLOAT determ = a11 * a22 * a33 + a12 * a23 * a31 + a13 * a21 * a32 -
+                   a11 * a23 * a32 - a12 * a21 * a33 - a13 * a22 * a31;
+
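+  // a zero determinant means the constraint matrix is singular; flag it
+  // (presumably checked on the host), since the division below would
+  // otherwise silently produce inf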
+  if(determ == X_F(0.0)) _flag[0]++;
+
+  X_FLOAT determinv = X_F(1.0) / determ;
+
+  X_FLOAT a11inv = determinv * (a22 * a33 - a23 * a32);
+  X_FLOAT a12inv = -determinv * (a12 * a33 - a13 * a32);
+  X_FLOAT a13inv = determinv * (a12 * a23 - a13 * a22);
+  X_FLOAT a21inv = -determinv * (a21 * a33 - a23 * a31);
+  X_FLOAT a22inv = determinv * (a11 * a33 - a13 * a31);
+  X_FLOAT a23inv = -determinv * (a11 * a23 - a13 * a21);
+  X_FLOAT a31inv = determinv * (a21 * a32 - a22 * a31);
+  X_FLOAT a32inv = -determinv * (a11 * a32 - a12 * a31);
+  X_FLOAT a33inv = determinv * (a11 * a22 - a12 * a21);
+
+  // quadratic correction coeffs
+
+  X_FLOAT r0102 = (r01.x * r02.x + r01.y * r02.y + r01.z * r02.z);
+  X_FLOAT r0112 = (r01.x * r12.x + r01.y * r12.y + r01.z * r12.z);
+  X_FLOAT r0212 = (r02.x * r12.x + r02.y * r12.y + r02.z * r12.z);
+
+  X_FLOAT quad1_0101 = (invmass0 + invmass1) * (invmass0 + invmass1) * r01sq;
+  X_FLOAT quad1_0202 = invmass0 * invmass0 * r02sq;
+  X_FLOAT quad1_1212 = invmass1 * invmass1 * r12sq;
+  X_FLOAT quad1_0102 = X_F(2.0) * (invmass0 + invmass1) * invmass0 * r0102;
+  X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0 + invmass1) * invmass1 * r0112;
+  X_FLOAT quad1_0212 = - X_F(2.0) * invmass0 * invmass1 * r0212;
+
+  X_FLOAT quad2_0101 = invmass0 * invmass0 * r01sq;
+  X_FLOAT quad2_0202 = (invmass0 + invmass2) * (invmass0 + invmass2) * r02sq;
+  X_FLOAT quad2_1212 = invmass2 * invmass2 * r12sq;
+  X_FLOAT quad2_0102 = X_F(2.0) * (invmass0 + invmass2) * invmass0 * r0102;
+  X_FLOAT quad2_0112 = X_F(2.0) * invmass0 * invmass2 * r0112;
+  X_FLOAT quad2_0212 = X_F(2.0) * (invmass0 + invmass2) * invmass2 * r0212;
+
+  X_FLOAT quad3_0101 = invmass1 * invmass1 * r01sq;
+  X_FLOAT quad3_0202 = invmass2 * invmass2 * r02sq;
+  X_FLOAT quad3_1212 = (invmass1 + invmass2) * (invmass1 + invmass2) * r12sq;
+  X_FLOAT quad3_0102 = - X_F(2.0) * invmass1 * invmass2 * r0102;
+  X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1 + invmass2) * invmass1 * r0112;
+  X_FLOAT quad3_0212 = X_F(2.0) * (invmass1 + invmass2) * invmass2 * r0212;
+
+  // iterate until converged
+
+  X_FLOAT lamda01 = X_F(0.0);
+  X_FLOAT lamda02 = X_F(0.0);
+  X_FLOAT lamda12 = X_F(0.0);
+  int niter = 0;
+  int done = 0;
+
+  X_FLOAT quad1, quad2, quad3, b1, b2, b3, lamda01_new, lamda02_new, lamda12_new;
+
+  // note: a warp keeps iterating until all of its lanes have converged (__any)
+  while(__any(!done) && niter < _max_iter) {
+    quad1 = quad1_0101 * lamda01 * lamda01 +
+            quad1_0202 * lamda02 * lamda02 +
+            quad1_1212 * lamda12 * lamda12 +
+            quad1_0102 * lamda01 * lamda02 +
+            quad1_0112 * lamda01 * lamda12 +
+            quad1_0212 * lamda02 * lamda12;
+
+    quad2 = quad2_0101 * lamda01 * lamda01 +
+            quad2_0202 * lamda02 * lamda02 +
+            quad2_1212 * lamda12 * lamda12 +
+            quad2_0102 * lamda01 * lamda02 +
+            quad2_0112 * lamda01 * lamda12 +
+            quad2_0212 * lamda02 * lamda12;
+
+    quad3 = quad3_0101 * lamda01 * lamda01 +
+            quad3_0202 * lamda02 * lamda02 +
+            quad3_1212 * lamda12 * lamda12 +
+            quad3_0102 * lamda01 * lamda02 +
+            quad3_0112 * lamda01 * lamda12 +
+            quad3_0212 * lamda02 * lamda12;
+
+    b1 = bond1 * bond1 - s01sq - quad1;
+    b2 = bond2 * bond2 - s02sq - quad2;
+    b3 = bond12 * bond12 - s12sq - quad3;
+
+    lamda01_new = a11inv * b1 + a12inv * b2 + a13inv * b3;
+    lamda02_new = a21inv * b1 + a22inv * b2 + a23inv * b3;
+    lamda12_new = a31inv * b1 + a32inv * b2 + a33inv * b3;
+
+    // count consecutive converged checks: done is reset to 0 whenever any
+    // multiplier still changes by more than the tolerance
+    done++;
+    done = (fabs(lamda01_new - lamda01) > _tolerance) ? 0 : done;
+    done = (fabs(lamda02_new - lamda02) > _tolerance) ? 0 : done;
+    done = (fabs(lamda12_new - lamda12) > _tolerance) ? 0 : done;
+
+    // keep applying updates until the solution has been stable twice in a row
+    lamda01 = done < 2 ? lamda01_new : lamda01;
+    lamda02 = done < 2 ? lamda02_new : lamda02;
+    lamda12 = done < 2 ? lamda12_new : lamda12;
+    niter++;
+  }
+
+  // update forces if atom is owned by this processor
+
+  lamda01 *= X_F(1.0) / _dtfsq;
+  lamda02 *= X_F(1.0) / _dtfsq;
+  lamda12 *= X_F(1.0) / _dtfsq;
+
+
+  // attention: are shake cluster <-> atom mappings unique?
+  nlist = 0;
+
+  if(i0 < _nlocal) {
+    _f[i0]             += lamda01 * r01.x + lamda02 * r02.x;
+    _f[i0 + _nmax]     += lamda01 * r01.y + lamda02 * r02.y;
+    _f[i0 + 2 * _nmax] += lamda01 * r01.z + lamda02 * r02.z;
+    list[nlist++] = i0;
+  }
+
+  if(i1 < _nlocal) {
+    _f[i1]             -= lamda01 * r01.x - lamda12 * r12.x;
+    _f[i1 + _nmax]     -= lamda01 * r01.y - lamda12 * r12.y;
+    _f[i1 + 2 * _nmax] -= lamda01 * r01.z - lamda12 * r12.z;
+    list[nlist++] = i1;
+  }
+
+  if(i2 < _nlocal) {
+    _f[i2]             -= lamda02 * r02.x + lamda12 * r12.x;
+    _f[i2 + _nmax]     -= lamda02 * r02.y + lamda12 * r12.y;
+    _f[i2 + 2 * _nmax] -= lamda02 * r02.z + lamda12 * r12.z;
+    list[nlist++] = i2;
+  }
+
+  if(vflag || vflag_atom) {
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    // the factor of 2.0 compensates for the reduction routine, which is shared
+    // with the force computation and applies a factor of 0.5
+    X_FLOAT factor = X_F(2.0) / X_F(3.0) * nlist;
+    v[0] = lamda01 * r01.x * r01.x + lamda02 * r02.x * r02.x + lamda12 * r12.x * r12.x;
+    *shared = factor * v[0];
+    shared += blockDim.x;
+    v[1] = lamda01 * r01.y * r01.y + lamda02 * r02.y * r02.y + lamda12 * r12.y * r12.y;
+    *shared = factor * v[1];
+    shared += blockDim.x;
+    v[2] = lamda01 * r01.z * r01.z + lamda02 * r02.z * r02.z + lamda12 * r12.z * r12.z;
+    *shared = factor * v[2];
+    shared += blockDim.x;
+    v[3] = lamda01 * r01.x * r01.y + lamda02 * r02.x * r02.y + lamda12 * r12.x * r12.y;
+    *shared = factor * v[3];
+    shared += blockDim.x;
+    v[4] = lamda01 * r01.x * r01.z + lamda02 * r02.x * r02.z + lamda12 * r12.x * r12.z;
+    *shared = factor * v[4];
+    shared += blockDim.x;
+    v[5] = lamda01 * r01.y * r01.z + lamda02 * r02.y * r02.z + lamda12 * r12.y * r12.z;
+    *shared = factor * v[5];
+    shared += blockDim.x;
+
+    v_tally(vflag, vflag_atom, nlist, list, 3.0, v);
+  }
+}
+
+__global__ void FixShakeCuda_Shake_Kernel(int vflag, int vflag_atom, int* list, int nlist)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < nlist) {
+
+    int m = list[i];
+    int sflag = _shake_flag[m];
+
+    // sflag is the number of atoms in the cluster (2, 3 or 4); the remaining
+    // value (sflag == 1) denotes a 3-atom cluster with an angle constraint
+    if(sflag == 2) FixShakeCuda_Shake2(vflag, vflag_atom, m);
+    else if(sflag == 3) FixShakeCuda_Shake3(vflag, vflag_atom, m);
+    else if(sflag == 4) FixShakeCuda_Shake4(vflag, vflag_atom, m);
+    else FixShakeCuda_Shake3Angle(vflag, vflag_atom, m);
+  } else {
+    // threads without a cluster zero their shared-memory slots so that the
+    // virial reduction below stays correct
+    ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
+    *shared = ENERGY_F(0.0);
+    shared += blockDim.x;
+    *shared = ENERGY_F(0.0);
+    shared += blockDim.x;
+    *shared = ENERGY_F(0.0);
+    shared += blockDim.x;
+    *shared = ENERGY_F(0.0);
+    shared += blockDim.x;
+    *shared = ENERGY_F(0.0);
+    shared += blockDim.x;
+    *shared = ENERGY_F(0.0);
+  }
+
+  if(vflag) {
+    __syncthreads();
+    int eflag = 0;
+    PairVirialCompute_A_Kernel(eflag, vflag);
+  }
+
+}
+
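+// Host-side launch sketch (illustrative only; the actual wrapper presumably
+// lives in the corresponding fix_shake_cuda.cu). The kernel expects
+// 6 * blockDim.x ENERGY_FLOATs of shared memory for the virial reduction;
+// dev_list and nlist are hypothetical names for the device cluster list:
+//
+//   int3 layout = getgrid(nlist);
+//   dim3 threads(layout.z, 1, 1);
+//   dim3 grid(layout.x, layout.y, 1);
+//   FixShakeCuda_Shake_Kernel <<< grid, threads,
+//       6 * threads.x * sizeof(ENERGY_FLOAT)>>> (vflag, vflag_atom, dev_list, nlist);
+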
+__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1;
+
+    X_FLOAT3 xs = _xshake[j];
+    ((X_FLOAT*) _buffer)[i] = xs.x + dx;
+    ((X_FLOAT*) _buffer)[i + 1 * n] = xs.y + dy;
+    ((X_FLOAT*) _buffer)[i + 2 * n] = xs.z + dz;
+  }
+
+}
+
+__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_FLOAT dx, X_FLOAT dy, X_FLOAT dz, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+
+    if(j >= _nmax) _flag[0] = 1;
+
+    X_FLOAT3 xs = _xshake[j];
+    xs.x += dx;
+    xs.y += dy;
+    xs.z += dz;
+    _xshake[i + first] = xs;
+  }
+
+}
+
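+// The communication buffer stores the three coordinates of the n packed atoms
+// in struct-of-arrays order: x in [0,n), y in [n,2n), z in [2n,3n), matching
+// the layout written by FixShakeCuda_PackComm_Kernel above.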
+__global__ void FixShakeCuda_UnpackComm_Kernel(int n, int first)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    X_FLOAT3 xs;
+    xs.x = ((X_FLOAT*) _buffer)[i];
+    xs.y = ((X_FLOAT*) _buffer)[i + 1 * n];
+    xs.z = ((X_FLOAT*) _buffer)[i + 2 * n];
+    _xshake[i + first] = xs;
+  }
+}
+
diff --git a/lib/cuda/fix_temp_berendsen_cuda.cu b/lib/cuda/fix_temp_berendsen_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b99608dda505ce530261b8be625afd58558707ed
--- /dev/null
+++ b/lib/cuda/fix_temp_berendsen_cuda.cu
@@ -0,0 +1,66 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_berendsen_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_berendsen_cuda_cu.h"
+#include "fix_temp_berendsen_cuda_kernel.cu"
+
+
+void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
+{
+  V_FLOAT factor = afactor;
+
+  if(sdata->atom.update_nmax)
+    Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempBerendsenCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_berendsen_cuda_cu.h b/lib/cuda/fix_temp_berendsen_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..1cffbc730d29b074df7d212280818fc68da79920
--- /dev/null
+++ b/lib/cuda/fix_temp_berendsen_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);
diff --git a/lib/cuda/fix_temp_berendsen_cuda_kernel.cu b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2d3b04ace5acb4a554ea35afb2eeaf5c0e59a84f
--- /dev/null
+++ b/lib/cuda/fix_temp_berendsen_cuda_kernel.cu
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+
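+// Rescale the velocities of every atom in the group by a uniform factor; the
+// host-side fix computes that factor (the Berendsen lambda), presumably from
+// the current and target temperatures, before launching this kernel.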
+__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      _v[i] *= factor;
+      _v[i + _nmax] *= factor;
+      _v[i + 2 * _nmax] *= factor;
+    }
+}
+
diff --git a/lib/cuda/fix_temp_rescale_cuda.cu b/lib/cuda/fix_temp_rescale_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..171156519b96d4204ccd9121914cfa655a213358
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_cuda.cu
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_rescale_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_rescale_cuda_cu.h"
+#include "fix_temp_rescale_cuda_kernel.cu"
+
+
+void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
+{
+  V_FLOAT factor = afactor;
+  // fix temp/rescale is usually not invoked every timestep, so it could miss
+  // an update_nmax/update_nlocal notification; refresh the device symbols
+  // unconditionally
+  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempRescaleCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_rescale_cuda_cu.h b/lib/cuda/fix_temp_rescale_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ec9a3161fd64c2ea3350d3773654cd431d8e242
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);
diff --git a/lib/cuda/fix_temp_rescale_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2e34ec592f5f8582a0cb63d84154f76dcf7d0d21
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_cuda_kernel.cu
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+
+__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      _v[i] *= factor;
+      _v[i + _nmax] *= factor;
+      _v[i + 2 * _nmax] *= factor;
+    }
+}
+
diff --git a/lib/cuda/fix_temp_rescale_limit_cuda.cu b/lib/cuda/fix_temp_rescale_limit_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..72028a124ef00797812f9e1196db59ac1fbc21a9
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_limit_cuda.cu
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_temp_rescale_limit_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_temp_rescale_limit_cuda_cu.h"
+#include "fix_temp_rescale_limit_cuda_kernel.cu"
+
+
+void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+}
+
+void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
+{
+  V_FLOAT factor = afactor;
+  // fix temp/rescale is usually not invoked every timestep, so it could miss
+  // an update_nmax/update_nlocal notification; refresh the device symbols
+  // unconditionally
+  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
+
+  int3 layout = getgrid(sdata->atom.nlocal);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor, limit);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed");
+}
diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_cu.h b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..44efa566beec8a17a1097e9196a0fd05711b8b1a
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_limit_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit);
diff --git a/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..eda86ccdce850bebe622255efb39b28a6129032b
--- /dev/null
+++ b/lib/cuda/fix_temp_rescale_limit_cuda_kernel.cu
@@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+
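+// Rescale the group's velocities by factor, then clamp each velocity
+// component to the range [-limit, limit].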
+__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_FLOAT factor, V_FLOAT limit)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      V_FLOAT vx = _v[i];
+      V_FLOAT vy = _v[i + _nmax];
+      V_FLOAT vz = _v[i + 2 * _nmax];
+      vx *= factor;
+      vy *= factor;
+      vz *= factor;
+
+      _v[i] = vx > 0 ? min(vx, limit) : max(vx, -limit);
+      _v[i + _nmax] = vy > 0 ? min(vy, limit) : max(vy, -limit);
+      _v[i + 2 * _nmax] = vz > 0 ? min(vz, limit) : max(vz, -limit);
+    }
+}
+
diff --git a/lib/cuda/fix_viscous_cuda.cu b/lib/cuda/fix_viscous_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..03a019bc9fb923fd8d095e081766c8921721731f
--- /dev/null
+++ b/lib/cuda/fix_viscous_cuda.cu
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX fix_viscous_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+
+#include "fix_viscous_cuda_cu.h"
+#include "fix_viscous_cuda_kernel.cu"
+
+void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax          , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
+}
+
+void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_FixViscousCuda_UpdateNmax(sdata);
+
+}
+
+
+void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_FixViscousCuda_UpdateNmax(sdata);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+
+  int3 layout = getgrid(sdata->atom.nlocal, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  Cuda_FixViscousCuda_PostForce_Kernel <<< grid, threads, 0>>> (groupbit, (F_FLOAT*) gamma);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
+
+}
diff --git a/lib/cuda/fix_viscous_cuda_cu.h b/lib/cuda/fix_viscous_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3727bc35651af9124cc6acf274d5d2561eb40c67
--- /dev/null
+++ b/lib/cuda/fix_viscous_cuda_cu.h
@@ -0,0 +1,27 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
+extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma);
diff --git a/lib/cuda/fix_viscous_cuda_kernel.cu b/lib/cuda/fix_viscous_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c3397715f324618234648325ee60339b86dd674
--- /dev/null
+++ b/lib/cuda/fix_viscous_cuda_kernel.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
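+// Apply a per-type viscous drag force, f -= gamma[type] * v, to every atom in
+// the group.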
+__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_FLOAT* gamma)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < _nlocal)
+    if(_mask[i] & groupbit) {
+      F_FLOAT drag = gamma[_type[i]];
+      _f[i] -= drag * _v[i];
+      _f[i + 1 * _nmax] -= drag * _v[i + 1 * _nmax];
+      _f[i + 2 * _nmax] -= drag * _v[i + 2 * _nmax];
+    }
+}
diff --git a/lib/cuda/neighbor.cu b/lib/cuda/neighbor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ddcf6ddc091cfd64bd0da941321143d00e3054a7
--- /dev/null
+++ b/lib/cuda/neighbor.cu
@@ -0,0 +1,364 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <time.h>
+#define MY_PREFIX neighbor
+#define IncludeCommonNeigh
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "crm_cuda_utils.cu"
+#include "cuda_wrapper_cu.h"
+
+#define _cutneighsq     MY_AP(cutneighsq)
+#define _ex_type     	MY_AP(ex_type)
+#define _nex_type     	MY_AP(nex_type)
+#define _ex1_bit     	MY_AP(ex1_bit)
+#define _ex2_bit     	MY_AP(ex2_bit)
+#define _nex_group     	MY_AP(nex_group)
+#define _ex_mol_bit     MY_AP(ex_mol_bit)
+#define _nex_mol     	MY_AP(nex_mol)
+__device__ __constant__ CUDA_FLOAT* _cutneighsq;
+__device__ __constant__ int* _ex_type;
+__device__ __constant__ int _nex_type;
+__device__ __constant__ int* _ex1_bit;
+__device__ __constant__ int* _ex2_bit;
+__device__ __constant__ int _nex_group;
+__device__ __constant__ int* _ex_mol_bit;
+__device__ __constant__ int _nex_mol;
+
+#include "neighbor_cu.h"
+#include "neighbor_kernel.cu"
+
+void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
+
+  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_FLOAT)));
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+
+    sdata->buffer = CudaWrapper_AllocCudaData(size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed");
+}
+
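+// Cuda_BinAtoms sorts all atoms into a regular grid of bins. The shared
+// device buffer is sized to hold 20 status/error ints, one atom counter per
+// bin, and the cached coordinates of up to bin_nmax atoms per bin; see
+// Binning_Kernel in neighbor_kernel.cu for the exact layout.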
+int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  if(sdata->buffer_new)
+    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);
+
+  // initialize only on first call
+  CUDA_FLOAT rez_bin_size[3] = {
+    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
+    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
+    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
+  };
+
+  /*static*/ short init = 0;
+
+  if(! init) {
+    init = 1;
+    cudaMemcpyToSymbol(MY_AP(x)              , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
+    cudaMemcpyToSymbol(MY_AP(nall)         , & sdata->atom.nall                    , sizeof(unsigned));
+    cudaMemcpyToSymbol(MY_AP(nmax)           , & sdata->atom.nmax                    , sizeof(unsigned));
+    cudaMemcpyToSymbol(MY_AP(sublo)          ,   sdata->domain.sublo                 , sizeof(X_FLOAT) * 3);
+  }
+
+
+  int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  timespec starttime, endtime;
+  clock_gettime(CLOCK_REALTIME, &starttime);
+
+  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_FLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
+
+  Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
+  cudaThreadSynchronize();
+
+  clock_gettime(CLOCK_REALTIME, &endtime);
+  sdata->cuda_timings.neigh_bin +=
+    endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
+
+
+  int binning_error;
+  cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost);
+
+  if(binning_error) {
+    sneighlist->bin_extraspace += 0.05;
+  } else {
+    MYDBG(printf("CUDA: binning successful\n");)
+  }
+  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
+  return binning_error;
+}
+
+int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
+  CUDA_FLOAT globcutoff = -1.0;
+
+  short init = 0;
+
+  if(! init) {
+    init = 1;
+
+    // !! LAMMPS indexes atom types starting with 1 !!
+
+    unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+
+    unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
+
+    CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
+    //printf("Allocate: %i\n",nx);
+    sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
+
+    if(sneighlist->cutneighsq) {
+      int cutoffsdiffer = 0;
+      double cutoff0 = sneighlist->cutneighsq[1][1];
+
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+          acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
+
+          if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
+        }
+      }
+
+      if(!cutoffsdiffer) globcutoff = (CUDA_FLOAT) cutoff0;
+    } else {
+      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
+      return 0;
+    }
+
+    int size = 100;
+
+    if(sdata->buffersize < size) {
+      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+      sdata->buffer = CudaWrapper_AllocCudaData(size);
+      sdata->buffersize = size;
+      sdata->buffer_new++;
+      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+    }
+
+    CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
+    free(acutneighsq);   // host-side staging copy is no longer needed
+    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , &sneighlist->cu_cutneighsq       , sizeof(CUDA_FLOAT*));
+
+    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                    , sizeof(unsigned));
+    cudaMemcpyToSymbol(MY_AP(special_flag)     , sdata->atom.special_flag         , 4 * sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(molecular)        , & sdata->atom.molecular          , sizeof(int));
+  }
+
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
+  //cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data, sizeof(int*)     );
+  cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal             , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall            , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mask)             , & sdata->atom.mask      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)              , & sdata->atom.tag       .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(special)          , & sdata->atom.special   .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxspecial)       , & sdata->atom.maxspecial         , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nspecial)         , & sdata->atom.nspecial  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors	 , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(debugdata)        , & sdata->debugdata	 , sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(overlap_comm)     , & sdata->overlap_comm, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(neighbors) 		  , & sneighlist->neighbors.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ex_type) 		  , & sneighlist->ex_type.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ex1_bit) 		  , & sneighlist->ex1_bit.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ex2_bit) 		  , & sneighlist->ex2_bit.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ex_mol_bit) 	  , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(nex_type)     	  , & sneighlist->nex_type, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nex_group)     	  , & sneighlist->nex_group, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nex_mol)     	  , & sneighlist->nex_mol, sizeof(int));
+
+  if(sdata->overlap_comm) {
+    cudaMemcpyToSymbol(MY_AP(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(neighbors_inner)  , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(ilist_border)     , & sneighlist->ilist_border    .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(inum_border)      , & sneighlist->inum_border     .dev_data, sizeof(int*));
+  }
+
+  //dim3 threads(sneighlist->bin_nmax,1,1);
+  dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
+  dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
+
+  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
+  int buffer[20];
+  buffer[0] = 1;
+  buffer[1] = 0;
+  CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
+  //cudaMemset(sdata->debugdata,0,100*sizeof(int));
+  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_FLOAT)) * threads.x;
+  MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
+  //shared_size=2056;
+  timespec starttime, endtime;
+  clock_gettime(CLOCK_REALTIME, &starttime);
+  //for(int i=0;i<100;i++)
+  {
+    if(sdata->overlap_comm)
+      NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>>
+      (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom);
+    else {
+      int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type;
+
+      if(exclude)
+        NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>>
+        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
+      else
+        NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
+        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
+    }
+    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
+    //	(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
+
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
+    clock_gettime(CLOCK_REALTIME, &endtime);
+    sdata->cuda_timings.neigh_build +=
+      endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
+    //dim3 threads,grid;
+    CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));
+
+    if(buffer[0] >= 0 && sdata->atom.molecular) {
+      //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
+      clock_gettime(CLOCK_REALTIME, &starttime);
+      int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
+      threads.x = layout.z;
+      threads.y = 1;
+      threads.z = 1;
+      grid.x = layout.x;
+      grid.y = layout.y;
+      grid.z = 1;
+      FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
+      cudaThreadSynchronize();
+      CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
+      clock_gettime(CLOCK_REALTIME, &endtime);
+      sdata->cuda_timings.neigh_special +=
+        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
+    }
+  }
+  //printf("Neightime: %lf\n",sdata->cuda_timings.test1);
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
+
+  //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
+
+  MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
+  return buffer[0];
+}
+
+int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
+  // initialize only on first call
+  /*static*/ short init = 0;
+
+  if(! init) {
+    init = 1;
+
+    // !! LAMMPS indexes atom types starting with 1 !!
+
+    unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+
+    if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
+      printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
+             "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
+             "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
+
+    unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
+    CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
+
+    if(sneighlist->cutneighsq) {
+      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
+        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
+          acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT)(sneighlist->cutneighsq[i][j]);
+          //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
+        }
+      }
+    } else {
+      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
+      return 0;
+    }
+
+    int size = 100;
+
+    if(sdata->buffersize < size) {
+      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
+      sdata->buffer = CudaWrapper_AllocCudaData(size);
+      sdata->buffersize = size;
+      sdata->buffer_new++;
+      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+    }
+
+    cudaMemcpyToSymbol(MY_AP(buffer)           , & sdata->buffer                  , sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                    , sizeof(unsigned));
+    cudaMemcpyToSymbol(MY_AP(cutneighsq)       , acutneighsq                    , nx);
+    cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
+    cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum               , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal             , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall               , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nmax)             , & sdata->atom.nmax               , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data, sizeof(int*));
+    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
+    cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors	 , sizeof(int));
+
+    free(acutneighsq);
+  }
+
+  int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  int return_value = 1;
+  CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
+
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
+  NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
+
+  int buffer[20];
+  CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
+  MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
+  return buffer[0];
+}
diff --git a/lib/cuda/neighbor_cu.h b/lib/cuda/neighbor_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3028e5400f35fe972958701596c318eee64e84c
--- /dev/null
+++ b/lib/cuda/neighbor_cu.h
@@ -0,0 +1,32 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef NEIGHBOR_CU_H_
+#define NEIGHBOR_CU_H_
+#include "cuda_shared.h"
+
+extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
+
+#endif /*NEIGHBOR_CU_H_*/
diff --git a/lib/cuda/neighbor_kernel.cu b/lib/cuda/neighbor_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3892f5ec29e09f1a95e73508de828025ddb6a200
--- /dev/null
+++ b/lib/cuda/neighbor_kernel.cu
@@ -0,0 +1,660 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
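+// SBBITS: bit position at which the special-bond flag (1-2/1-3/1-4) is
+// encoded into the upper bits of a neighbor index, following the usual
+// LAMMPS convention j ^= (flag << SBBITS).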
+#define SBBITS 30
+
+__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
+                               CUDA_FLOAT rez_bin_size_x, CUDA_FLOAT rez_bin_size_y, CUDA_FLOAT rez_bin_size_z)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  /*int* bin_count=(int*) _buffer;
+  bin_count=bin_count+20;
+  CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
+  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
+  binned_x = &binned_x[2];
+  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
+
+  if(i < _nall) {
+    // copy the atom position from global device memory to local registers
+    // in three steps, to get as much coalesced access as possible
+    X_FLOAT* my_x = _x + i;
+    CUDA_FLOAT x_i = *my_x;
+    my_x += _nmax;
+    CUDA_FLOAT y_i = *my_x;
+    my_x += _nmax;
+    CUDA_FLOAT z_i = *my_x;
+
+
+    // calculate flat bin index
+    int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
+    int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
+    int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;
+
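+    // branch-free clamp of the bin indices to [0, bin_dim-1]; negativCUDA
+    // presumably returns 1.0f for a negative argument and 0.0f otherwise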
+    bx -= bx * negativCUDA(1.0f * bx);
+    bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
+    by -= by * negativCUDA(1.0f * by);
+    by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
+    bz -= bz * negativCUDA(1.0f * bz);
+    bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);
+
+
+    const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;
+
+    // add new atom to bin, get bin-array position
+    const unsigned k = atomicAdd(& bin_count[j], 1);
+
+    if(k < bin_nmax) {
+      binned_id [bin_nmax * j + k] = i;
+      binned_x [3 * bin_nmax * j + k] = x_i;
+      binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
+      binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
+    } else {
+      // normally this should not happen: count the overflowing atoms in _buffer[0]
+      atomicAdd((int*) _buffer, 1);
+      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
+    }
+  }
+}
+
+
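+// Returns 1 if the pair (i,j) is excluded from the neighbor list by a type-,
+// group-, or molecule-based exclusion rule, and 0 otherwise.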
+__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
+{
+  int m;
+
+  if(_nex_type)
+    if(_ex_type[itype * _cuda_ntypes + jtype]) return 1;
+
+  if(_nex_group) {
+    for(m = 0; m < _nex_group; m++) {
+      if(_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
+
+      if(_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
+    }
+  }
+
+  if(_nex_mol) {
+    if(_molecule[i] == _molecule[j])
+      for(m = 0; m < _nex_mol; m++)
+        if(_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m]) return 1;
+  }
+
+  return 0;
+}
+
+extern __shared__ CUDA_FLOAT shared[];
+
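+// Search the special-bond list of an atom for tag; n.x, n.y, n.z delimit the
+// ends of the 1-2, 1-3, and 1-4 sections of the list, and the matching
+// component of flag is returned (0 if tag is not in the list).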
+__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
+{
+  int k = n.z;
+
+  for(int l = 0; l < n.z; l++) k = ((list[l] == tag) ? l : k);
+
+  return k < n.x ? flag.x : (k < n.y ? flag.y : (k < n.z ? flag.z : 0));
+}
+
+template <const unsigned int exclude>
+__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style, bool neighall)
+{
+  int natoms = neighall ? _nall : _nlocal;
+  //const bool domol=false;
+  int bin_dim_z = gridDim.y;
+  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
+  binned_x = &binned_x[2];
+  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
+  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
+  int bin_x = blockIdx.x / bin_dim_y;
+  int bin_y = blockIdx.x - bin_x * bin_dim_y;
+  int bin_z = blockIdx.y;
+  int bin_c = bin_count[bin];
+
+
+  CUDA_FLOAT cut;
+
+  if(globcutoff > 0)
+    cut = globcutoff;
+
+  int i = _nall;
+  CUDA_FLOAT* my_x;
+  CUDA_FLOAT x_i, y_i, z_i;
+
+  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
+
+    int actIdx = threadIdx.x + actOffset;
+    CUDA_FLOAT* other_x = shared;
+    int* other_id = (int*) &other_x[3 * blockDim.x];
+
+    if(actIdx < bin_c) {
+      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
+      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
+      x_i = *my_x;
+      my_x += bin_nmax;
+      y_i = *my_x;
+      my_x += bin_nmax;
+      z_i = *my_x;
+    } else
+      i = 2 * _nall;
+
+    __syncthreads();
+
+    int jnum = 0;
+    int itype;
+
+    if(i < natoms) {
+      jnum = 0;
+      _ilist[i] = i;
+      itype = _type[i];
+    }
+
+    //__syncthreads();
+
+
+    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
+      int otherActIdx = threadIdx.x + otherActOffset;
+
+      if(otherActIdx < bin_c) {
+        if(otherActOffset == actOffset) {
+          other_id[threadIdx.x] = i;
+          other_x[threadIdx.x] = x_i;
+          other_x[threadIdx.x + blockDim.x] = y_i;
+          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
+        } else {
+          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
+          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
+          other_x[threadIdx.x] = *my_x;
+          my_x += bin_nmax;
+          other_x[threadIdx.x + blockDim.x] = *my_x;
+          my_x += bin_nmax;
+          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
+
+        }
+      }
+
+      __syncthreads();
+      int kk = threadIdx.x;
+
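+      // each thread starts its sweep at its own offset kk, fanning the threads
+      // out over the staged atoms instead of having all read the same entry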
+      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
+        if(i < natoms) {
+          kk++;
+          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
+          int j = other_id[kk];
+
+          if(exclude && exclusion(i, j, itype, _type[j])) continue;
+
+          if(globcutoff < 0) {
+            int jtype = _type[j];
+            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
+          }
+
+          CUDA_FLOAT delx = x_i - other_x[kk];
+          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
+          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
+          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+
+          if(rsq <= cut && i != j) {
+            if(jnum < _maxneighbors) {
+              if(block_style)
+                _neighbors[i * _maxneighbors + jnum] = j;
+              else
+                _neighbors[i + jnum * natoms] = j;
+            }
+
+            ++jnum;
+          }
+        }
+      }
+
+      __syncthreads();
+
+    }
+
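+    // scan the (up to 26) bins surrounding the home bin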
+    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
+      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
+        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
+          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
+
+          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
+
+          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
+
+          if(other_bin == bin) continue;
+
+          int obin_c = bin_count[other_bin];
+
+          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
+            int otherActIdx = otherActOffset + threadIdx.x;
+
+            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
+              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
+              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
+              other_x[threadIdx.x] = *my_x;
+              my_x += bin_nmax;
+              other_x[threadIdx.x + blockDim.x] = *my_x;
+              my_x += bin_nmax;
+              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
+            }
+
+            __syncthreads();
+
+            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
+              if(i < natoms) {
+                int j = other_id[k];
+
+                if(exclude && exclusion(i, j, itype, _type[j])) continue;
+
+                if(globcutoff < 0) {
+                  int jtype = _type[j];
+                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
+                }
+
+                CUDA_FLOAT delx = x_i - other_x[k];
+                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
+                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
+                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                if(rsq <= cut && i != j) {
+                  if(jnum < _maxneighbors) {
+                    if(block_style)
+                      _neighbors[i * _maxneighbors + jnum] = j;
+                    else
+                      _neighbors[i + jnum * natoms] = j;
+                  }
+
+                  ++jnum;
+                }
+              }
+            }
+
+            __syncthreads();
+
+          }
+        }
+
+    // report a neighbor-list overflow to the host via the buffer's first word
+    if(jnum > _maxneighbors) ((int*) _buffer)[0] = -jnum;
+
+    if(i < natoms)
+      _numneigh[i] = jnum;
+  }
+}
+
+
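+// post-process the neighbor lists according to the special_bonds settings:
+// special (bonded) neighbors are either removed or tagged in their high bits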
+__global__ void FindSpecial(int block_style)
+{
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int which;
+  int tag_mask = 0;
+  int3 spec_flag;
+
+  int3 mynspecial = {0, 0, 1};
+
+  if(ii >= _nlocal) return;
+
+  int special_id[CUDA_MAX_NSPECIAL];
+
+  int i = _ilist[ii];
+
+  if(i >= _nlocal) return;
+
+  int jnum = _numneigh[i];
+
+  if(_special_flag[1] == 0) spec_flag.x = -1;
+  else if(_special_flag[1] == 1) spec_flag.x = 0;
+  else spec_flag.x = 1;
+
+  if(_special_flag[2] == 0) spec_flag.y = -1;
+  else if(_special_flag[2] == 1) spec_flag.y = 0;
+  else spec_flag.y = 2;
+
+  if(_special_flag[3] == 0) spec_flag.z = -1;
+  else if(_special_flag[3] == 1) spec_flag.z = 0;
+  else spec_flag.z = 3;
+
+  mynspecial.x = _nspecial[i];
+  mynspecial.y = _nspecial[i + _nmax];
+  mynspecial.z = _nspecial[i + 2 * _nmax];
+
+  if(i < _nlocal) {
+    int* list = &_special[i];
+
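+    // OR all special-partner tags into a coarse filter mask: a neighbor tag can
+    // only match a special partner if (tag_mask & tag_j) == tag_j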
+    for(int k = 0; k < mynspecial.z; k++) {
+      special_id[k] = list[k * _nmax];
+      tag_mask = tag_mask | special_id[k];
+    }
+  }
+
+
+  for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
+    int j;
+
+    if(block_style)
+      j = _neighbors[i * _maxneighbors + k];
+    else
+      j = _neighbors[i + k * _nlocal];
+
+    int tag_j = _tag[j];
+    which = 0;
+
+    if((tag_mask & tag_j) == tag_j) {
+      which = find_special(mynspecial, special_id, tag_j, spec_flag);
+
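+      // which > 0: keep the neighbor, encoding the special type in its high bits;
+      // which < 0: drop the neighbor by swapping in the last list entry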
+      if(which > 0) {
+        if(block_style)
+          _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
+        else
+          _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
+      } else if(which < 0) {
+        if(block_style)
+          _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
+        else
+          _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];
+
+        jnum--;
+        k--;
+      }
+    }
+  }
+
+  _numneigh[i] = jnum;
+}
+
+__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_FLOAT globcutoff, int block_style)
+{
+  int bin_dim_z = gridDim.y;
+  CUDA_FLOAT* binned_x = (CUDA_FLOAT*) _buffer;
+  binned_x = &binned_x[2];
+  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
+  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
+  int bin_x = blockIdx.x / bin_dim_y;
+  int bin_y = blockIdx.x - bin_x * bin_dim_y;
+  int bin_z = blockIdx.y;
+  int bin_c = bin_count[bin];
+
+
+  CUDA_FLOAT cut;
+
+  if(globcutoff > 0)
+    cut = globcutoff;
+
+  int i = _nall;
+  CUDA_FLOAT* my_x;
+  CUDA_FLOAT x_i, y_i, z_i;
+
+  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
+
+    int actIdx = threadIdx.x + actOffset;
+    CUDA_FLOAT* other_x = shared;
+    int* other_id = (int*) &other_x[3 * blockDim.x];
+
+    if(actIdx < bin_c) {
+      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
+      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
+      x_i = *my_x;
+      my_x += bin_nmax;
+      y_i = *my_x;
+      my_x += bin_nmax;
+      z_i = *my_x;
+    } else
+      i = 2 * _nall;
+
+    __syncthreads();
+
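+    // besides the full list, build an "inner" list (local neighbors only) and
+    // a "border" list (ghost neighbors) so force evaluation on inner pairs can
+    // overlap with communication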
+    int jnum = 0;
+    int jnum_border = 0;
+    int jnum_inner = 0;
+    int i_border = -1;
+    int itype;
+
+    if(i < _nlocal) {
+      jnum = 0;
+      _ilist[i] = i;
+      itype = _type[i];
+    }
+
+    __syncthreads();
+
+
+    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
+      int otherActIdx = threadIdx.x + otherActOffset;
+
+      if(otherActIdx < bin_c) {
+        if(otherActOffset == actOffset) {
+          other_id[threadIdx.x] = i;
+          other_x[threadIdx.x] = x_i;
+          other_x[threadIdx.x + blockDim.x] = y_i;
+          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
+        } else {
+          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
+          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
+          other_x[threadIdx.x] = *my_x;
+          my_x += bin_nmax;
+          other_x[threadIdx.x + blockDim.x] = *my_x;
+          my_x += bin_nmax;
+          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
+
+        }
+      }
+
+      __syncthreads();
+      int kk = threadIdx.x;
+
+      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
+        if(i < _nlocal) {
+          kk++;
+          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
+          int j = other_id[kk];
+
+          if(globcutoff < 0) {
+            int jtype = _type[j];
+            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
+          }
+
+          CUDA_FLOAT delx = x_i - other_x[kk];
+          CUDA_FLOAT dely = y_i - other_x[kk + blockDim.x];
+          CUDA_FLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
+          CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+
+          if(rsq <= cut && i != j) {
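+            // first ghost neighbor of atom i: reserve a slot in the compacted border list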
+            if((j >= _nlocal) && (i_border < 0))
+              i_border = atomicAdd(_inum_border, 1);
+
+            if(jnum < _maxneighbors) {
+              if(block_style) {
+                _neighbors[i * _maxneighbors + jnum] = j;
+
+                if(j >= _nlocal) {
+                  _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
+                } else {
+                  _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
+                }
+              } else {
+                _neighbors[i + jnum * _nlocal] = j;
+
+                if(j >= _nlocal) {
+                  _neighbors_border[i_border + jnum_border * _nlocal] = j;
+                } else {
+                  _neighbors_inner[i + jnum_inner * _nlocal] = j;
+                }
+              }
+            }
+
+            ++jnum;
+
+            if(j >= _nlocal)
+              jnum_border++;
+            else
+              jnum_inner++;
+          }
+        }
+      }
+
+      __syncthreads();
+    }
+
+    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
+      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
+        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
+          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;
+
+          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;
+
+          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;
+
+          if(other_bin == bin) continue;
+
+          int obin_c = bin_count[other_bin];
+
+          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
+            int otherActIdx = otherActOffset + threadIdx.x;
+
+            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
+              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
+              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
+              other_x[threadIdx.x] = *my_x;
+              my_x += bin_nmax;
+              other_x[threadIdx.x + blockDim.x] = *my_x;
+              my_x += bin_nmax;
+              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
+            }
+
+            __syncthreads();
+
+            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
+              if(i < _nlocal) {
+                int j = other_id[k];
+
+                if(globcutoff < 0) {
+                  int jtype = _type[j];
+                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
+                }
+
+                CUDA_FLOAT delx = x_i - other_x[k];
+                CUDA_FLOAT dely = y_i - other_x[k + blockDim.x];
+                CUDA_FLOAT delz = z_i - other_x[k + 2 * blockDim.x];
+                CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+                if(rsq <= cut && i != j) {
+                  if((j >= _nlocal) && (i_border < 0))
+                    i_border = atomicAdd(_inum_border, 1);
+
+                  if(jnum < _maxneighbors) {
+                    if(block_style) {
+                      _neighbors[i * _maxneighbors + jnum] = j;
+
+                      if(j >= _nlocal) {
+                        _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
+                      } else {
+                        _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
+                      }
+                    } else {
+                      _neighbors[i + jnum * _nlocal] = j;
+
+                      if(j >= _nlocal) {
+                        _neighbors_border[i_border + jnum_border * _nlocal] = j;
+                      } else {
+                        _neighbors_inner[i + jnum_inner * _nlocal] = j;
+                      }
+                    }
+                  }
+
+                  ++jnum;
+
+                  if(j >= _nlocal)
+                    jnum_border++;
+                  else
+                    jnum_inner++;
+                }
+              }
+            }
+
+            __syncthreads();
+          }
+        }
+
+    // report a neighbor-list overflow to the host via the buffer's first word
+    if(jnum > _maxneighbors) ((int*) _buffer)[0] = -jnum;
+
+    if(i < _nlocal) {
+      _numneigh[i] = jnum;
+      _numneigh_inner[i] = jnum_inner;
+
+      if(i_border >= 0) _numneigh_border[i_border] = jnum_border;
+
+      if(i_border >= 0) _ilist_border[i_border] = i;
+
+    }
+  }
+}
+
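+// brute-force O(N^2) neighbor-list build: each local atom tests all _nall atoms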
+__global__ void NeighborBuildFullNsq_Kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* buffer = (int*) _buffer;
+
+  if(i < _nlocal) {
+    X_FLOAT* my_x = _x + i;
+    CUDA_FLOAT x_i = *my_x;
+    my_x += _nmax;
+    CUDA_FLOAT y_i = *my_x;
+    my_x += _nmax;
+    CUDA_FLOAT z_i = *my_x;
+    int jnum = 0;
+    int* jlist = _firstneigh[i];
+    _ilist[i] = i;
+
+    int itype = _type[i];
+    __syncthreads();
+
+    for(int j = 0; j < _nall; ++j) {
+      my_x = _x + j;
+      CUDA_FLOAT x_j = *my_x;
+      my_x += _nmax;
+      CUDA_FLOAT y_j = *my_x;
+      my_x += _nmax;
+      CUDA_FLOAT z_j = *my_x;
+      CUDA_FLOAT delx = x_i - x_j;
+      CUDA_FLOAT dely = y_i - y_j;
+      CUDA_FLOAT delz = z_i - z_j;
+      CUDA_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+      int jtype = _type[j];
+
+      if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
+        if(jnum < _maxneighbors)
+          jlist[jnum] = j;
+
+        // leftover debugging aid: dump the neighbor list of atom 151 into the buffer
+        if(i == 151) ((int*) _buffer)[jnum + 2] = j;
+
+        ++jnum;
+      }
+
+      __syncthreads();
+    }
+
+    if(jnum > _maxneighbors) buffer[0] = 0;   // flag neighbor-list overflow
+
+    _numneigh[i] = jnum;
+
+    if(i == 151) ((int*) _buffer)[1] = jnum;   // debugging aid, see above
+  }
+}
+
diff --git a/lib/cuda/pair_born_coul_long_cuda.cu b/lib/cuda/pair_born_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e6d66733e73fa107980534ce92224147b7b66356
--- /dev/null
+++ b/lib/cuda/pair_born_coul_long_cuda.cu
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
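+// alias the generic per-style coefficient arrays to born/coul/long names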
+#define _rhoinv MY_AP(coeff1)
+#define _sigma MY_AP(coeff2)
+#define _a MY_AP(coeff3)
+#define _c MY_AP(coeff4)
+#define _d MY_AP(coeff5)
+
+#include "pair_born_coul_long_cuda_cu.h"
+#include "pair_born_coul_long_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true);
+}
+
+void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairBornCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
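+  // launch with one block per atom (BpA) or one thread per atom (TpA); the
+  // dynamic shared memory is sized for per-thread energy/virial accumulation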
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+
+#undef _rhoinv
+#undef _sigma
+#undef _a
+#undef _c
+#undef _d
+
diff --git a/lib/cuda/pair_born_coul_long_cuda_cu.h b/lib/cuda/pair_born_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..cdd4e6cafae7f06c9b3edd47b0a91dee9c0838cc
--- /dev/null
+++ b/lib/cuda/pair_born_coul_long_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bc79848b08dc794fb3d77908a63a309882deef2d
--- /dev/null
+++ b/lib/cuda/pair_born_coul_long_cuda_kernel_nc.cu
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
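+// non-Coulomb (Born-Mayer-Huggins) part of the pair interaction: returns the
+// scalar force prefactor (the caller multiplies by the distance vector) and,
+// if eflag is set, accumulates the pair energy into evdwl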
+__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r = _RSQRT_(r2inv);
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  const F_FLOAT rexp = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
+  const F_FLOAT forceborn = _a[ij_type] * _rhoinv[ij_type] * r * rexp -
+                            F_F(6.0) * _c[ij_type] * r6inv + F_F(8.0) * _d[ij_type] * r2inv * r6inv;
+
+  if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv
+                                    + _d[ij_type] * r2inv * r6inv - _offset[ij_type]);
+
+  return factor_lj * forceborn * r2inv;
+}
diff --git a/lib/cuda/pair_buck_coul_cut_cuda.cu b/lib/cuda/pair_buck_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ba61f5e036302939297d4d577259de67c688faea
--- /dev/null
+++ b/lib/cuda/pair_buck_coul_cut_cuda.cu
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _rhoinv MY_AP(coeff1)
+#define _buck1 MY_AP(coeff2)
+#define _buck2 MY_AP(coeff3)
+#define _a MY_AP(coeff4)
+#define _c MY_AP(coeff5)
+
+#include "pair_buck_coul_cut_cuda_cu.h"
+
+#include <time.h>
+void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true);
+}
+
+void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairBuckCoulCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _rhoinv
+#undef _buck1
+#undef _buck2
+#undef _a
+#undef _c
+
diff --git a/lib/cuda/pair_buck_coul_cut_cuda_cu.h b/lib/cuda/pair_buck_coul_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..25f916119bdadf065243851354f4090fbe627946
--- /dev/null
+++ b/lib/cuda/pair_buck_coul_cut_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_buck_coul_long_cuda.cu b/lib/cuda/pair_buck_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f4e7203f839dbbad3acb352a2f104d1a43c2f79c
--- /dev/null
+++ b/lib/cuda/pair_buck_coul_long_cuda.cu
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _rhoinv MY_AP(coeff1)
+#define _buck1 MY_AP(coeff2)
+#define _buck2 MY_AP(coeff3)
+#define _a MY_AP(coeff4)
+#define _c MY_AP(coeff5)
+
+#include "pair_buck_coul_long_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true);
+}
+
+void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairBuckCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+
+
+#undef _rhoinv
+#undef _buck1
+#undef _buck2
+#undef _a
+#undef _c
+
diff --git a/lib/cuda/pair_buck_coul_long_cuda_cu.h b/lib/cuda/pair_buck_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b13476644ae88d88d8cae476894cc5975fed5b70
--- /dev/null
+++ b/lib/cuda/pair_buck_coul_long_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_buck_cuda.cu b/lib/cuda/pair_buck_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b7ca740c00567fb1f856291490d324d22d0a038c
--- /dev/null
+++ b/lib/cuda/pair_buck_cuda.cu
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _rhoinv MY_AP(coeff1)
+#define _buck1 MY_AP(coeff2)
+#define _buck2 MY_AP(coeff3)
+#define _a MY_AP(coeff4)
+#define _c MY_AP(coeff5)
+
+#include "pair_buck_cuda_cu.h"
+#include "pair_buck_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5);
+}
+
+void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairBuckCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _rhoinv
+#undef _buck1
+#undef _buck2
+#undef _a
+#undef _c
+
diff --git a/lib/cuda/pair_buck_cuda_cu.h b/lib/cuda/pair_buck_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..152fad94f44b9043831cbd13d6b61b2b2838e665
--- /dev/null
+++ b/lib/cuda/pair_buck_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_buck_cuda_kernel_nc.cu b/lib/cuda/pair_buck_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8ce7d28654e288e62ce3f281cc63c523db1505ec
--- /dev/null
+++ b/lib/cuda/pair_buck_cuda_kernel_nc.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
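+// Buckingham pair term, analogous to the Born evaluator: A*exp(-r/rho) - C/r^6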
+__device__ inline F_FLOAT PairBuckCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  const F_FLOAT r = _RSQRT_(r2inv);
+  const F_FLOAT rexp = _EXP_(-r * _rhoinv[ij_type]);
+  const F_FLOAT forcebuck = _buck1[ij_type] * r * rexp - _buck2[ij_type] * r6inv;
+
+  if(eflag) evdwl += factor_lj * (_a[ij_type] * rexp - _c[ij_type] * r6inv -
+                                    _offset[ij_type]);
+
+  return (factor_lj * forcebuck) * r2inv;
+}
diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7f9f853437b4575390d807652dd7fcd85f79b216
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _cg_type MY_AP(coeff5)
+
+
+#include "pair_cg_cmm_coul_cut_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairCGCMMCoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairCGCMMCoulCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _cg_type
+
diff --git a/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..fff7db7a646a72d9676983f27326a4f907d3cd5f
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_cut_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairCGCMMCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..43bedca88321f22e54916cf720911740bf7ec6ad
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _cg_type MY_AP(coeff5)
+
+
+#include "pair_cg_cmm_coul_debye_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairCGCMMCoulDebyeCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairCGCMMCoulDebyeCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _cg_type
+
diff --git a/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b0b6af597f064ed9644f2920432540df9c1ecb7
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_debye_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairCGCMMCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda.cu b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ed1bbf0cfc5da4f7e50d767384e69c68b355f82c
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_long_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _cg_type MY_AP(coeff5)
+
+
+#include "pair_cg_cmm_coul_long_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairCGCMMCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairCGCMMCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _cg_type
+
diff --git a/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..86befd78b8705dcbe46fa3bd30252f23c5b613bd
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_coul_long_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairCGCMMCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_cg_cmm_cuda.cu b/lib/cuda/pair_cg_cmm_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7ec1ebff99911b98361c5bb32d21822a9870fba1
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_cuda.cu
@@ -0,0 +1,87 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _cg_type MY_AP(coeff5)
+
+enum {CG_NOT_SET = 0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES,
+      CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG
+     };
+
+#include "pair_cg_cmm_cuda_cu.h"
+#include "pair_cg_cmm_cuda_kernel_nc.cu"
+#include <time.h>
+
+
+
+
+void Cuda_PairCGCMMCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, false, false);
+
+}
+
+
+
+
+void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(!init) {
+    init = 1;
+    Cuda_PairCGCMMCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  int maxthreads = 128;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, maxthreads);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _cg_type
+
diff --git a/lib/cuda/pair_cg_cmm_cuda_cu.h b/lib/cuda/pair_cg_cmm_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..739c0ae28f2885015ef50a1048ab7c8e7a90d29c
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairCGCMMCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..85b41605bd00558e399434ec26541d541072a8ff
--- /dev/null
+++ b/lib/cuda/pair_cg_cmm_cuda_kernel_nc.cu
@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
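+// one evaluator covers the CG-CMM LJ9-6, LJ12-4 and LJ12-6 variants; the
+// rNinv_first/rNinv_second factors select the proper powers of 1/r per pair type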
+__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const int cg_type = _cg_type[ij_type];
+  const F_FLOAT r4inv = r2inv * r2inv;
+  const F_FLOAT rNinv_first = cg_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
+  const F_FLOAT rNinv_second = cg_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
+  const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
+
+  if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]);
+
+  return factor_lj * forcelj * r2inv;
+}
+
+/*__device__ inline F_FLOAT PairCGCMMCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
+{
+	const int cg_type = tex1Dfetch(_coeff5_gm_tex,ij_type);
+	const F_FLOAT r2inv = F_F(1.0)/rsq;
+	const F_FLOAT r4inv = r2inv*r2inv;
+	const F_FLOAT rNinv_first = cg_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
+	const F_FLOAT rNinv_second = cg_type!=CG_LJ12_4?r2inv:F_F(1.0);
+	const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
+
+    if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second));
+	return factor_lj*forcelj*r2inv;
+}*/
diff --git a/lib/cuda/pair_eam_cuda.cu b/lib/cuda/pair_eam_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..cb20343770514e82824d1bb242015f7e568d85ff
--- /dev/null
+++ b/lib/cuda/pair_eam_cuda.cu
@@ -0,0 +1,351 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _type2frho MY_AP(coeff1)
+#define _type2rhor MY_AP(coeff2)
+#define _type2z2r MY_AP(coeff3)
+#define _rdr MY_AP(rdr)
+#define _rdrho MY_AP(rdrho)
+#define _nr MY_AP(nr)
+#define _nrho MY_AP(nrho)
+#define _nfrho MY_AP(nfrho)
+#define _nrhor MY_AP(nrhor)
+#define _nz2r MY_AP(nz2r)
+#define _frho_spline MY_AP(frho_spline)
+#define _rhor_spline MY_AP(rhor_spline)
+#define _z2r_spline MY_AP(z2r_spline)
+#define _rho MY_AP(rho)
+#define _fp MY_AP(fp)
+
+__device__ __constant__ F_FLOAT MY_AP(rdr);
+__device__ __constant__ F_FLOAT MY_AP(rdrho);
+__device__ __constant__ int MY_AP(nr);
+__device__ __constant__ int MY_AP(nrho);
+__device__ __constant__ int MY_AP(nfrho);
+__device__ __constant__ int MY_AP(nrhor);
+__device__ __constant__ int MY_AP(nz2r);
+__device__ __constant__ F_FLOAT* MY_AP(frho_spline);
+__device__ __constant__ F_FLOAT* MY_AP(rhor_spline);
+__device__ __constant__ F_FLOAT* MY_AP(z2r_spline);
+__device__ __constant__ F_FLOAT* MY_AP(rho);
+__device__ __constant__ F_FLOAT* MY_AP(fp);
+
+#define _rhor_spline_tex         MY_AP(rhor_spline_tex)
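+// in double precision the spline tables are bound as int4 textures, since CUDA
+// textures cannot fetch 64-bit floats directly; the fetched halves are then
+// presumably recombined into doubles inside the kernel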
+#if F_PRECISION == 1
+texture<float4, 1> _rhor_spline_tex;
+#else
+texture<int4, 1> _rhor_spline_tex;
+#endif
+
+
+#define _z2r_spline_tex         MY_AP(z2r_spline_tex)
+#if F_PRECISION == 1
+texture<float4, 1> _z2r_spline_tex;
+#else
+texture<int4, 1> _z2r_spline_tex;
+#endif
+
+
+
+#include "pair_eam_cuda_cu.h"
+#include "pair_eam_cuda_kernel_nc.cu"
+#include <time.h>
+
+int eam_buff_offset;
+int rhor_spline_size;
+void* rhor_spline_pointer;
+int z2r_spline_size;
+void* z2r_spline_pointer;
+
+
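+// bind the rhor and z2r spline tables to textures so spline lookups go
+// through the texture cache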
+inline void BindEAMTextures(cuda_shared_data* sdata)
+{
+  _rhor_spline_tex.normalized = false;                      // use unnormalized texture coordinates
+  _rhor_spline_tex.filterMode = cudaFilterModePoint;        // point mode: return the texel as-is, no filtering
+  _rhor_spline_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+
+  const textureReference* rhor_spline_texture_ptr = &MY_AP(rhor_spline_tex);
+
+#if F_PRECISION == 1
+  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#else
+  cudaChannelFormatDesc channelDescRhor = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, rhor_spline_texture_ptr, rhor_spline_pointer, &channelDescRhor, rhor_spline_size);
+#endif
+
+  _z2r_spline_tex.normalized = false;                      // use unnormalized texture coordinates
+  _z2r_spline_tex.filterMode = cudaFilterModePoint;        // point mode: return the texel as-is, no filtering
+  _z2r_spline_tex.addressMode[0] = cudaAddressModeWrap;    // wrap texture coordinates
+
+  const textureReference* z2r_spline_texture_ptr = &MY_AP(z2r_spline_tex);
+
+#if F_PRECISION == 1
+  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<float4>();
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#else
+  cudaChannelFormatDesc channelDescZ2r = cudaCreateChannelDesc<int4>();
+  cudaBindTexture(0, z2r_spline_texture_ptr, z2r_spline_pointer, &channelDescZ2r, z2r_spline_size);
+#endif
+
+}
+
+void Cuda_PairEAMCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateBuffer failed");
+  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(F_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(F_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_PairEAMCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) cudaFree(sdata->buffer);
+
+    cudaMalloc((void**)&sdata->buffer, size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateBuffer failed");
+}
+
+void Cuda_PairEAMCuda_UpdateNeighbor(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(firstneigh)   , & sneighlist->firstneigh.dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(ilist)        , & sneighlist->ilist     .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum)         , & sneighlist->inum               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal)       , & sdata->atom.nlocal             , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)         , & sdata->atom.nmax               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh)     , & sneighlist->numneigh  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors)    , & sneighlist->neighbors .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors       , sizeof(int));
+}
+
+void Cuda_PairEAMCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(x)         , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type)    , & sdata->atom.x_type    .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(f)         , & sdata->atom.f         .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(type)      , & sdata->atom.type      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)       , & sdata->atom.tag       .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(eatom)     , & sdata->atom.eatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom)     , & sdata->atom.vatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: updateNmax failed");
+}
+
+
+void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
+                           void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
+                           int* type2frho, int** type2z2r, int** type2rhor)
+{
+  // !! LAMMPS indexes atom types starting with 1 !!
+
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+
+  if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
+    printf("# CUDA: Cuda_PairEAMCuda_Init: you need %u types. this is more than %u "
+           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=99 "
+           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
+
+  unsigned nI = sizeof(F_FLOAT) * cuda_ntypes * cuda_ntypes;
+
+  X_FLOAT cutsq_global;
+  cutsq_global = (X_FLOAT)(sdata->pair.cut_global);
+  cudaMemcpyToSymbol(MY_AP(cutsq_global), &cutsq_global, sizeof(X_FLOAT));
+
+
+  F_FLOAT* coeff_buf = new F_FLOAT[cuda_ntypes * cuda_ntypes];
+
+  for(int i = 0; i < cuda_ntypes; i++) coeff_buf[i] = type2frho[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff1)        , coeff_buf             , cuda_ntypes * sizeof(F_FLOAT));
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2rhor[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff2)        , coeff_buf             , nI);
+
+  for(int i = 0; i < cuda_ntypes * cuda_ntypes; i++) coeff_buf[i] = (&type2z2r[0][0])[i];
+
+  cudaMemcpyToSymbol(MY_AP(coeff3)        , coeff_buf             , nI);
+
+  delete [] coeff_buf;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  F_FLOAT rdr_F = rdr;
+  F_FLOAT rdrho_F = rdrho;
+  cudaMemcpyToSymbol(MY_AP(box_size)   , box_size                 , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes            , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial)     , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)     , &sdata->pair.eng_vdwl.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later  , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rdr), &rdr_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(rdrho), &rdrho_F, sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(nr), &nr, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrho), &nrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nfrho), &nfrho, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nrhor), &nrhor, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(rho), &rho, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(fp), &fp, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(frho_spline), &frho_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(rhor_spline), &rhor_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(z2r_spline), &z2r_spline, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(nz2r), &nz2r, sizeof(int));
+
+  rhor_spline_size = nrhor * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  z2r_spline_size = nz2r * (nr + 1) * EAM_COEFF_LENGTH * sizeof(F_FLOAT);
+  rhor_spline_pointer = rhor_spline;
+  z2r_spline_pointer = z2r_spline;
+
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: init failed");
+
+}
+
+
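+/* EAM is evaluated in two passes: Kernel1 accumulates the electron density
+   rho[i] and the derivative of the embedding energy fp[i]; fp is then
+   exchanged for ghost atoms via Cuda_PairEAMCuda_PackComm/UnpackComm; and
+   Kernel2 computes the pair forces, which need fp on both atoms of a pair. */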
+
+void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  if(sdata->atom.update_nmax)
+    Cuda_PairEAMCuda_UpdateNmax(sdata, sneighlist);
+
+  if(sdata->atom.update_neigh)
+    Cuda_PairEAMCuda_UpdateNeighbor(sdata, sneighlist);
+
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+
+  if(sdata->buffer_new)
+    Cuda_PairEAMCuda_UpdateBuffer(sdata, sneighlist);
+
+  cudaMemcpyToSymbol(MY_AP(eatom)     , & sdata->atom.eatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom)     , & sdata->atom.vatom     .dev_data, sizeof(ENERGY_FLOAT*));
+
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  eam_buff_offset = grid.x * grid.y;
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata);
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 1 problems before kernel invocation");
+  PairEAMCuda_Kernel1 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 1 execution failed");
+
+
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
+
+}
+
+void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  int sharedperproc = 0;
+
+  if(eflag || eflag_atom) sharedperproc = 1;
+
+  if(vflag || vflag_atom) sharedperproc = 7;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  BindXTypeTexture(sdata);
+  BindEAMTextures(sdata);
+  sdata->pair.lastgridsize = grid.x * grid.y;
+  sdata->pair.n_energy_virial = sharedperproc;
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCuda: kernel start eflag: %i vflag: %i\n", eflag, vflag);)
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pre pair Kernel 2 problems before kernel invocation");
+  PairEAMCuda_Kernel2 <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom);
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 start failed");
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairEAMCuda: pair Kernel 2 execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)*sharedperproc>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairEAMCuda: virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairEAMCoulLongCuda: kernel done\n");)
+
+}
+
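+/* Forward communication of fp: the pack kernel gathers fp for the atoms in
+   the send list into the device buffer behind the per-block energy section
+   (eam_buff_offset), from where it is copied into the host send buffer;
+   unpack copies the received values back into fp for the ghost atoms. */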
+void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send)
+{
+  int3 layout = getgrid(n, 0);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  F_FLOAT* buf = (F_FLOAT*)(& ((double*)sdata->buffer)[eam_buff_offset]);
+
+  PairEAMCuda_PackComm_Kernel <<< grid, threads, 0>>> ((int*) sdata->comm.sendlist.dev_data, n
+      , sdata->comm.maxlistlength, iswap, buf);
+  cudaThreadSynchronize();
+  cudaMemcpy(buf_send, buf, n* sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  cudaThreadSynchronize();
+}
+
+void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp)
+{
+  F_FLOAT* fp_first = &(((F_FLOAT*) fp)[first]);
+  cudaMemcpy(fp_first, buf_recv, n * sizeof(F_FLOAT), cudaMemcpyHostToDevice);
+}
+
+#undef _type2frho
+#undef _type2rhor
+#undef _type2z2r
+
+
diff --git a/lib/cuda/pair_eam_cuda_cu.h b/lib/cuda/pair_eam_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d491efe70fadceef1ed2eb4faf6833d037dbd81e
--- /dev/null
+++ b/lib/cuda/pair_eam_cuda_cu.h
@@ -0,0 +1,33 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+extern "C" void Cuda_PairEAMCuda_Init(cuda_shared_data* sdata, double rdr, double rdrho, int nfrho, int nrhor, int nr, int nrho, int nz2r,
+                                      void* frho_spline, void* rhor_spline, void* z2r_spline, void* rho, void* fp,
+                                      int* type2frho, int** type2z2r, int** type2rhor);
+extern "C" void Cuda_PairEAM1Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+extern "C" void Cuda_PairEAM2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+extern "C" void Cuda_PairEAMCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send);
+extern "C" void Cuda_PairEAMCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, void* fp);
+
+#define EAM_COEFF_LENGTH 8
diff --git a/lib/cuda/pair_eam_cuda_kernel_nc.cu b/lib/cuda/pair_eam_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..458945418ab1131504faf4e86cec3f827c1f2e2c
--- /dev/null
+++ b/lib/cuda/pair_eam_cuda_kernel_nc.cu
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+
+
+static __device__ inline F_FLOAT4 fetchRhor(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if F_PRECISION == 1
+  return tex1Dfetch(_rhor_spline_tex, i);
+#else
+  return tex1Dfetch_double_f(_rhor_spline_tex, i);
+#endif
+#else
+  return _rhor_spline[i];
+#endif
+}
+
+static __device__ inline F_FLOAT4 fetchZ2r(int i)
+{
+#ifdef CUDA_USE_TEXTURE
+#if F_PRECISION == 1
+  return tex1Dfetch(_z2r_spline_tex, i);
+#else
+  return tex1Dfetch_double_f(_z2r_spline_tex, i);
+#endif
+#else
+  return _z2r_spline[i];
+#endif
+}
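+
+/* Spline table lookup used by both kernels below: the tables are sampled on
+   a uniform grid, so p = r * rdr + 1 is the table coordinate, m = (int)p the
+   interval index, and the cubic polynomial is evaluated at the fractional
+   part p - m. As an illustrative example, with dr = 0.01 (rdr = 100) and
+   r = 1.234: p = 124.4, m = 124, and the polynomial argument is 0.4. */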
+
+__global__ void PairEAMCuda_Kernel1(int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  ENERGY_FLOAT* sharedE;
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
+
+  if(eflag || eflag_atom) {
+    sharedE = &sharedmem[threadIdx.x];
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+  }
+
+  if(vflag || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT delx, dely, delz;
+  int itype;
+  int i = _nlocal;
+  int jnum = 0;
+  int* jlist;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    myxtype = fetchXType(i);
+    xtmp = myxtype.x;
+    ytmp = myxtype.y;
+    ztmp = myxtype.z;
+    itype = static_cast <int>(myxtype.w);
+
+    jnum = _numneigh[i];
+
+    jlist = &_neighbors[i];
+
+    if(i < _nlocal)
+      _rho[i] = F_F(0.0);
+  }
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    if(ii < _inum)
+      if(jj < jnum) {
+        const int j = jlist[jj * _nlocal];
+        myxtype = fetchXType(j);
+        delx = xtmp - myxtype.x;
+        dely = ytmp - myxtype.y;
+        delz = ztmp - myxtype.z;
+        int jtype = static_cast <int>(myxtype.w);
+        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+        if(rsq < _cutsq_global) {
+          F_FLOAT p = sqrt(rsq) * _rdr + F_F(1.0);
+          int m = static_cast<int>(p);
+          m = MIN(m, _nr - 1);
+          p -= m;
+          p = MIN(p, F_F(1.0));
+
+          int k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
+          F_FLOAT4 c = fetchRhor(k + 1);
+          _rho[i] += ((c.w * p + c.x) * p + c.y) * p + c.z;
+        }
+      }
+  }
+
+  if(ii < _inum) {
+
+    F_FLOAT p = _rho[i] * _rdrho + F_F(1.0);
+    int m = static_cast<int>(p);
+    m = MAX(1, MIN(m, _nrho - 1));
+    p -= m;
+    p = MIN(p, F_F(1.0));
+    F_FLOAT* coeff = &_frho_spline[(static_cast <int>(_type2frho[itype]) * (_nrho + 1) + m) * EAM_COEFF_LENGTH];
+    _fp[i] = (coeff[0] * p + coeff[1]) * p + coeff[2];
+
+    if(eflag || eflag_atom) {
+      sharedmem[threadIdx.x] += ((coeff[3] * p + coeff[4]) * p + coeff[5]) * p + coeff[6];
+    }
+
+  }
+
+  __syncthreads();
+
+  if(eflag || eflag_atom) {
+    if(i < _nlocal && eflag_atom)
+      _eatom[i] += sharedmem[threadIdx.x];
+
+    reduceBlock(sharedmem);
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+    buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(2.0) * sharedmem[0];
+  }
+}
+
+__global__ void PairEAMCuda_Kernel2(int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE;
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
+
+  if(eflag || eflag_atom) {
+    sharedE = &sharedmem[threadIdx.x];
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+  }
+
+  if(vflag || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT xtmp, ytmp, ztmp;
+  X_FLOAT4 myxtype;
+  F_FLOAT fxtmp, fytmp, fztmp, fpair;
+  F_FLOAT delx, dely, delz;
+  int itype, i = _nlocal;        // default i outside the local range for inactive threads
+  int jnum = 0;
+  int* jlist;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    myxtype = fetchXType(i);
+    xtmp = myxtype.x;
+    ytmp = myxtype.y;
+    ztmp = myxtype.z;
+    itype = static_cast <int>(myxtype.w);
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+
+    jnum = _numneigh[i];
+
+    jlist = &_neighbors[i];
+
+    if(i < _nlocal)
+      _rho[i] = F_F(0.0);
+  }
+
+  if(ii < gridDim.x * gridDim.y) evdwl = ((ENERGY_FLOAT*) _buffer)[ii];
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    if(ii < _inum)
+      if(jj < jnum) {
+        const int j = jlist[jj * _nlocal];
+        myxtype = fetchXType(j);
+        delx = xtmp - myxtype.x;
+        dely = ytmp - myxtype.y;
+        delz = ztmp - myxtype.z;
+        int jtype = static_cast <int>(myxtype.w);
+        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+        if(rsq < _cutsq_global) {
+          F_FLOAT r = _SQRT_(rsq);
+          F_FLOAT p = r * _rdr + F_F(1.0);
+          int m = static_cast<int>(p);
+          m = MIN(m, _nr - 1);
+          p -= m;
+          p = MIN(p, F_F(1.0));
+
+          int k = (static_cast <int>(_type2rhor[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
+          F_FLOAT4 c = fetchRhor(k);
+          F_FLOAT rhoip = (c.x * p + c.y) * p + c.z;
+          k = (static_cast <int>(_type2rhor[jtype * _cuda_ntypes + itype]) * (_nr + 1) + m) * 2;
+          c = fetchRhor(k);
+          F_FLOAT rhojp = (c.x * p + c.y) * p + c.z;
+          k = (static_cast <int>(_type2z2r[itype * _cuda_ntypes + jtype]) * (_nr + 1) + m) * 2;
+          c = fetchZ2r(k);
+          F_FLOAT z2p = (c.x * p + c.y) * p + c.z;
+          c = fetchZ2r(k + 1);
+          F_FLOAT z2 = ((c.w * p + c.x) * p + c.y) * p + c.z;
+
+          F_FLOAT recip = F_F(1.0) / r;
+          F_FLOAT phi = z2 * recip;
+          F_FLOAT phip = z2p * recip - phi * recip;
+          F_FLOAT psip = _fp[i] * rhojp + _fp[j] * rhoip + phip;
+          fpair = -psip * recip;
+
+          F_FLOAT dxfp, dyfp, dzfp;
+          fxtmp += dxfp = delx * fpair;
+          fytmp += dyfp = dely * fpair;
+          fztmp += dzfp = delz * fpair;
+          evdwl += phi;
+
+          if(vflag || vflag_atom) {
+            sharedV[0 * blockDim.x] += delx * dxfp;
+            sharedV[1 * blockDim.x] += dely * dyfp;
+            sharedV[2 * blockDim.x] += delz * dzfp;
+            sharedV[3 * blockDim.x] += delx * dyfp;
+            sharedV[4 * blockDim.x] += delx * dzfp;
+            sharedV[5 * blockDim.x] += dely * dzfp;
+          }
+        }
+      }
+  }
+
+  __syncthreads();
+
+  if(ii < _inum) {
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+      if(eflag) {
+        buffer = &buffer[1 * gridDim.x * gridDim.y];
+      }
+
+      if(vflag) {
+        buffer = &buffer[6 * gridDim.x * gridDim.y];
+      }
+
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = fxtmp;
+      my_f += _nmax;
+      *my_f = fytmp;
+      my_f += _nmax;
+      *my_f = fztmp;
+    } else {
+      my_f = _f + i;
+      *my_f += fxtmp;
+      my_f += _nmax;
+      *my_f += fytmp;
+      my_f += _nmax;
+      *my_f += fztmp;
+    }
+  }
+
+  __syncthreads();
+
+  if(eflag) {
+    sharedE[0] = evdwl;
+  }
+
+  if(eflag_atom && i < _nlocal) {
+    _eatom[i] += evdwl;
+  }
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
+}
+
+__global__ void PairEAMCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, F_FLOAT* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  int* list = sendlist + iswap * maxlistlength;
+
+  if(i < n) {
+    int j = list[i];
+    buffer[i] = _fp[j];
+  }
+}
+
+__global__ void PairEAMCuda_UnpackComm_Kernel(int n, int first, F_FLOAT* buffer)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(i < n) {
+    _fp[i + first] = buffer[i];
+  }
+}
diff --git a/lib/cuda/pair_gran_hooke_cuda.cu b/lib/cuda/pair_gran_hooke_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5c143240cb5efb23af24d6ce174671516aae58c2
--- /dev/null
+++ b/lib/cuda/pair_gran_hooke_cuda.cu
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _kn MY_AP(coeff1)  //[0]
+#define _kt MY_AP(coeff1)  //[1]
+#define _gamman MY_AP(coeff1) //[2]
+#define _gammat MY_AP(coeff3) //[0]
+#define _xmu MY_AP(coeff2) //[0]
+#define _dampflag MY_AP(coeff2) //[1]
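+
+// The scalar Hooke parameters travel through the generic per-type
+// coefficient arrays: kn, kt and gamman share coeff1, xmu and dampflag sit
+// in coeff2, and gammat in coeff3 (filled in Cuda_PairGranHookeCuda_Init).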
+
+#include "pair_gran_hooke_cuda_cu.h"
+#include "pair_gran_hooke_cuda_kernel_nc.cu"
+#include <time.h>
+
+void Cuda_PairGranHookeCuda_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateBuffer failed");
+  int3 layout = getgrid(sneighlist->inum, 7 * sizeof(ENERGY_FLOAT));
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+  int size = (unsigned)(layout.y * layout.x) * 7 * sizeof(ENERGY_FLOAT);
+
+  if(sdata->buffersize < size) {
+    MYDBG(printf("Cuda_PairGranHookeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
+
+    if(sdata->buffer != NULL) cudaFree(sdata->buffer);
+
+    cudaMalloc((void**)&sdata->buffer, size);
+    sdata->buffersize = size;
+    sdata->buffer_new++;
+    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
+  }
+
+  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateBuffer failed");
+}
+
+void Cuda_PairGranHookeCuda_UpdateNmax(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
+{
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: before updateNmax failed");
+  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal) , & sneighlist->firstneigh.dim[0]  , sizeof(unsigned));
+  //cudaMemcpyToSymbol(MY_AP(firstneigh), & sneighlist->firstneigh.dev_data, sizeof(int*) );
+  cudaMemcpyToSymbol(MY_AP(ilist)     , & sneighlist->ilist     .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(inum)      , & sneighlist->inum               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nlocal)    , & sdata->atom.nlocal             , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nall)      , & sdata->atom.nall               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(nmax)      , & sdata->atom.nmax               , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(numneigh)  , & sneighlist->numneigh  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(neighbors) , & sneighlist->neighbors  .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(type)      , & sdata->atom.type      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(tag)       , & sdata->atom.tag       .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(mask)      , & sdata->atom.mask      .dev_data, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(f)         , & sdata->atom.f         .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x)         , & sdata->atom.x         .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(x_type)    , & sdata->atom.x_type    .dev_data, sizeof(X_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(v_radius)  , & sdata->atom.v_radius  .dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(omega_rmass), & sdata->atom.omega_rmass.dev_data, sizeof(V_FLOAT4*));
+  cudaMemcpyToSymbol(MY_AP(torque)    , & sdata->atom.torque    .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(maxneighbors), &sneighlist->maxneighbors      , sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(eatom)     , & sdata->atom.eatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(vatom)     , & sdata->atom.vatom     .dev_data, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata              , sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(freeze_group_bit) , & sdata->pair.freeze_group_bit, sizeof(int));
+
+
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: updateNmax failed");
+}
+
+
+void Cuda_PairGranHookeCuda_Init(cuda_shared_data* sdata)
+{
+  // !! LAMMPS indexes atom types starting with 1 !!
+
+  unsigned cuda_ntypes = sdata->atom.ntypes + 2;
+
+  if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
+    printf("# CUDA: Cuda_PairGranHookeCuda_Init: you need %u types. this is more than %u "
+           "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
+           "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES_PLUS_ONE - 1);
+
+  unsigned cuda_ntypes2 = cuda_ntypes * cuda_ntypes;
+  unsigned n = sizeof(F_FLOAT) * cuda_ntypes2;
+
+  F_FLOAT coeffs1[cuda_ntypes2];
+  coeffs1[0] = (F_FLOAT) sdata->pair.coeff1[0][0];
+  coeffs1[1] = (F_FLOAT) sdata->pair.coeff1[0][1];
+  coeffs1[2] = (F_FLOAT) sdata->pair.coeff1[1][0];
+  F_FLOAT coeffs3[cuda_ntypes2];
+  coeffs3[0] = (F_FLOAT) sdata->pair.coeff1[1][1];
+  F_FLOAT coeffs2[cuda_ntypes2];
+  coeffs2[0] = (F_FLOAT) sdata->pair.coeff2[0][0];
+  coeffs2[1] = (F_FLOAT) sdata->pair.coeff2[0][1];
+
+
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+  //printf("n: %i %i\n",n,CUDA_MAX_TYPES2);
+  cudaMemcpyToSymbol(MY_AP(box_size)   , box_size                 , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes), & cuda_ntypes            , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(coeff1)        , coeffs1                   , n);
+  cudaMemcpyToSymbol(MY_AP(coeff2)        , coeffs2                   , n);
+  cudaMemcpyToSymbol(MY_AP(coeff3)        , coeffs3                   , n);
+  cudaMemcpyToSymbol(MY_AP(virial)     , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)     , &sdata->pair.eng_vdwl.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity, sizeof(int) * 3);
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: init failed");
+}
+
+
+
+void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  //if(sdata->atom.update_nmax)
+  Cuda_PairGranHookeCuda_UpdateNmax(sdata, sneighlist);
+  //if(sdata->atom.update_nlocal)
+  {
+    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));
+    cudaMemcpyToSymbol(MY_AP(nall)    , & sdata->atom.nall          , sizeof(int));
+  }
+  //if(sdata->buffer_new)
+  Cuda_PairGranHookeCuda_UpdateBuffer(sdata, sneighlist);
+
+  BindXTypeTexture(sdata);
+  BindVRadiusTexture(sdata);
+  BindOmegaRmassTexture(sdata);
+
+  int sharedperproc = 0;
+
+  if(eflag) sharedperproc += 1;
+
+  if(vflag) sharedperproc += 6;
+
+  int3 layout = getgrid(sneighlist->inum, sharedperproc * sizeof(ENERGY_FLOAT), 128);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairGranHookeCuda_Init(sdata);
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairGranHookeCuda: kernel start eflag: %i vflag: %i config: %i %i %i %i\n", eflag, vflag, grid.x, grid.y, threads.x, sharedperproc * sizeof(ENERGY_FLOAT)*threads.x);)
+
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pre pair lj cut Kernel problems before kernel invocation");
+  PairGranHookeCuda_Kernel <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x>>> (eflag, vflag, eflag_atom, vflag_atom, (int**)sneighlist->firstneigh.dev_data, sneighlist->binned_id
+      , (F_FLOAT) sdata->pair.coeff1[0][0], (F_FLOAT) sdata->pair.coeff1[1][0], (F_FLOAT) sdata->pair.coeff1[1][1], (F_FLOAT) sdata->pair.coeff2[0][0]);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) pair lj cut Kernel execution failed");
+
+  if(eflag || vflag) {
+    int n = grid.x * grid.y;
+    grid.x = sharedperproc;
+    grid.y = 1;
+    threads.x = 256;
+    MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(n);
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("Cuda_PairGranHookeCuda: (no binning) virial compute Kernel execution failed");
+  }
+
+  MYDBG(printf("# CUDA: Cuda_PairGranHookeCoulLongCuda: kernel done\n");)
+
+}
+
+
+#undef _kn
+#undef _kt
+#undef _gamman
+#undef _gammat
+#undef _xmu
+#undef _dampflag
+
+
diff --git a/lib/cuda/pair_gran_hooke_cuda_cu.h b/lib/cuda/pair_gran_hooke_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..78ad3c945266fc71a38c50ed9d8e241398b9133c
--- /dev/null
+++ b/lib/cuda/pair_gran_hooke_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairGranHookeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e6a4ed2b8a75d7a40245531cccd404c3fd3d345e
--- /dev/null
+++ b/lib/cuda/pair_gran_hooke_cuda_kernel_nc.cu
@@ -0,0 +1,227 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
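+/* Granular Hooke contact model, in outline: two particles interact only
+   while they overlap (rsq < radsum*radsum). The normal force is a damped
+   linear spring, ccel = kn*(radsum - r)/r - meff*gamman*vnnr/rsq, and the
+   tangential force is a viscous term meff*gammat*vrel capped by the Coulomb
+   limit xmu*|ccel*r|, applied against the relative tangential velocity. */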
+__global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneigh, int* binned_id
+    , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE;
+  ENERGY_FLOAT* sharedV = &sharedmem[0];   // must be valid even when only vflag is set
+
+  if(eflag || eflag_atom) {
+    sharedE = &sharedmem[threadIdx.x];
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+  }
+
+  if(vflag || vflag_atom) {
+    sharedV += threadIdx.x;
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");)
+
+  X_FLOAT xtmp, ytmp, ztmp;
+
+  X_FLOAT4 myxtype;
+  V_FLOAT4 myvradius, ovradius;
+  F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
+  F_FLOAT delx, dely, delz;
+  F_FLOAT radi, radj, radsum, r, rsqinv;
+  F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
+  F_FLOAT wr1, wr2, wr3;
+  F_FLOAT vtr1, vtr2, vtr3, vrel;
+  F_FLOAT meff, damp, ccel, tor1, tor2, tor3;
+  F_FLOAT fn, fs, ft, fs1, fs2, fs3;
+
+  int jnum = 0;
+  int i = _nlocal, j;   // default i outside the local range for inactive threads
+  int* jlist;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    myxtype = fetchXType(i);
+    myvradius = fetchVRadius(i);
+
+    xtmp = myxtype.x;
+    ytmp = myxtype.y;
+    ztmp = myxtype.z;
+    radi = myvradius.w;
+
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+    torquextmp = F_F(0.0);
+    torqueytmp = F_F(0.0);
+    torqueztmp = F_F(0.0);
+
+    jnum = _numneigh[i];
+
+    jlist = &_neighbors[i];
+  }
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    if(ii < _inum)
+      if(jj < jnum) {
+        j = jlist[jj * _nlocal];
+
+        myxtype = fetchXType(j);
+        ovradius = fetchVRadius(j);
+
+        delx = xtmp - myxtype.x;
+        dely = ytmp - myxtype.y;
+        delz = ztmp - myxtype.z;
+
+        radj = ovradius.w;
+        radsum = radi + radj;
+
+        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
+
+        if(rsq < radsum * radsum) {
+          const F_FLOAT rinv = _RSQRT_(rsq);
+          r = F_F(1.0) / rinv;
+          rsqinv = F_F(1.0) / rsq;
+
+          // relative translational velocity
+
+          vr1 = myvradius.x - ovradius.x;
+          vr2 = myvradius.y - ovradius.y;
+          vr3 = myvradius.z - ovradius.z;
+
+          // normal component
+
+          vnnr = vr1 * delx + vr2 * dely + vr3 * delz;
+          vn1 = delx * vnnr * rsqinv;
+          vn2 = dely * vnnr * rsqinv;
+          vn3 = delz * vnnr * rsqinv;
+
+          // tangential component
+
+          vt1 = vr1 - vn1;
+          vt2 = vr2 - vn2;
+          vt3 = vr3 - vn3;
+
+          // relative rotational velocity
+          V_FLOAT4 omegarmass_i = fetchOmegaRmass(i);
+          V_FLOAT4 omegarmass_j = fetchOmegaRmass(j);
+
+          wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv;
+          wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv;
+          wr3 = (radi * omegarmass_i.z + radj * omegarmass_j.z) * rinv;
+
+          meff = omegarmass_i.w * omegarmass_j.w / (omegarmass_i.w + omegarmass_j.w);
+
+          if(_mask[i] & _freeze_group_bit) meff = omegarmass_j.w;
+
+          if(_mask[j] & _freeze_group_bit) meff = omegarmass_i.w;
+
+          damp = meff * gamman * vnnr * rsqinv;
+          ccel = kn * (radsum - r) * rinv - damp;
+
+          vtr1 = vt1 - (delz * wr2 - dely * wr3);
+          vtr2 = vt2 - (delx * wr3 - delz * wr1);
+          vtr3 = vt3 - (dely * wr1 - delx * wr2);
+          vrel = vtr1 * vtr1 + vtr2 * vtr2 + vtr3 * vtr3;
+          vrel = _SQRT_(vrel);
+
+          fn = xmu * fabs(ccel * r);
+          fs = meff * gammat * vrel;
+          ft = (vrel != F_F(0.0)) ? MIN(fn, fs) / vrel : F_F(0.0);
+
+          fs1 = -ft * vtr1;
+          fs2 = -ft * vtr2;
+          fs3 = -ft * vtr3;
+
+          F_FLOAT dxfp, dyfp, dzfp;
+          fxtmp += dxfp = delx * ccel + fs1;
+          fytmp += dyfp = dely * ccel + fs2;
+          fztmp += dzfp = delz * ccel + fs3;
+
+          tor1 = rinv * (dely * fs3 - delz * fs2);
+          tor2 = rinv * (delz * fs1 - delx * fs3);
+          tor3 = rinv * (delx * fs2 - dely * fs1);
+
+          torquextmp -= radi * tor1;
+          torqueytmp -= radi * tor2;
+          torqueztmp -= radi * tor3;
+
+          if(vflag) {
+            sharedV[0 * blockDim.x] += delx * dxfp;
+            sharedV[1 * blockDim.x] += dely * dyfp;
+            sharedV[2 * blockDim.x] += delz * dzfp;
+            sharedV[3 * blockDim.x] += delx * dyfp;
+            sharedV[4 * blockDim.x] += delx * dzfp;
+            sharedV[5 * blockDim.x] += dely * dzfp;
+          }
+
+        }
+      }
+  }
+
+  __syncthreads();
+
+  if(ii < _inum) {
+    F_FLOAT* my_f = _f + i;
+    *my_f += fxtmp;
+    my_f += _nmax;
+    *my_f += fytmp;
+    my_f += _nmax;
+    *my_f += fztmp;
+    F_FLOAT* my_torque = _torque + i;
+    *my_torque += torquextmp;
+    my_torque += _nmax;
+    *my_torque += torqueytmp;
+    my_torque += _nmax;
+    *my_torque += torqueztmp;
+  }
+
+  __syncthreads();
+
+  if(eflag) sharedE[0] = evdwl;
+
+  if(eflag_atom && i < _nlocal) _eatom[i] += evdwl;
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
+}
diff --git a/lib/cuda/pair_lj96_cut_cuda.cu b/lib/cuda/pair_lj96_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1d40a3c82e9334fcdea8d0744710535316f6b531
--- /dev/null
+++ b/lib/cuda/pair_lj96_cut_cuda.cu
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj96_cut_cuda_cu.h"
+#include "pair_lj96_cut_cuda_kernel_nc.cu"
+#include <time.h>
+
+
+
+
+void Cuda_PairLJ96CutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, false, false);
+}
+
+
+
+
+void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJ96CutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
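+  // Two launch layouts: Pair_Kernel_TpA uses one thread per atom, while
+  // Pair_Kernel_BpA uses one block per atom whose threads share the
+  // neighbor loop (typically better for long neighbor lists).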
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ96_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+
+
diff --git a/lib/cuda/pair_lj96_cut_cuda_cu.h b/lib/cuda/pair_lj96_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e4b62daff2835855c6d51bf39e3d80b3c59a502
--- /dev/null
+++ b/lib/cuda/pair_lj96_cut_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJ96CutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f3c2477be6f738e18c219cdadd929baad84d295a
--- /dev/null
+++ b/lib/cuda/pair_lj96_cut_cuda_kernel_nc.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
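+// Note on the 9/6 Lennard-Jones form: the eval below needs an odd power of
+// r, so r3inv = sqrt(r6inv) supplies r^-3 and r6inv * r3inv yields the r^-9
+// repulsion, while r6inv alone yields the r^-6 attraction.
+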
+__device__ inline F_FLOAT PairLJ96CutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  const F_FLOAT r3inv = _SQRT_(r6inv);
+  const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]);
+
+  if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv - _lj4[ij_type]) - _offset[ij_type]);
+
+  return factor_lj * forcelj * r2inv;
+}
+
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..752f3bd47dbb435a14f2d7ab08ba6c8697aefd85
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda.cu
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+
+#include "pair_lj_charmm_coul_charmm_cuda_cu.h"
+#include "pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJCharmmCoulCharmmCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
+  cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv  , sizeof(F_FLOAT));
+
+  return;
+}
+
+
+
+void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+                                     int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
+{
+
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCharmmCoulCharmmCuda_Init(sdata, cut_coul_innersq, 1.0 / denom_lj, 1.0 / denom_coul);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e44d2941f2aa9de73b6ff4395df36cde895579aa
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCharmmCoulCharmmCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d4ed2f48af11dd3f98983a995f072f1e924a0706
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_cuda_kernel_nc.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+__device__ inline F_FLOAT PairLJCharmmCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+  F_FLOAT philj, switch1;
+
+  if(rsq > _cut_innersq_global) {
+    switch1 = (_cutsq_global - rsq) * (_cutsq_global - rsq) *
+              (_cutsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_innersq_global) * _denom_lj_inv;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cutsq_global - rsq) *
+                            (rsq - _cut_innersq_global) * _denom_lj_inv;
+    philj = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+    forcelj = forcelj * switch1 + philj * switch2;
+  }
+
+  if(eflag) {
+    ENERGY_FLOAT evdwl_tmp = factor_lj;
+
+    if(rsq > _cut_innersq_global) {
+      evdwl_tmp *= philj * switch1;
+    } else
+      evdwl_tmp *= r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+
+    evdwl += evdwl_tmp;
+  }
+
+  return factor_lj * forcelj * r2inv;
+}
+
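+// Between cut_coul_inner and cut_coul the CHARMM switching function
+// (switch1) ramps the Coulomb energy smoothly to zero; the force picks up
+// the derivative of the switch through switch2, so it remains the exact
+// derivative of the switched energy.
+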
+__device__ inline F_FLOAT CoulCharmmCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
+{
+  F_FLOAT forcecoul;
+  ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * _RSQRT_(rsq) * factor_coul;
+
+  if(rsq > _cut_coul_innersq_global) {
+    const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
+                            (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
+    ecoul_tmp *= switch1;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
+                            (rsq - _cut_coul_innersq_global) * _denom_coul_inv;
+    forcecoul *= switch1 + switch2;
+  }
+
+  if(eflag) {
+    ecoul += ecoul_tmp;   // factor_coul is already applied in ecoul_tmp above
+  }
+
+  return forcecoul * (F_F(1.0) / rsq);
+}
+
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..31df02b2efe3569cc7478f0f55051233371a4e1e
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda.cu
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+#define _cut_coul_innersq_global MY_AP(cut_coul_innersq_global)
+#define _denom_lj_inv MY_AP(denom_lj_inv)
+#define _denom_coul_inv MY_AP(denom_coul_inv)
+__device__ __constant__ F_FLOAT _cut_coul_innersq_global;
+__device__ __constant__ F_FLOAT _denom_lj_inv;
+__device__ __constant__ F_FLOAT _denom_coul_inv;
+
+
+#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
+#include "pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_innersq, F_FLOAT denom_lj_inv, F_FLOAT denom_coul_inv)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
+  cudaMemcpyToSymbol(MY_AP(cut_coul_innersq_global) , &cut_coul_innersq  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(denom_coul_inv) , &denom_coul_inv  , sizeof(F_FLOAT));
+
+  return;
+}
+
+
+
+void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+    int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul)
+{
+
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCharmmCoulCharmmImplicitCuda_Init(sdata, cut_coul_innersq, 1.0 / denom_lj, 1.0 / denom_coul);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
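+  // launch one thread block per atom (BpA) or one thread per atom (TpA), as flagged in sdata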
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_CHARMM_IMPLICIT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _cut_coul_innersq_global
+#undef _denom_lj_inv
+#undef _denom_coul_inv
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..c410906957cbb8e51b9e5e17745e648c93165846
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCharmmCoulCharmmImplicitCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj, F_FLOAT cut_coul_innersq, F_FLOAT denom_coul);
diff --git a/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6a20b8626accdbf4cf9698d65bf5a6d5a7d61987
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_charmm_implicit_cuda_kernel_nc.cu
@@ -0,0 +1,47 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT CoulCharmmImplicitCuda_Eval(const F_FLOAT &rsq, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
+{
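+  // implicit solvent: a distance-dependent dielectric (eps ~ r) turns the
+  // usual qq/r Coulomb energy into qq/r^2; differentiating that energy is
+  // what produces the extra factor of 2 in the returned force below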
+  F_FLOAT forcecoul;
+  ENERGY_FLOAT ecoul_tmp = forcecoul = _qqrd2e * qij * (F_F(1.0) / rsq) * factor_coul;
+
+  if(rsq > _cut_coul_innersq_global) {
+    const F_FLOAT switch1 = (_cut_coulsq_global - rsq) * (_cut_coulsq_global - rsq) *
+                            (_cut_coulsq_global + F_F(2.0) * rsq - F_F(3.0) * _cut_coul_innersq_global) * _denom_coul_inv;
+    ecoul_tmp *= switch1;
+    const F_FLOAT switch2 = F_F(12.0) * rsq * (_cut_coulsq_global - rsq) *
+                            (rsq - _cut_coul_innersq_global) * _denom_coul_inv;
+    forcecoul *= (switch1 + switch2);
+  }
+
+  if(eflag) {
+    ecoul += ecoul_tmp;  // factor_coul is already folded into ecoul_tmp above
+  }
+
+  return F_F(2.0) * forcecoul * (F_F(1.0) / rsq);
+}
+
diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda.cu b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0096f7757e5545fd808bd89c6cf21d4f87862c22
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_long_cuda.cu
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+
+#include "pair_lj_charmm_coul_long_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCharmmCoulLongCuda_Init(cuda_shared_data* sdata, F_FLOAT denom_lj_inv)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true, true, true);
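+  // denom_lj arrives pre-inverted so the kernel can multiply instead of divide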
+  cudaMemcpyToSymbol(MY_AP(denom_lj_inv) , &denom_lj_inv  , sizeof(F_FLOAT));
+
+  return;
+}
+
+
+
+void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+                                   int eflag_atom, int vflag_atom, F_FLOAT denom_lj)
+{
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCharmmCoulLongCuda_Init(sdata, 1.0 / denom_lj);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CHARMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..34b0b722ef5f173552cf2486926ffada9815f73c
--- /dev/null
+++ b/lib/cuda/pair_lj_charmm_coul_long_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCharmmCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT denom_lj);
diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda.cu b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f09a480534c9786f43e55dfd7ed99a1a219735f2
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_coul_cut_cuda.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_class2_coul_cut_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJClass2CoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJClass2CoulCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b775ad34682f6ace4a52ee1b374cecca613dbe43
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_coul_cut_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJClass2CoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda.cu b/lib/cuda/pair_lj_class2_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f20c74c33a550e8c251eb0df56e0b66b682524ee
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_coul_long_cuda.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_class2_coul_long_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJClass2CoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJClass2CoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c2fba253a16652448d561d1e4d611c20e7415f7
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_coul_long_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJClass2CoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_class2_cuda.cu b/lib/cuda/pair_lj_class2_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a72e31fd9c4ab6ab6b7f159b80bbd85737f79038
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_cuda.cu
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_class2_cuda_cu.h"
+#include "pair_lj_class2_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJClass2Cuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4);
+}
+
+void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJClass2Cuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
+  //if(CUDA_ARCH==20) maxthreads*=2;
+  //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CLASS2, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_class2_cuda_cu.h b/lib/cuda/pair_lj_class2_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e58da30450c2ca594005a4c4db4f58e13450bbef
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJClass2Cuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..761e985ec8c017b105b4bb6cb31489088d6ecf91
--- /dev/null
+++ b/lib/cuda/pair_lj_class2_cuda_kernel_nc.cu
@@ -0,0 +1,37 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT PairLJClass2Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
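+  // COMPASS/class2 9-6 potential: r3inv = sqrt(r6inv) supplies the odd
+  // power of r needed to build the r^-9 repulsive term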
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  const F_FLOAT r3inv = _SQRT_(r6inv);
+
+  if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r3inv -
+                                    _lj4[ij_type]) - _offset[ij_type]);
+
+  return factor_lj * r6inv * (_lj1[ij_type] * r3inv - _lj2[ij_type]) * r2inv;
+}
+
diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda.cu b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..88ba0300cf44d3ad3ffef3713abdc1b50264f96d
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_cut_cuda.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_coul_cut_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCutCoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCutCoulCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..5aa5ce22c8ad3f57797cd21a27c5be9c5ab7f7f6
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_cut_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCutCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda.cu b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fdbe594768ca93d0c83744a8231a50f413fa2449
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_debye_cuda.cu
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_coul_debye_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCutCoulDebyeCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCutCoulDebyeCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..59e4cdbc159008343c5d615542c876fc8b58e5a1
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_debye_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCutCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda.cu b/lib/cuda/pair_lj_cut_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..316bb68351c50cb53fb26cc6cfb4986984283fae
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_long_cuda.cu
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_coul_long_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCutCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4, true);
+}
+
+void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCutCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5ed5be999506308613f5ed83e56b4c7c6487ad8
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_coul_long_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairLJCutCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_lj_cut_cuda.cu b/lib/cuda/pair_lj_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4f2796e95883495c569955e9e2cdfaba48a26308
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_cuda.cu
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_cuda_cu.h"
+#include "pair_lj_cut_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4);
+}
+
+void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
+  //if(CUDA_ARCH==20) maxthreads*=2;
+  //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_cut_cuda_cu.h b/lib/cuda/pair_lj_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8a6365c8cba1be1f117755ede9bd57555b56a6c
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairLJCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2517a006e9bdc4e925bdd50bf40099f3f8f18d60
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_cuda_kernel_nc.cu
@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT PairLJCutCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
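+  // plain 12-6 Lennard-Jones written in powers of r^-2; _lj1/_lj2 hold the
+  // precomputed force prefactors, _lj3/_lj4 the energy prefactors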
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+
+  if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv -
+                                    _lj4[ij_type]) - _offset[ij_type]);
+
+  return factor_lj * r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]) * r2inv;
+}
+
diff --git a/lib/cuda/pair_lj_cut_experimental_cuda.cu b/lib/cuda/pair_lj_cut_experimental_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4df5755326d49c31b3cf1b6e219ddfe218e9ea85
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_experimental_cuda.cu
@@ -0,0 +1,80 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+
+#include "pair_lj_cut_experimental_cuda_cu.h"
+
+#include <time.h>
+
+void Cuda_PairLJCutExperimentalCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4);
+}
+
+void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJCutExperimentalCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  //int maxthreads=192*sizeof(double)/sizeof(F_FLOAT);
+  //if(CUDA_ARCH==20) maxthreads*=2;
+  //cudaFuncSetCacheConfig(Pair_Kernel_TpA_opt<PAIR_LJ_CUT,COUL_NONE,DATA_NONE>,cudaFuncCachePreferL1);
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+
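+  // guarantee a nonzero shared-memory allocation for the optimized kernel
+  // even when neither energy nor virial accumulation was requested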
+  if(sharedperproc == 0) sharedperproc++;
+
+  //printf("comm_phase: %i\n",sdata->comm.comm_phase);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA_opt<PAIR_LJ_CUT, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom, sdata->comm.comm_phase);
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
diff --git a/lib/cuda/pair_lj_cut_experimental_cuda_cu.h b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..34e9f5417875096cd8cb7b76da1a80bbda0787a6
--- /dev/null
+++ b/lib/cuda/pair_lj_cut_experimental_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJCutExperimentalCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_expand_cuda.cu b/lib/cuda/pair_lj_expand_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..290c9a7a97c1d9fbad9d27f6d7dc5ac27d3bd032
--- /dev/null
+++ b/lib/cuda/pair_lj_expand_cuda.cu
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _shift MY_AP(coeff5)
+
+#include "pair_lj_expand_cuda_cu.h"
+#include "pair_lj_expand_cuda_kernel_nc.cu"
+#include <time.h>
+
+
+void Cuda_PairLJExpandCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5);
+}
+
+
+
+
+void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJExpandCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_EXPAND, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _shift
+
+
diff --git a/lib/cuda/pair_lj_expand_cuda_cu.h b/lib/cuda/pair_lj_expand_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..fff9a09fbfe2506ebfaa159e835b0b0be47195d5
--- /dev/null
+++ b/lib/cuda/pair_lj_expand_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJExpandCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..fc03d6fbf48e18544a1267b6ae890ae263834f84
--- /dev/null
+++ b/lib/cuda/pair_lj_expand_cuda_kernel_nc.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+__device__ inline F_FLOAT PairLJExpandCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r = _SQRT_(rsq);
+  const F_FLOAT rshift = r - _shift[ij_type];
+  const F_FLOAT rshiftsq = rshift * rshift;
+  const F_FLOAT r2inv = F_F(1.0) / rshiftsq;
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  const F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+
+  if(eflag) evdwl += factor_lj * (r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) - _offset[ij_type]);
+
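+  // the pair interaction acts at the shifted separation r - delta, so divide
+  // once by rshift (force magnitude) and once by r (projection onto dx,dy,dz)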
+  return factor_lj * forcelj * (F_F(1.0) / rshift) * (F_F(1.0) / r);
+}
diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..354f06b54b714fea10735c367c8fd6ca3f27caf2
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda.cu
@@ -0,0 +1,104 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+#define _ljsw1 MY_AP(coeff5_gm)
+#define _ljsw2 MY_AP(coeff6_gm)
+#define _ljsw3 MY_AP(coeff7_gm)
+#define _ljsw4 MY_AP(coeff8_gm)
+#define _ljsw5 MY_AP(coeff9_gm)
+
+#define _cut_coul_inner_global MY_AP(cut_coul_inner_global)
+#define _coulsw1 MY_AP(coulsw1)
+#define _coulsw2 MY_AP(coulsw2)
+#define _coulsw5 MY_AP(coulsw5)
+__device__ __constant__ F_FLOAT _cut_coul_inner_global;
+__device__ __constant__ F_FLOAT _coulsw1;
+__device__ __constant__ F_FLOAT _coulsw2;
+__device__ __constant__ F_FLOAT _coulsw5;
+
+
+#include "pair_lj_gromacs_coul_gromacs_cuda_cu.h"
+#include "pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJGromacsCoulGromacsCuda_Init(cuda_shared_data* sdata, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 9, true, true, true);
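+  // stage the Coulomb shift coefficients in GPU constant memory once at init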
+  cudaMemcpyToSymbol(MY_AP(cut_coul_inner_global) , &cut_coul_inner  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw1) , &coulsw1  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw2) , &coulsw2  , sizeof(F_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(coulsw5) , &coulsw5  , sizeof(F_FLOAT));
+
+  return;
+}
+
+
+
+void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+                                       int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5)
+{
+  static short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJGromacsCoulGromacsCuda_Init(sdata, cut_coul_inner, coulsw1, coulsw2, coulsw5);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_GROMACS, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _ljsw1
+#undef _ljsw2
+#undef _ljsw3
+#undef _ljsw4
+#undef _ljsw5
+#undef _cut_coul_inner_global
+#undef _coulsw1
+#undef _coulsw2
+#undef _coulsw5
diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b078166ea8a6839d2a9105e612b6fb4101c56
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJGromacsCoulGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom, F_FLOAT cut_coul_inner, F_FLOAT coulsw1, F_FLOAT coulsw2, F_FLOAT coulsw5);
diff --git a/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ee6dda06f019ef1d8852331372245c440c72dfbc
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_coul_gromacs_cuda_kernel_nc.cu
@@ -0,0 +1,51 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT CoulGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_coul, int &eflag, ENERGY_FLOAT &ecoul, F_FLOAT qij)
+{
+  if(qij != F_F(0.0)) {
+    F_FLOAT ecoul_tmp;
+    F_FLOAT forcecoul = _RSQRT_(rsq);
+
+    if(eflag) ecoul_tmp = forcecoul - _coulsw5;
+
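+    // beyond the inner cutoff, add the GROMACS shift polynomial in
+    // t = r - r_inner, which drives force and energy to zero at the outer cutoff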
+    if(rsq > _cut_coul_inner_global * _cut_coul_inner_global) {
+      const F_FLOAT r = F_F(1.0) / forcecoul;
+      const F_FLOAT tc = r - _cut_coul_inner_global;
+      forcecoul += r * tc * tc * (_coulsw1 + _coulsw2 * tc);
+
+      if(eflag)  ecoul_tmp -= tc * tc * tc * (_coulsw1 * (F_F(1.0) / F_F(3.0)) + _coulsw2 * tc * (F_F(1.0) / F_F(4.0)));
+    }
+
+    F_FLOAT qprod = _qqrd2e * qij * factor_coul;
+    forcecoul *= qprod;
+
+    if(eflag) {
+      ecoul += ecoul_tmp * qprod;
+    }
+
+    return forcecoul * (F_F(1.0) / rsq);
+  }
+
+  return F_F(0.0);
+}
diff --git a/lib/cuda/pair_lj_gromacs_cuda.cu b/lib/cuda/pair_lj_gromacs_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..35cc94a3cf6bc2559ddcc8f5c1d7ab91f664b534
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_cuda.cu
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+#define _ljsw1 MY_AP(coeff5_gm)
+#define _ljsw2 MY_AP(coeff6_gm)
+#define _ljsw3 MY_AP(coeff7_gm)
+#define _ljsw4 MY_AP(coeff8_gm)
+#define _ljsw5 MY_AP(coeff9_gm)
+
+#include "pair_lj_gromacs_cuda_cu.h"
+#include "pair_lj_gromacs_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJGromacsCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 9, false, true, true);
+}
+
+
+
+void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+                            int eflag_atom, int vflag_atom)
+{
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJGromacsCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_GROMACS, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _ljsw1
+#undef _ljsw2
+#undef _ljsw3
+#undef _ljsw4
+#undef _ljsw5
diff --git a/lib/cuda/pair_lj_gromacs_cuda_cu.h b/lib/cuda/pair_lj_gromacs_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..273d190e0afe02d0cb720ede95b41271226f279b
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJGromacsCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bf9d042e65bb9409871a7dcf0c2a50ff1c9d27ab
--- /dev/null
+++ b/lib/cuda/pair_lj_gromacs_cuda_kernel_nc.cu
@@ -0,0 +1,50 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
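+//Added comment: pairwise evaluation of lj/gromacs. Plain 12-6 Lennard-Jones
+//inside the inner cutoff; beyond it a switching polynomial in
+//tlj = r - r_inner (coefficients _ljsw1.._ljsw5) ramps force and energy
+//smoothly to zero at the outer cutoff. Returns the scalar prefactor F/r.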
+__device__ inline F_FLOAT PairLJGromacsCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  F_FLOAT tlj;
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r = _RSQRT_(r2inv);
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+  F_FLOAT forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+  const X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]);
+
+  if(rsq > cut_lj_innersq) {
+    tlj = r - _SQRT_(cut_lj_innersq);
+    forcelj += r * tlj * tlj * (_ljsw1[ij_type] + _ljsw2[ij_type] * tlj);
+  }
+
+  if(eflag) {
+    ENERGY_FLOAT evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]);
+
+    if(rsq > cut_lj_innersq) {
+      evdwl_tmp += tlj * tlj * tlj *
+                   (_ljsw3[ij_type] + _ljsw4[ij_type] * tlj) + _ljsw5[ij_type];
+    }
+
+    evdwl += evdwl_tmp * factor_lj;
+  }
+
+  return factor_lj * forcelj * r2inv;
+}
diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8647b1a62e7d2141dadefcba3989744c18fc1854
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_cut_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _lj_type MY_AP(coeff5)
+
+
+#include "pair_lj_sdk_coul_cut_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairLJSDKCoulCutCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJSDKCoulCutCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_CUT, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _lj_type
+
diff --git a/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a7cea4086d49ca1b005774a170bbb6251d3292f
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_cut_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJSDKCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..48dddcae6a9b547234f3d8fd1b3c349736e010e9
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_debye_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _lj_type MY_AP(coeff5)
+
+
+#include "pair_lj_sdk_coul_debye_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairLJSDKCoulDebyeCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJSDKCoulDebyeCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_DEBYE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _lj_type
+
diff --git a/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..7cee6319d60c16025be80f07948bf9fe6a0cb5fb
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_debye_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJSDKCoulDebyeCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda.cu b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6cbe15c7ab48df4b55e1bf8a76e6819446793c00
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_long_cuda.cu
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _lj_type MY_AP(coeff5)
+
+
+#include "pair_lj_sdk_coul_long_cuda_cu.h"
+#include <time.h>
+
+
+
+
+void Cuda_PairLJSDKCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true, false);
+
+}
+
+
+
+
+void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJSDKCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 128);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _lj_type
+
diff --git a/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h b/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae407b4841973c77f6e79541eef2d8af2ae83187
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_coul_long_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJSDKCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_sdk_cuda.cu b/lib/cuda/pair_lj_sdk_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a6fcf7f7a0f0da81d0b826db0a36bd22e0c9a39d
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_cuda.cu
@@ -0,0 +1,87 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1)
+#define _lj2 MY_AP(coeff2)
+#define _lj3 MY_AP(coeff3)
+#define _lj4 MY_AP(coeff4)
+#define _lj_type MY_AP(coeff5)
+
+enum {CG_NOT_SET = 0, CG_LJ9_6, CG_LJ12_4, CG_LJ12_6, NUM_CG_TYPES,
+      CG_COUL_NONE, CG_COUL_CUT, CG_COUL_DEBYE, CG_COUL_LONG
+     };
+
+#include "pair_lj_sdk_cuda_cu.h"
+#include "pair_lj_sdk_cuda_kernel_nc.cu"
+#include <time.h>
+
+
+
+
+void Cuda_PairLJSDKCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, false, false);
+
+}
+
+
+
+
+void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJSDKCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  int maxthreads = 128;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, maxthreads);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_CG_CMM, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _lj_type
+
diff --git a/lib/cuda/pair_lj_sdk_cuda_cu.h b/lib/cuda/pair_lj_sdk_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b858cf7075c95d193810438f0debfb1f107784c
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJSDKCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f8f2474551840bb13113345c21b2d226b333df92
--- /dev/null
+++ b/lib/cuda/pair_lj_sdk_cuda_kernel_nc.cu
@@ -0,0 +1,49 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
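+//Added comment: pairwise evaluation of the SDK (CG-CMM) potential. The
+//exponent pair of the interaction (9-6, 12-4 or 12-6) is stored per type
+//pair in _lj_type; the two ternaries below select the matching powers of r
+//branch-free:
+//  rNinv_first  -> r^-4  (LJ12-x) or r^-1 (LJ9-6)
+//  rNinv_second -> -r^-2 (x-6)    or -1   (LJ12-4)
+//so forcelj = r^-4 * (lj1 * r^-4 * rNinv_first + lj2 * rNinv_second).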
+__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl) //0.11 of 0.4
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const int lj_type = _lj_type[ij_type];
+  const F_FLOAT r4inv = r2inv * r2inv;
+  const F_FLOAT rNinv_first = lj_type != CG_LJ9_6 ? r4inv : _RSQRT_(rsq);
+  const F_FLOAT rNinv_second = lj_type != CG_LJ12_4 ? -r2inv : -F_F(1.0);
+  const F_FLOAT forcelj = r4inv * (_lj1[ij_type] * r4inv * rNinv_first + _lj2[ij_type] * rNinv_second);
+
+  if(eflag) evdwl += factor_lj * (r4inv * (_lj3[ij_type] * r4inv * rNinv_first + _lj4[ij_type] * rNinv_second) - _offset[ij_type]);
+
+  return factor_lj * forcelj * r2inv;
+}
+
+/*__device__ inline F_FLOAT PairLJSDKCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
+{
+	const int lj_type = tex1Dfetch(_coeff5_gm_tex,ij_type);
+	const F_FLOAT r2inv = F_F(1.0)/rsq;
+	const F_FLOAT r4inv = r2inv*r2inv;
+	const F_FLOAT rNinv_first = lj_type!=CG_LJ9_6?r4inv:_RSQRT_(rsq);
+	const F_FLOAT rNinv_second = lj_type!=CG_LJ12_4?r2inv:F_F(1.0);
+	const F_FLOAT forcelj = r4inv * (tex1Dfetch(_coeff1_gm_tex,ij_type)*r4inv*rNinv_first - tex1Dfetch(_coeff2_gm_tex,ij_type)*rNinv_second);
+
+    if(eflag) evdwl += factor_lj*(r4inv*(tex1Dfetch(_coeff3_gm_tex,ij_type)*r4inv*rNinv_first-tex1Dfetch(_coeff4_gm_tex,ij_type)*rNinv_second));
+	return factor_lj*forcelj*r2inv;
+}*/
diff --git a/lib/cuda/pair_lj_smooth_cuda.cu b/lib/cuda/pair_lj_smooth_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..aa1df9e6bec193c2ce4f06fbb88fd7786d93ad04
--- /dev/null
+++ b/lib/cuda/pair_lj_smooth_cuda.cu
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _lj1 MY_AP(coeff1_gm)
+#define _lj2 MY_AP(coeff2_gm)
+#define _lj3 MY_AP(coeff3_gm)
+#define _lj4 MY_AP(coeff4_gm)
+#define _ljsw1 MY_AP(coeff5_gm)
+#define _ljsw2 MY_AP(coeff6_gm)
+#define _ljsw3 MY_AP(coeff7_gm)
+#define _ljsw4 MY_AP(coeff8_gm)
+#define _ljsw0 MY_AP(coeff9_gm)
+
+#include "pair_lj_smooth_cuda_cu.h"
+#include "pair_lj_smooth_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairLJSmoothCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 9, false, true, true);
+}
+
+
+
+void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,
+                           int eflag_atom, int vflag_atom)
+{
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairLJSmoothCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_LJ_SMOOTH, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_LJ_SMOOTH, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _lj1
+#undef _lj2
+#undef _lj3
+#undef _lj4
+#undef _ljsw1
+#undef _ljsw2
+#undef _ljsw3
+#undef _ljsw4
+#undef _ljsw0
diff --git a/lib/cuda/pair_lj_smooth_cuda_cu.h b/lib/cuda/pair_lj_smooth_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..78a04db227252f229c99b7bcec9316d343c74439
--- /dev/null
+++ b/lib/cuda/pair_lj_smooth_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairLJSmoothCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c1bb3b07854493195b6ee0caa2db089f6a2822ca
--- /dev/null
+++ b/lib/cuda/pair_lj_smooth_cuda_kernel_nc.cu
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
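+//Added comment: pairwise evaluation of lj/smooth. Standard 12-6
+//Lennard-Jones inside the inner cutoff; between inner and outer cutoff the
+//force magnitude follows the cubic smoothing polynomial fskin in
+//t = r - r_inner (_ljsw1.._ljsw4), and the energy is the corresponding
+//antiderivative anchored by _ljsw0. Returns the scalar prefactor F/r.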
+__device__ inline F_FLOAT PairLJSmoothCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  F_FLOAT fskin, t, tsq, forcelj;
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r = _RSQRT_(r2inv);
+  const F_FLOAT r6inv = r2inv * r2inv * r2inv;
+
+
+  X_FLOAT cut_lj_innersq = (_cut_innersq_global > X_F(0.0) ? _cut_innersq_global : _cut_innersq[ij_type]);
+
+  if(rsq < cut_lj_innersq) {
+    forcelj = r6inv * (_lj1[ij_type] * r6inv - _lj2[ij_type]);
+  } else {
+    t = r - _SQRT_(cut_lj_innersq);
+    tsq = t * t;
+    fskin = _ljsw1[ij_type] +  _ljsw2[ij_type] * t +
+            _ljsw3[ij_type] * tsq +  _ljsw4[ij_type] * tsq * t;
+    forcelj = fskin * r;
+
+  }
+
+  if(eflag) {
+    ENERGY_FLOAT evdwl_tmp;
+
+    if(rsq < cut_lj_innersq) {
+      evdwl_tmp = r6inv * (_lj3[ij_type] * r6inv - _lj4[ij_type]) -
+                  _offset[ij_type];
+    } else {
+      evdwl_tmp = _ljsw0[ij_type] - _ljsw1[ij_type] * t -
+                  _ljsw2[ij_type] * tsq * F_F(0.5) - _ljsw3[ij_type] * tsq * t * (F_F(1.0) / F_F(3.0)) -
+                  _ljsw4[ij_type] * tsq * tsq * (F_F(1.0) / F_F(4.0)) - _offset[ij_type];
+    }
+
+    evdwl += evdwl_tmp * factor_lj;
+  }
+
+  return factor_lj * forcelj * r2inv;
+}
diff --git a/lib/cuda/pair_manybody_const.h b/lib/cuda/pair_manybody_const.h
new file mode 100644
index 0000000000000000000000000000000000000000..94d644a9a1346b931d671db67d8915149833a4a5
--- /dev/null
+++ b/lib/cuda/pair_manybody_const.h
@@ -0,0 +1,16 @@
+/*
+ * pair_manybody_const.h
+ *
+ *  Created on: Oct 11, 2011
+ *      Author: chmu-tph
+ */
+
+#define MANYBODY_NPAIR 3
+
+__device__ __constant__ int elem2param[(MANYBODY_NPAIR + 1) * (MANYBODY_NPAIR + 1) * (MANYBODY_NPAIR + 1)];
+__device__ __constant__ int nelements;
+__device__ __constant__ int map[MANYBODY_NPAIR + 2];
+__device__ __constant__ int* _glob_numneigh_red;  //number of neighbors within force cutoff (as opposed to neighbor cutoff)
+__device__ __constant__ int* _glob_neighbors_red; //indices of neighbors within force cutoff
+__device__ __constant__ int* _glob_neightype_red; //type of neighbors within force cutoff
+
diff --git a/lib/cuda/pair_morse_coul_long_cuda.cu b/lib/cuda/pair_morse_coul_long_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7512eb05679a3e75f1cfac2a437682af0b5e50fb
--- /dev/null
+++ b/lib/cuda/pair_morse_coul_long_cuda.cu
@@ -0,0 +1,79 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _r0 MY_AP(coeff1)
+#define _alpha MY_AP(coeff2)
+#define _morse1 MY_AP(coeff3)
+#define _d0 MY_AP(coeff4)
+#define _c0 MY_AP(coeff5)
+
+#include "pair_morse_coul_long_cuda_cu.h"
+#include "pair_morse_coul_long_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+void Cuda_PairMorseCoulLongCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 5, true);
+}
+
+void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairMorseCoulLongCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_MORSE_R6, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_MORSE_R6, COUL_LONG, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+
+
+#undef _r0
+#undef _alpha
+#undef _morse1
+#undef _d0
+#undef _c0
+
diff --git a/lib/cuda/pair_morse_coul_long_cuda_cu.h b/lib/cuda/pair_morse_coul_long_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..c6d031a1ce89539854a70bedb68ee8ed74693ab0
--- /dev/null
+++ b/lib/cuda/pair_morse_coul_long_cuda_cu.h
@@ -0,0 +1,30 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+#ifdef CUDA_USE_BINNING
+extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
+#else
+extern "C" void Cuda_PairMorseCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
+#endif
diff --git a/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f6b436f0d681e0ee9d9c7c24dd0ec63e7b365a48
--- /dev/null
+++ b/lib/cuda/pair_morse_coul_long_cuda_kernel_nc.cu
@@ -0,0 +1,35 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
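+
+//Added comment: pairwise evaluation used by pair morse/coul/long
+//(PAIR_MORSE_R6): a Morse term with dexp = exp(-alpha*(r-r0)) plus a
+//c0*r^-12 contribution in the energy. Returns the scalar prefactor F/r;
+//_morse1 is the precomputed Morse force prefactor.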
+__device__ inline F_FLOAT PairMorseR6Cuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r2inv = F_F(1.0) / rsq;
+  const F_FLOAT r = _SQRT_(rsq);
+  const F_FLOAT r4inv = r2inv * r2inv;
+  const F_FLOAT dr = r - _r0[ij_type];
+  const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr);
+
+  if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp) + _c0[ij_type] * r4inv * r4inv * r4inv
+                                    - _offset[ij_type]);
+
+  return factor_lj * (_morse1[ij_type] * (dexp * dexp - dexp) * (F_F(1.0) / r) - F_F(12.0) * _c0[ij_type] * r4inv * r4inv * r4inv * r2inv);
+}
diff --git a/lib/cuda/pair_morse_cuda.cu b/lib/cuda/pair_morse_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b2a651b916347a401d7a19ebec051a0f8d414abc
--- /dev/null
+++ b/lib/cuda/pair_morse_cuda.cu
@@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#define _r0 MY_AP(coeff1)
+#define _alpha MY_AP(coeff2)
+#define _morse1 MY_AP(coeff3)
+#define _d0 MY_AP(coeff4)
+
+#include "pair_morse_cuda_cu.h"
+#include "pair_morse_cuda_kernel_nc.cu"
+#include <time.h>
+
+
+
+void Cuda_PairMorseCuda_Init(cuda_shared_data* sdata)
+{
+  Cuda_Pair_Init_AllStyles(sdata, 4);
+}
+
+
+
+
+void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+
+  // initialize only on first call
+  static  short init = 0;
+
+  if(! init) {
+    init = 1;
+    Cuda_PairMorseCuda_Init(sdata);
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 256);
+
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+  if(sdata->pair.use_block_per_atom)
+    Pair_Kernel_BpA<PAIR_MORSE, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+  else
+    Pair_Kernel_TpA<PAIR_MORSE, COUL_NONE, DATA_NONE>
+    <<< grid, threads, sharedperproc* sizeof(ENERGY_FLOAT)*threads.x, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
+#undef _r0
+#undef _alpha
+#undef _morse1
+#undef _d0
+
+
diff --git a/lib/cuda/pair_morse_cuda_cu.h b/lib/cuda/pair_morse_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..a9f923d20b1b88381e84d9c720f42432eec81432
--- /dev/null
+++ b/lib/cuda/pair_morse_cuda_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairMorseCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_morse_cuda_kernel_nc.cu b/lib/cuda/pair_morse_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0b3baac412ac0d6284a190ee94ac030c0e8b59fd
--- /dev/null
+++ b/lib/cuda/pair_morse_cuda_kernel_nc.cu
@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
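+
+//Added comment: pairwise evaluation of the plain Morse potential:
+//dexp = exp(-alpha*(r-r0)), E = d0*(dexp^2 - 2*dexp) - offset,
+//F/r = _morse1*(dexp^2 - dexp)/r, with _morse1 the precomputed force
+//prefactor (2*d0*alpha in the standard Morse form).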
+__device__ inline F_FLOAT PairMorseCuda_Eval(const F_FLOAT &rsq, const int ij_type, F_FLOAT &factor_lj, int &eflag, ENERGY_FLOAT &evdwl)
+{
+  const F_FLOAT r = _SQRT_(rsq);
+  const F_FLOAT dr = r - _r0[ij_type];
+  const F_FLOAT dexp = _EXP_(-_alpha[ij_type] * dr);
+
+  if(eflag) evdwl += factor_lj * (_d0[ij_type] * (dexp * dexp - F_F(2.0) * dexp)
+                                    - _offset[ij_type]);
+
+  return factor_lj * _morse1[ij_type] * (dexp * dexp - dexp) * (F_F(1.0) / r);
+}
+
diff --git a/lib/cuda/pair_sw_cuda.cu b/lib/cuda/pair_sw_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..491d4d666fc4289084e216c00d78c1bfd1a4bf12
--- /dev/null
+++ b/lib/cuda/pair_sw_cuda.cu
@@ -0,0 +1,139 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+#include "pair_sw_cuda_cu.h"
+__device__ __constant__ ParamSW_Float params_sw[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
+
+#include "pair_sw_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+
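+//Added comment: copy the box geometry, accumulator pointers, the per-triplet
+//Stillinger-Weber parameters (nelements_h^3 entries, elem2param layout), the
+//elem2param lookup table and the type->element map into constant memory.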
+void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h)
+{
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+
+  cudaMemcpyToSymbol(MY_AP(box_size)     , box_size                      , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes)  , &cuda_ntypes                   , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial)       , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)     , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity)  , sdata->domain.periodicity     , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later  , sizeof(int));
+  cudaMemcpyToSymbol(params_sw, params_host  , sizeof(ParamSW_Float)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(elem2param, elem2param_host  , sizeof(int)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(map, map_host  , sizeof(int)*cuda_ntypes);
+  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
+}
+
+void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  static int glob_ij_size = 0;
+  static F_FLOAT4* glob_r_ij = NULL;
+  static int* glob_numneigh_red = NULL;
+  static int* glob_neighbors_red = NULL;
+  static int* glob_neightype_red = NULL;
+
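+  //grow-only device buffers for the reduced (within force cutoff) neighbor
+  //list and the precomputed pair vectors r_ij; reallocated only when the
+  //required size exceeds the current allocation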
+  if(glob_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
+    glob_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
+    cudaFree(glob_r_ij);
+    cudaFree(glob_numneigh_red);
+    cudaFree(glob_neighbors_red);
+    cudaFree(glob_neightype_red);
+    cudaMalloc(&glob_r_ij, glob_ij_size * 4);
+    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
+    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij  , sizeof(F_FLOAT4*));
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+
+
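+  //launch one thread per atom for the r_ij precompute; for large systems the
+  //block count is split over grid.x and grid.y, since gridDim.x is limited
+  //to 65535 blocks on the GPUs this package targets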
+  dim3 grid2;
+
+  if(sdata->atom.nall <= 256 * 64000) {
+    grid2.x = (sdata->atom.nall + 255) / 256;
+    grid2.y = 1;
+  } else {
+    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
+    grid2.y = 128;
+  }
+
+  grid2.z = 1;
+  dim3 threads2;
+  threads2.x = 256;
+  threads2.y = 1;
+  threads2.z = 1;
+
+  timespec time1, time2;
+
+  //pre-calculate all neighbordistances and zeta_ij
+  clock_gettime(CLOCK_REALTIME, &time1);
+  Pair_SW_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>();
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test1 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+  clock_gettime(CLOCK_REALTIME, &time1);
+
+  //actual force calculation
+  unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
+
+  if(eflag) {
+    if(vflag)
+      Pair_SW_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+    else
+      Pair_SW_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+  } else {
+    if(vflag)
+      Pair_SW_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+    else
+      Pair_SW_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+  }
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test2 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
diff --git a/lib/cuda/pair_sw_cuda_cu.h b/lib/cuda/pair_sw_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3713a398902e20e8738f707a8658007111f46fe
--- /dev/null
+++ b/lib/cuda/pair_sw_cuda_cu.h
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+struct ParamSW_Float {
+  F_FLOAT epsilon, sigma;
+  F_FLOAT littlea, lambda, gamma, costheta;
+  F_FLOAT biga, bigb;
+  F_FLOAT powerp, powerq;
+  F_FLOAT tol;
+  F_FLOAT cut, cutsq;
+  F_FLOAT sigma_gamma, lambda_epsilon, lambda_epsilon2;
+  F_FLOAT c1, c2, c3, c4, c5, c6;
+  int ielement, jelement, kelement;
+};
+
+extern "C" void Cuda_PairSWCuda_Init(cuda_shared_data* sdata, ParamSW_Float* params_host, void* map_host, void* elem2param_host, int nelements_h);
+extern "C" void Cuda_PairSWCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_sw_cuda_kernel_nc.cu b/lib/cuda/pair_sw_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ade74808eefdf20b5a0e533d632914ed56a80b0a
--- /dev/null
+++ b/lib/cuda/pair_sw_cuda_kernel_nc.cu
@@ -0,0 +1,457 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#define Pi F_F(3.1415926535897932384626433832795)
+#define PI Pi
+#define PI2 (F_F(0.5)*Pi)
+#define PI4 (F_F(0.25)*Pi)
+
+
+
+__device__ void twobody(int iparam, F_FLOAT rsq, F_FLOAT &fforce,
+                        int eflag, ENERGY_FLOAT &eng)
+{
+  F_FLOAT r, rp, rq, rainv, expsrainv;
+
+  r = sqrt(rsq);
+  rp = pow(r, -params_sw[iparam].powerp);
+  rq = pow(r, -params_sw[iparam].powerq);
+  rainv = 1.0 / (r - params_sw[iparam].cut);
+  expsrainv = exp(params_sw[iparam].sigma * rainv);
+  fforce = (params_sw[iparam].c1 * rp - params_sw[iparam].c2 * rq +
+            (params_sw[iparam].c3 * rp - params_sw[iparam].c4 * rq) * rainv * rainv * r) * expsrainv / rsq;
+
+  if(eflag) eng += (params_sw[iparam].c5 * rp - params_sw[iparam].c6 * rq) * expsrainv;
+}
+
+__device__ void threebody(int paramij, int paramik, int paramijk,
+                          F_FLOAT4 &delr1,
+                          F_FLOAT4 &delr2,
+                          F_FLOAT3 &fj, F_FLOAT3 &fk, int eflag, ENERGY_FLOAT &eng)
+{
+  F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1;
+  F_FLOAT r2, rinvsq2, rainv2, gsrainv2, gsrainvsq2, expgsrainv2;
+  F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1, frad2;
+  F_FLOAT facang, facang12, csfacang, csfac1, csfac2;
+
+  r1 = sqrt(delr1.w);
+  rinvsq1 = F_F(1.0) / delr1.w;
+  rainv1 = F_F(1.0) / (r1 - params_sw[paramij].cut);
+  gsrainv1 = params_sw[paramij].sigma_gamma * rainv1;
+  gsrainvsq1 = gsrainv1 * rainv1 / r1;
+  expgsrainv1 = exp(gsrainv1);
+
+  r2 = sqrt(delr2.w);
+  rinvsq2 = F_F(1.0) / delr2.w;
+  rainv2 = F_F(1.0) / (r2 - params_sw[paramik].cut);
+  gsrainv2 = params_sw[paramik].sigma_gamma * rainv2;
+  gsrainvsq2 = gsrainv2 * rainv2 / r2;
+  expgsrainv2 = exp(gsrainv2);
+
+  rinv12 = F_F(1.0) / (r1 * r2);
+  cs = (delr1.x * delr2.x + delr1.y * delr2.y + delr1.z * delr2.z) * rinv12;
+  delcs = cs - params_sw[paramijk].costheta;
+  delcssq = delcs * delcs;
+
+  facexp = expgsrainv1 * expgsrainv2;
+
+  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
+  //          facexp*delcssq;
+
+  facrad = params_sw[paramijk].lambda_epsilon * facexp * delcssq;
+  frad1 = facrad * gsrainvsq1;
+  frad2 = facrad * gsrainvsq2;
+  facang = params_sw[paramijk].lambda_epsilon2 * facexp * delcs;
+  facang12 = rinv12 * facang;
+  csfacang = cs * facang;
+  csfac1 = rinvsq1 * csfacang;
+
+  fj.x = delr1.x * (frad1 + csfac1) - delr2.x * facang12;
+  fj.y = delr1.y * (frad1 + csfac1) - delr2.y * facang12;
+  fj.z = delr1.z * (frad1 + csfac1) - delr2.z * facang12;
+
+  csfac2 = rinvsq2 * csfacang;
+
+  fk.x = delr2.x * (frad2 + csfac2) - delr1.x * facang12;
+  fk.y = delr2.y * (frad2 + csfac2) - delr1.y * facang12;
+  fk.z = delr2.z * (frad2 + csfac2) - delr1.z * facang12;
+
+  if(eflag) eng += F_F(2.0) * facrad;
+}
+
+__device__ void threebody_fj(int paramij, int paramik, int paramijk,
+                             F_FLOAT4 &delr1,
+                             F_FLOAT4 &delr2,
+                             F_FLOAT3 &fj)
+{
+  F_FLOAT r1, rinvsq1, rainv1, gsrainv1, gsrainvsq1, expgsrainv1;
+  F_FLOAT r2, rainv2, gsrainv2, expgsrainv2;
+  F_FLOAT rinv12, cs, delcs, delcssq, facexp, facrad, frad1;
+  F_FLOAT facang, facang12, csfacang, csfac1;
+
+  r1 = sqrt(delr1.w);
+  rinvsq1 = F_F(1.0) / delr1.w;
+  rainv1 = F_F(1.0) / (r1 - params_sw[paramij].cut);
+  gsrainv1 = params_sw[paramij].sigma_gamma * rainv1;
+  gsrainvsq1 = gsrainv1 * rainv1 / r1;
+  expgsrainv1 = exp(gsrainv1);
+
+  r2 = sqrt(delr2.w);
+  rainv2 = F_F(1.0) / (r2 - params_sw[paramik].cut);
+  gsrainv2 = params_sw[paramik].sigma_gamma * rainv2;
+  expgsrainv2 = exp(gsrainv2);
+
+  rinv12 = F_F(1.0) / (r1 * r2);
+  cs = (delr1.x * delr2.x + delr1.y * delr2.y + delr1.z * delr2.z) * rinv12;
+  delcs = cs - params_sw[paramijk].costheta;
+  delcssq = delcs * delcs;
+
+  facexp = expgsrainv1 * expgsrainv2;
+
+  // facrad = sqrt(paramij->lambda_epsilon*paramik->lambda_epsilon) *
+  //          facexp*delcssq;
+
+  facrad = params_sw[paramijk].lambda_epsilon * facexp * delcssq;
+  frad1 = facrad * gsrainvsq1;
+  facang = params_sw[paramijk].lambda_epsilon2 * facexp * delcs;
+  facang12 = rinv12 * facang;
+  csfacang = cs * facang;
+  csfac1 = rinvsq1 * csfacang;
+
+  fj.x = delr1.x * (frad1 + csfac1) - delr2.x * facang12;
+  fj.y = delr1.y * (frad1 + csfac1) - delr2.y * facang12;
+  fj.z = delr1.z * (frad1 + csfac1) - delr2.z * facang12;
+}
+
+
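+// pre-pass: build a reduced neighbor list holding only pairs inside the force
+// cutoff and cache their distance vectors r_ij (x,y,z,r^2)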
+__global__ void Pair_SW_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
+{
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(ii >= _nall) return;
+
+  X_FLOAT4 myxtype;
+  F_FLOAT4 delij;
+  F_FLOAT xtmp, ytmp, ztmp;
+  int itype, jnum, i, j;
+  int* jlist;
+  int neigh_red = 0;
+  i = ii;//_ilist[ii];
+  myxtype = fetchXType(i);
+
+  xtmp = myxtype.x;
+  ytmp = myxtype.y;
+  ztmp = myxtype.z;
+  itype = map[(static_cast <int>(myxtype.w))];
+
+  jnum = _numneigh[i];
+  jlist = &_neighbors[i];
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    j = jlist[jj * _nall];
+    j &= NEIGHMASK;
+    myxtype = fetchXType(j);
+    delij.x = xtmp - myxtype.x;
+    delij.y = ytmp - myxtype.y;
+    delij.z = ztmp - myxtype.z;
+    int jtype = map[(static_cast <int>(myxtype.w))];
+    int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
+    delij.w = vec3_dot(delij, delij);
+
+    if(delij.w < params_sw[iparam_ij].cutsq) {
+      _glob_neighbors_red[i + neigh_red * _nall] = j;
+      _glob_neightype_red[i + neigh_red * _nall] = jtype;
+      _glob_r_ij[i + neigh_red * _nall] = delij;
+      neigh_red++;
+    }
+  }
+
+  _glob_numneigh_red[i] = neigh_red;
+}
+
+
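+// one-thread-per-atom force kernel: accumulates the two-body and three-body
+// Stillinger-Weber contributions for each local atom from the reduced list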
+template <int eflag, int vflagm>
+__global__ void Pair_SW_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x];
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
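+  // partition the dynamic shared memory per thread: an optional energy slot,
+  // six optional virial slots, then the fx/fy/fz force accumulators
+  // (aliased below via the fxtmp/fytmp/fztmp macros)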
+  F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem;
+
+  if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x];
+  else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x];
+  else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x];
+
+  shared_F_F += threadIdx.x;
+
+  if(eflag_atom || eflag) {
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+  }
+
+  if(vflagm || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int jnum_red = 0;
+#define fxtmp shared_F_F[0]
+#define fytmp shared_F_F[blockDim.x]
+#define fztmp shared_F_F[2*blockDim.x]
+  //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x]))
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  X_FLOAT4 myxtype_i, myxtype_j, myxtype_k;
+  F_FLOAT4 delij, delik, deljk;
+  F_FLOAT fpair;
+
+  int itype, i, j;
+  int* jlist_red;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    if(vflagm)
+      myxtype_i = fetchXType(i);
+
+    //itype=map[(static_cast <int> (myxtype_i.w))];
+    itype = map[_type[i]];
+
+
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+
+
+    //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i];
+    jnum_red = _glob_numneigh_red[i];
+    jlist_red = &_glob_neighbors_red[i];
+  }
+
+  __syncthreads();
+#pragma unroll 1
+
+  for(int jj = 0; jj < jnum_red; jj++) {
+    if(i < _nlocal) {
+      fpair = F_F(0.0);
+      j = jlist_red[jj * _nall];
+      j &= NEIGHMASK;
+
+      if(vflagm)
+        myxtype_j = fetchXType(j);
+
+      int jtype = _glob_neightype_red[i + jj * _nall];
+      delij = _glob_r_ij[i + jj * _nall];
+
+      volatile int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
+      volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype];
+
+      if(delij.w < params_sw[iparam_ij].cutsq) {
+        F_FLOAT dxfp, dyfp, dzfp;
+        twobody(iparam_ij, delij.w, fpair, eflag, evdwl);
+        fxtmp += dxfp = delij.x * fpair;
+        fytmp += dyfp = delij.y * fpair;
+        fztmp += dzfp = delij.z * fpair;
+
+        if(vflagm) {
+          sharedV[0 * blockDim.x] += delij.x * dxfp;
+          sharedV[1 * blockDim.x] += delij.y * dyfp;
+          sharedV[2 * blockDim.x] += delij.z * dzfp;
+          sharedV[3 * blockDim.x] += delij.x * dyfp;
+          sharedV[4 * blockDim.x] += delij.x * dzfp;
+          sharedV[5 * blockDim.x] += delij.y * dzfp;
+        }
+
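+        // delij was cached as x_i - x_j; flip it to point from i to j as the
+        // three-body routines expect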
+        vec3_scale(F_F(-1.0), delij, delij);
+
+#pragma unroll 1
+
+        for(int kk = jj + 1; kk < jnum_red; kk++) {
+          int k = jlist_red[kk * _nall];
+          k &= NEIGHMASK;
+
+          if(vflagm)
+            myxtype_k = fetchXType(k);
+
+          delik = _glob_r_ij[i + kk * _nall];
+
+          int ktype = _glob_neightype_red[i + kk * _nall];
+          int iparam_ik = elem2param[(itype * nelements + ktype) * nelements + ktype];
+          int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype];
+          vec3_scale(F_F(-1.0), delik, delik);
+
+          if(delik.w <= params_sw[iparam_ijk].cutsq) {
+            F_FLOAT3 fj, fk;
+            threebody(iparam_ij, iparam_ik, iparam_ijk,
+                      delij, delik, fj, fk, eflag, evdwl);
+            fxtmp -= fj.x + fk.x;
+            fytmp -= fj.y + fk.y;
+            fztmp -= fj.z + fk.z;
+
+            if(vflagm) {
+              sharedV[0 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.x + fk.x);
+              sharedV[1 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.y * (fj.y + fk.y);
+              sharedV[2 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.z * (fj.z + fk.z);
+              sharedV[3 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.y + fk.y);
+              sharedV[4 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.x * (fj.z + fk.z);
+              sharedV[5 * blockDim.x] -= ENERGY_F(2.0) * myxtype_i.y * (fj.z + fk.z);
+
+              sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.x;
+              sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.y;
+              sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.z * fj.z;
+              sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.y;
+              sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.z;
+              sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.z;
+
+              sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.x;
+              sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.y;
+              sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.z * fk.z;
+              sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.y;
+              sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.z;
+              sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z;
+            }
+          }
+        }
+
+        int j_jnum_red = _glob_numneigh_red[j];
+        int* j_jlist_red = &_glob_neighbors_red[j];
+
+        int j_ii = 0;
+
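+        // locate i in j's reduced neighbor list; that entry is skipped below
+        // so atom i is excluded from the angles centered at j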
+        //#pragma unroll 1
+        for(int j_kk = 0; j_kk < j_jnum_red; j_kk++) {
+          if(j_jlist_red[j_kk * _nall] == i) j_ii = j_kk;
+        }
+
+#pragma unroll 1
+
+        for(int kk = 0; kk < j_jnum_red; kk++) {
+          if(j_ii == kk) continue;
+
+          int k = j_jlist_red[kk * _nall];
+          k &= NEIGHMASK;
+          deljk = _glob_r_ij[j + kk * _nall];
+          vec3_scale(F_F(-1.0), deljk, deljk);
+          int ktype = _glob_neightype_red[j + kk * _nall];
+
+          int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype];
+          int iparam_jk = elem2param[(jtype * nelements + ktype) * nelements + ktype];
+          int iparam_jik = elem2param[(jtype * nelements + itype) * nelements + ktype];
+
+
+          vec3_scale(F_F(-1.0), delij, delij);
+
+          if(deljk.w <= params_sw[iparam_jik].cutsq) {
+            F_FLOAT3 fj;
+
+            threebody_fj(iparam_ji, iparam_jk, iparam_jik,
+                         delij, deljk, fj);
+            fxtmp += fj.x;
+            fytmp += fj.y;
+            fztmp += fj.z;
+
+          }
+
+          vec3_scale(F_F(-1.0), delij, delij);
+        }
+      }
+    }
+
+  }
+
+  __syncthreads();
+
+  if(ii < _inum) {
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
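+      // skip the per-block energy (1 slot) and virial (6 slots) partial sums
+      // already staged at the start of _buffer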
+      if(eflag) {
+        buffer = &buffer[1 * gridDim.x * gridDim.y];
+      }
+
+      if(vflagm) {
+        buffer = &buffer[6 * gridDim.x * gridDim.y];
+      }
+
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = fxtmp;
+      my_f += _nmax;
+      *my_f = fytmp;
+      my_f += _nmax;
+      *my_f = fztmp;
+    } else {
+      my_f = _f + i;
+      *my_f += fxtmp;
+      my_f += _nmax;
+      *my_f += fytmp;
+      my_f += _nmax;
+      *my_f += fztmp;
+    }
+  }
+
+  __syncthreads();
+
+  if(eflag) {
+    sharedE[0] = evdwl;
+  }
+
+  if(eflag_atom && i < _nlocal) {
+    _eatom[i] = ENERGY_F(0.5) * evdwl;
+  }
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]         = ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]   = ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] = ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] = ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] = ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] = ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflagm && eflag) PairVirialCompute_A_Kernel_Template<1, 1>();
+  else if(eflag) PairVirialCompute_A_Kernel_Template<1, 0>();
+  else if(vflagm) PairVirialCompute_A_Kernel_Template<0, 1>();
+
+#undef fxtmp
+#undef fytmp
+#undef fztmp
+  //#undef jnum_red
+}
diff --git a/lib/cuda/pair_tersoff_cuda.cu b/lib/cuda/pair_tersoff_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0ae5e846a000545c12377f7718bb4e48ee1cf154
--- /dev/null
+++ b/lib/cuda/pair_tersoff_cuda.cu
@@ -0,0 +1,154 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+
+
+#include "pair_tersoff_cuda_cu.h"
+__device__ __constant__ Param_Float params[MANYBODY_NPAIR* MANYBODY_NPAIR* MANYBODY_NPAIR];
+__device__ __constant__ F_FLOAT* _glob_zeta_ij; //zeta_ij
+__device__ __constant__ F_FLOAT4* _glob_r_ij; //r_ij (x,y,z,r^2) for pairs within force cutoff
+__device__ __constant__ bool _zbl; //is tersoff zbl?
+
+
+#include "pair_tersoff_cuda_kernel_nc.cu"
+
+#include <time.h>
+
+
+void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl)
+{
+  unsigned cuda_ntypes = sdata->atom.ntypes + 1;
+  X_FLOAT box_size[3] = {
+    sdata->domain.subhi[0] - sdata->domain.sublo[0],
+    sdata->domain.subhi[1] - sdata->domain.sublo[1],
+    sdata->domain.subhi[2] - sdata->domain.sublo[2]
+  };
+
+  cudaMemcpyToSymbol(MY_AP(box_size)     , box_size                      , sizeof(X_FLOAT) * 3);
+  cudaMemcpyToSymbol(MY_AP(cuda_ntypes)  , &cuda_ntypes                   , sizeof(unsigned));
+  cudaMemcpyToSymbol(MY_AP(virial)       , &sdata->pair.virial.dev_data   , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(eng_vdwl)     , &sdata->pair.eng_vdwl.dev_data , sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(periodicity)  , sdata->domain.periodicity     , sizeof(int) * 3);
+  cudaMemcpyToSymbol(MY_AP(collect_forces_later), &sdata->pair.collect_forces_later  , sizeof(int));
+  cudaMemcpyToSymbol(params, params_host  , sizeof(Param_Float)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(elem2param, elem2param_host  , sizeof(int)*nelements_h * nelements_h * nelements_h);
+  cudaMemcpyToSymbol(map, map_host  , sizeof(int)*cuda_ntypes);
+  cudaMemcpyToSymbol(nelements, &nelements_h, sizeof(int));
+  cudaMemcpyToSymbol(_zbl, &zbl, sizeof(bool));
+
+}
+
+void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
+{
+  static F_FLOAT* glob_zeta_ij = NULL;
+  static int glob_zeta_ij_size = 0;
+  static F_FLOAT4* glob_r_ij = NULL;
+  static int* glob_numneigh_red = NULL;
+  static int* glob_neighbors_red = NULL;
+  static int* glob_neightype_red = NULL;
+
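+  // grow-only scratch buffers: reallocate when the required zeta_ij storage
+  // exceeds the current size, then re-publish the device pointers to constant memory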
+  if(glob_zeta_ij_size < sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT)) {
+    glob_zeta_ij_size = sdata->atom.nall * sneighlist->maxneighbors * sizeof(F_FLOAT);
+    cudaFree(glob_zeta_ij);
+    cudaFree(glob_r_ij);
+    cudaFree(glob_numneigh_red);
+    cudaFree(glob_neighbors_red);
+    cudaFree(glob_neightype_red);
+    cudaMalloc(&glob_zeta_ij, glob_zeta_ij_size);
+    cudaMalloc(&glob_r_ij, glob_zeta_ij_size * 4);
+    cudaMalloc(&glob_numneigh_red, sdata->atom.nall * sizeof(int));
+    cudaMalloc(&glob_neighbors_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMalloc(&glob_neightype_red, sdata->atom.nall * sneighlist->maxneighbors * sizeof(int));
+    cudaMemcpyToSymbol(_glob_numneigh_red, &glob_numneigh_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neighbors_red, &glob_neighbors_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_neightype_red, &glob_neightype_red  , sizeof(int*));
+    cudaMemcpyToSymbol(_glob_r_ij, &glob_r_ij  , sizeof(F_FLOAT4*));
+    cudaMemcpyToSymbol(_glob_zeta_ij, &glob_zeta_ij  , sizeof(F_FLOAT*));
+  }
+
+  dim3 grid, threads;
+  int sharedperproc;
+
+  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 64);
+  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
+
+
+
+  dim3 grid2;
+
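+  // one thread per atom over all nall atoms; use a 2D grid when a 1D grid
+  // would exceed the per-dimension block-count limit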
+  if(sdata->atom.nall <= 256 * 64000) {
+    grid2.x = (sdata->atom.nall + 255) / 256;
+    grid2.y = 1;
+  } else {
+    grid2.x = (sdata->atom.nall + 256 * 128 - 1) / (256 * 128);
+    grid2.y = 128;
+  }
+
+  grid2.z = 1;
+  dim3 threads2;
+  threads2.x = 256;
+  threads2.y = 1;
+  threads2.z = 1;
+
+  timespec time1, time2;
+
+  //pre-calculate all neighbor distances and zeta_ij
+  clock_gettime(CLOCK_REALTIME, &time1);
+  Pair_Tersoff_Kernel_TpA_RIJ <<< grid2, threads2, 0, streams[1]>>>
+  ();
+  cudaThreadSynchronize();
+  Pair_Tersoff_Kernel_TpA_ZetaIJ <<< grid2, threads2, 0, streams[1]>>>
+  ();
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test1 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+  clock_gettime(CLOCK_REALTIME, &time1);
+
+  //actual force calculation
+  unsigned int sharedsize = (sharedperproc * sizeof(ENERGY_FLOAT) + 4 * sizeof(F_FLOAT)) * threads.x; //extra 4 floats per thread used to reduce register pressure
+
+  if(eflag) {
+    if(vflag)
+      Pair_Tersoff_Kernel_TpA<1, 1> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+    else
+      Pair_Tersoff_Kernel_TpA<1, 0> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+  } else {
+    if(vflag)
+      Pair_Tersoff_Kernel_TpA<0, 1> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+    else
+      Pair_Tersoff_Kernel_TpA<0, 0> <<< grid, threads, sharedsize, streams[1]>>>
+      (eflag_atom, vflag_atom);
+  }
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME, &time2);
+  sdata->cuda_timings.test2 +=
+    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
+
+  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
+}
+
diff --git a/lib/cuda/pair_tersoff_cuda_cu.h b/lib/cuda/pair_tersoff_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4eb81827f6a8deb9e8a89c141f86a685091db48
--- /dev/null
+++ b/lib/cuda/pair_tersoff_cuda_cu.h
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+struct Param_Float {
+  F_FLOAT lam1, lam2, lam3;
+  F_FLOAT c, d, h;
+  F_FLOAT gamma, powerm;
+  F_FLOAT powern, beta;
+  F_FLOAT biga, bigb, bigd, bigr;
+  F_FLOAT cut, cutsq;
+  F_FLOAT c1, c2, c3, c4;
+  int ielement, jelement, kelement;
+  int powermint;
+  //F_FLOAT Z_i,Z_j;
+  F_FLOAT ZBLcut, ZBLexpscale;
+  F_FLOAT a_ij, premult;
+};
+
+extern "C" void Cuda_PairTersoffCuda_Init(cuda_shared_data* sdata, Param_Float* params_host, void* map_host, void* elem2param_host, int nelements_h, bool zbl);
+extern "C" void Cuda_PairTersoffCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
diff --git a/lib/cuda/pair_tersoff_cuda_kernel_nc.cu b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e5143b36adc5d4b0efd409e9999d6885240f121a
--- /dev/null
+++ b/lib/cuda/pair_tersoff_cuda_kernel_nc.cu
@@ -0,0 +1,1097 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#define Pi F_F(3.1415926535897932384626433832795)
+#define PI Pi
+#define PI2 (F_F(0.5)*Pi)
+#define PI4 (F_F(0.25)*Pi)
+template <const int eflag, const int vflag>
+static inline __device__ void PairVirialCompute_A_Kernel_Template()
+{
+  __syncthreads();
+  ENERGY_FLOAT* shared = sharedmem;
+
+  if(eflag) {
+    reduceBlock(shared);
+    shared += blockDim.x;
+  }
+
+  if(vflag) {
+    reduceBlock(shared + 0 * blockDim.x);
+    reduceBlock(shared + 1 * blockDim.x);
+    reduceBlock(shared + 2 * blockDim.x);
+    reduceBlock(shared + 3 * blockDim.x);
+    reduceBlock(shared + 4 * blockDim.x);
+    reduceBlock(shared + 5 * blockDim.x);
+  }
+
+  if(threadIdx.x == 0) {
+    shared = sharedmem;
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+    if(eflag) {
+      buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
+      shared += blockDim.x;
+      buffer += gridDim.x * gridDim.y;
+    }
+
+    if(vflag) {
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
+      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
+    }
+  }
+
+  __syncthreads();
+}
+
+__global__ void virial_fdotr_compute_kernel(int eflag)
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0];
+  ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x];
+  sharedE += threadIdx.x;
+  sharedVirial += threadIdx.x;
+
+  if(i < _nlocal) {
+
+    F_FLOAT x = _x[i];
+    F_FLOAT y = _x[i + _nmax];
+    F_FLOAT z = _x[i + 2 * _nmax];
+    F_FLOAT fx = _f[i];
+    F_FLOAT fy = _f[i + _nmax];
+    F_FLOAT fz = _f[i + 2 * _nmax];
+    //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz);
+    sharedVirial[0] = fx * x;
+    sharedVirial[1 * blockDim.x] = fy * y;
+    sharedVirial[2 * blockDim.x] = fz * z;
+    sharedVirial[3 * blockDim.x] = fy * x;
+    sharedVirial[4 * blockDim.x] = fz * x;
+    sharedVirial[5 * blockDim.x] = fz * y;
+  } else {
+    sharedVirial[0] = 0;
+    sharedVirial[1 * blockDim.x] = 0;
+    sharedVirial[2 * blockDim.x] = 0;
+    sharedVirial[3 * blockDim.x] = 0;
+    sharedVirial[4 * blockDim.x] = 0;
+    sharedVirial[5 * blockDim.x] = 0;
+  }
+
+  sharedVirial = (ENERGY_FLOAT*) &sharedmem[0];
+  sharedVirial += blockDim.x;
+  reduceBlockP2(sharedVirial);
+  reduceBlockP2(&sharedVirial[1 * blockDim.x]);
+  reduceBlockP2(&sharedVirial[2 * blockDim.x]);
+  reduceBlockP2(&sharedVirial[3 * blockDim.x]);
+  reduceBlockP2(&sharedVirial[4 * blockDim.x]);
+  reduceBlockP2(&sharedVirial[5 * blockDim.x]);
+
+  if(threadIdx.x < 6) {
+    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
+
+    if(eflag) buffer = &buffer[gridDim.x * gridDim.y];
+
+    buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y] = sharedVirial[threadIdx.x * blockDim.x];
+  }
+}
+
+/*#define vec3_scale(K,X,Y) Y.x = K*X.x;  Y.y = K*X.y;  Y.z = K*X.z;
+#define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x;  Z.y = K*X.y+Y.y;  Z.z = K*X.z+Y.z;
+#define vec3_add(X,Y,Z) Z.x = X.x+Y.x;  Z.y = X.y+Y.y;  Z.z = X.z+Y.z;
+#define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/
+
+__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y)
+{
+  y.x = k * x.x;
+  y.y = k * x.y;
+  y.z = k * x.z;
+}
+
+__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT3 &y)
+{
+  y.x = k * x.x;
+  y.y = k * x.y;
+  y.z = k * x.z;
+}
+
+__device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT4 &y)
+{
+  y.x = k * x.x;
+  y.y = k * x.y;
+  y.z = k * x.z;
+}
+
+__device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z)
+{
+  z.x = k * x.x + y.x;
+  z.y = k * x.y + y.y;
+  z.z = k * x.z + y.z;
+}
+
+__device__ inline void vec3_add(F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z)
+{
+  z.x = x.x + y.x;
+  z.y = x.y + y.y;
+  z.z = x.z + y.z;
+}
+
+__device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y)
+{
+  return x.x * y.x + x.y * y.y + x.z * y.z;
+}
+
+__device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y)
+{
+  return x.x * y.x + x.y * y.y + x.z * y.z;
+}
+
+/* ----------------------------------------------------------------------
+   Fermi-like smoothing function
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam)
+{
+  return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut)));
+}
+
+/* ----------------------------------------------------------------------
+   Fermi-like smoothing function derivative with respect to r
+------------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam)
+{
+  volatile const F_FLOAT tmp =  exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut));
+  return params[iparam].ZBLexpscale * tmp /
+         ((F_F(1.0) + tmp) * (F_F(1.0) + tmp));
+}
+
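+/* ----------------------------------------------------------------------
+   Tersoff cutoff function fc(r) and its derivative: 1 below bigr - bigd,
+   0 above bigr + bigd, smooth sine interpolation in between
+------------------------------------------------------------------------- */
+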
+__device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D)
+{
+  return (r < ters_R - ters_D) ? F_F(1.0) : ((r > ters_R + ters_D) ?
+         F_F(0.0) : F_F(0.5) * (F_F(1.0) - sin(PI2 * (r - ters_R) / ters_D)));
+}
+
+__device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D)
+{
+  return ((r < ters_R - ters_D) || (r > ters_R + ters_D)) ?
+         F_F(0.0) : -(PI4 / ters_D) * cos(PI2 * (r - ters_R) / ters_D);
+}
+
+
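+/* ----------------------------------------------------------------------
+   Tersoff angular function g(theta) and its derivative wrt cos(theta)
+------------------------------------------------------------------------- */
+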
+__device__ inline F_FLOAT ters_gijk(F_FLOAT &cos_theta, int iparam)
+{
+  F_FLOAT ters_c = params[iparam].c;
+  F_FLOAT ters_d = params[iparam].d;
+
+  return params[iparam].gamma * (F_F(1.0) + pow(params[iparam].c / params[iparam].d, F_F(2.0)) -
+                                 pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0))));
+}
+
+__device__ F_FLOAT ters_gijk2(F_FLOAT &cos_theta, int iparam)
+{
+  F_FLOAT ters_c = params[iparam].c;
+  F_FLOAT ters_d = params[iparam].d;
+
+  return params[iparam].gamma * (F_F(1.0) + pow(ters_c / ters_d, F_F(2.0)) -
+                                 pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0))));
+}
+
+__device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam)
+{
+  F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c, F_F(2.0)) * (params[iparam].h - costheta);
+  F_FLOAT denominator = pow(pow(params[iparam].d, F_F(2.0)) +
+                            pow(params[iparam].h - costheta, F_F(2.0)), F_F(2.0));
+  return params[iparam].gamma * numerator / denominator;
+}
+
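+/* ----------------------------------------------------------------------
+   contribution of neighbor k to the bond-order argument zeta_ij:
+   fc(rik) * exp(lam3*(rij-rik)) * g(theta_ijk)
+   (the exponent's argument is cubed when powermint == 3)
+------------------------------------------------------------------------- */
+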
+__device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik,
+                               F_FLOAT3 &delij, F_FLOAT3 &delik)
+{
+  F_FLOAT rij, rik, costheta, arg, ex_delr;
+
+  rij = sqrt(rsqij);
+  rik = sqrt(rsqik);
+  costheta = vec3_dot(delij, delik) / (rij * rik);
+
+  arg = (params[iparam].powermint == 3) ? (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)) : params[iparam].lam3 * (rij - rik);
+
+  if(arg > F_F(69.0776)) ex_delr = F_F(1.e30);
+  else if(arg < -F_F(69.0776)) ex_delr = F_F(0.0);
+  else ex_delr = exp(arg);
+
+  return ters_fc(rik, params[iparam].bigr, params[iparam].bigd) * ex_delr * params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c / (params[iparam].d * params[iparam].d)) -
+         (params[iparam].c * params[iparam].c) / ((params[iparam].d * params[iparam].d) + (params[iparam].h - costheta) * (params[iparam].h - costheta)));
+}
+
+__device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce,
+                          int eflag, ENERGY_FLOAT &eng)
+{
+  F_FLOAT r, tmp_fc, tmp_fc_d, tmp_exp;
+
+  F_FLOAT ters_R = params[iparam].bigr;
+  F_FLOAT ters_D = params[iparam].bigd;
+  r = sqrt(rsq);
+  tmp_fc = ters_fc(r, ters_R, ters_D);
+  tmp_fc_d = ters_fc_d(r, ters_R, ters_D);
+  tmp_exp = exp(-params[iparam].lam1 * r);
+
+  if(!_zbl) {
+    fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1) / r;
+
+    if(eflag) eng += tmp_fc * params[iparam].biga * tmp_exp;
+  } else {
+    F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1);
+    ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp;
+
+    F_FLOAT r_ov_a = r / params[iparam].a_ij;
+    F_FLOAT phi = F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) + F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) +
+                  F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) + F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a);
+    F_FLOAT dphi = (F_F(1.0) / params[iparam].a_ij) * (-F_F(3.2) * F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) -
+                   F_F(0.9423) * F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) -
+                   F_F(0.4029) * F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) -
+                   F_F(0.2016) * F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a));
+    F_FLOAT fforce_ZBL = params[iparam].premult / (-r * r) * phi + params[iparam].premult / r * dphi;
+    ENERGY_FLOAT eng_ZBL = params[iparam].premult * (F_F(1.0) / r) * phi;
+
+    fforce = -(-F_fermi_d(r, iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r, iparam) * (fforce_ters - fforce_ZBL)) / r;
+
+    if(eflag)
+      eng += eng_ZBL + F_fermi(r, iparam) * (eng_ters - eng_ZBL);
+  }
+
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D)
+{
+  if(r > ters_R + ters_D) return F_F(0.0);
+
+  if(_zbl)
+    return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D) * F_fermi(r, iparam);
+  else
+    return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D);
+}
+
+/* ---------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D)
+{
+  if(r > ters_R + ters_D) return F_F(0.0);
+
+  if(_zbl)
+    return params[iparam].bigb * exp(-params[iparam].lam2 * r) *
+           ((params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D)) * F_fermi(r, iparam)
+            - ters_fc(r, ters_R, ters_D) * F_fermi_d(r, iparam));
+  else
+    return params[iparam].bigb * exp(-params[iparam].lam2 * r) *
+           (params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D));
+}
+
+/* ---------------------------------------------------------------------- */
+
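+// bond order b_ij(zeta): the precomputed thresholds c1..c4 select cheap
+// asymptotic branches of (1 + (beta*zeta)^n)^(-1/(2n)) for extreme arguments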
+__device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam)
+{
+  F_FLOAT tmp = params[iparam].beta * zeta;
+
+  if(tmp > params[iparam].c1) return F_F(1.0) / sqrt(tmp);
+
+  if(tmp > params[iparam].c2)
+    return (F_F(1.0) - pow(tmp, -params[iparam].powern) / (F_F(2.0) * params[iparam].powern)) / sqrt(tmp);
+
+  if(tmp < params[iparam].c4) return F_F(1.0);
+
+  if(tmp < params[iparam].c3)
+    return F_F(1.0) - pow(tmp, params[iparam].powern) / (F_F(2.0) * params[iparam].powern);
+
+  return pow(F_F(1.0) + pow(tmp, params[iparam].powern), -F_F(1.0) / (F_F(2.0) * params[iparam].powern));
+}
+
+/* ---------------------------------------------------------------------- */
+
+__device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam)
+{
+  F_FLOAT tmp = params[iparam].beta * zeta;
+
+  if(tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5) * pow(tmp, -F_F(1.5));
+
+  if(tmp > params[iparam].c2)
+    return params[iparam].beta * (-F_F(0.5) * pow(tmp, -F_F(1.5)) *
+                                  (F_F(1.0) - F_F(0.5) * (F_F(1.0) +  F_F(1.0) / (F_F(2.0) * params[iparam].powern)) *
+                                   pow(tmp, -params[iparam].powern)));
+
+  if(tmp < params[iparam].c4) return F_F(0.0);
+
+  if(tmp < params[iparam].c3)
+    return -F_F(0.5) * params[iparam].beta * pow(tmp, params[iparam].powern - F_F(1.0));
+
+  F_FLOAT tmp_n = pow(tmp, params[iparam].powern);
+  return -F_F(0.5) * pow(F_F(1.0) + tmp_n, -F_F(1.0) - (F_F(1.0) / (F_F(2.0) * params[iparam].powern))) * tmp_n / zeta;
+}
+
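+// attractive pair interaction: fforce is the radial force from b_ij * fa, and
+// prefactor = -0.5 * fa * db_ij/dzeta feeds the three-body derivative terms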
+__device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
+                           F_FLOAT &fforce, F_FLOAT &prefactor,
+                           int eflag, F_FLOAT &eng)
+{
+  F_FLOAT r, fa, fa_d, bij;
+  F_FLOAT ters_R = params[iparam].bigr;
+  F_FLOAT ters_D = params[iparam].bigd;
+  r = sqrt(rsq);
+  fa = ters_fa(r, iparam, ters_R, ters_D);
+  fa_d = ters_fa_d(r, iparam, ters_R, ters_D);
+  bij = ters_bij(zeta_ij, iparam);
+  fforce = F_F(0.5) * bij * fa_d / r;
+  prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
+
+  if(eflag) eng += bij * fa;
+}
+
+__device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
+    F_FLOAT &fforce, F_FLOAT &prefactor)
+{
+  F_FLOAT r, fa, fa_d, bij;
+  F_FLOAT ters_R = params[iparam].bigr;
+  F_FLOAT ters_D = params[iparam].bigd;
+  r = sqrt(rsq);
+  fa = ters_fa(r, iparam, ters_R, ters_D);
+  fa_d = ters_fa_d(r, iparam, ters_R, ters_D);
+  bij = ters_bij(zeta_ij, iparam);
+  fforce = F_F(0.5) * bij * fa_d / r;
+  prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
+}
+
+__device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
+                                     F_FLOAT &prefactor)
+{
+  F_FLOAT r, fa;
+  r = sqrt(rsq);
+  fa = ters_fa(r, iparam, params[iparam].bigr, params[iparam].bigd);
+  prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
+}
+
+
+__device__ void costheta_d(F_FLOAT3 &rij_hat, F_FLOAT &rij,
+                           F_FLOAT3 &rik_hat, F_FLOAT &rik,
+                           F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk)
+{
+  // first element is derivative wrt Ri, second wrt Rj, third wrt Rk
+
+  F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
+
+  vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj);
+  vec3_scale(F_F(1.0) / rij, drj, drj);
+  vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk);
+  vec3_scale(F_F(1.0) / rik, drk, drk);
+  vec3_add(drj, drk, dri);
+  vec3_scale(-F_F(1.0), dri, dri);
+}
+
+__device__ void ters_zetaterm_d(F_FLOAT prefactor,
+                                F_FLOAT3 &rij_hat, F_FLOAT rij,
+                                F_FLOAT3 &rik_hat, F_FLOAT rik,
+                                F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk,
+                                int iparam)
+{
+  F_FLOAT ex_delr, ex_delr_d, tmp;
+  F_FLOAT3 dcosdri, dcosdrj, dcosdrk;
+
+  if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
+  else tmp = params[iparam].lam3 * (rij - rik);
+
+  if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
+  else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
+  else ex_delr = exp(tmp);
+
+  if(params[iparam].powermint == 3)
+    ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
+  else ex_delr_d = params[iparam].lam3 * ex_delr;
+
+
+  const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
+  costheta_d(rij_hat, rij, rik_hat, rik, dcosdri, dcosdrj, dcosdrk);
+
+  const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
+                       (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
+  const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
+  const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
+                              (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
+  const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
+  // dri = -dfc*gijk*ex_delr*rik_hat;
+  // dri += fc*gijk_d*ex_delr*dcosdri;
+  // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat);
+  const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
+  const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
+
+
+  vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri);
+  vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri);
+  vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri);
+  vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri);
+  vec3_scale(prefactor, dri, dri);
+  // compute the derivative wrt Rj
+  // drj = fc*gijk_d*ex_delr*dcosdrj;
+  // drj += fc*gijk*ex_delr_d*rij_hat;
+
+  vec3_scale(fc * gijk_d * ex_delr, dcosdrj, drj);
+  vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj);
+  vec3_scale(prefactor, drj, drj);
+
+  // compute the derivative wrt Rk
+  // drk = dfc*gijk*ex_delr*rik_hat;
+  // drk += fc*gijk_d*ex_delr*dcosdrk;
+  // drk += -fc*gijk*ex_delr_d*rik_hat;
+
+  vec3_scale(dfc * gijk * ex_delr, rik_hat, drk);
+  vec3_scaleadd(fc * gijk_d * ex_delr, dcosdrk, drk, drk);
+  vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk);
+  vec3_scale(prefactor, drk, drk);
+}
+
+__device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor,
+                                   F_FLOAT3 &rij_hat, F_FLOAT &rij,
+                                   F_FLOAT3 &rik_hat, F_FLOAT &rik,
+                                   F_FLOAT3 &dri,  int &iparam)
+{
+  F_FLOAT ex_delr, ex_delr_d, tmp;
+
+  if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
+  else tmp = params[iparam].lam3 * (rij - rik);
+
+  if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
+  else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
+  else ex_delr = exp(tmp);
+
+  if(params[iparam].powermint == 3)
+    ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
+  else ex_delr_d = params[iparam].lam3 * ex_delr;
+
+  const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
+  //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk);
+
+
+  F_FLOAT3 dcosdri;
+  vec3_scaleadd(-cos_theta, rij_hat, rik_hat, dri);
+  vec3_scale(F_F(1.0) / rij, dri, dri);
+  vec3_scaleadd(-cos_theta, rik_hat, rij_hat, dcosdri);
+  vec3_scale(F_F(1.0) / rik, dcosdri, dcosdri);
+  vec3_add(dri, dcosdri, dcosdri);
+  vec3_scale(-F_F(1.0), dcosdri, dcosdri);
+
+  const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
+                       (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
+  const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
+  const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
+                              (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
+  const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
+  //
+  const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
+  const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
+
+  vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri);
+  vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri);
+  vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri);
+  vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri);
+  vec3_scale(prefactor, dri, dri);
+
+}
+
+__device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor,
+                                   F_FLOAT3 &rij_hat, F_FLOAT &rij,
+                                   F_FLOAT3 &rik_hat, F_FLOAT &rik,
+                                   F_FLOAT3 &drj, int &iparam)
+{
+  F_FLOAT ex_delr, ex_delr_d, tmp;
+
+  if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
+  else tmp = params[iparam].lam3 * (rij - rik);
+
+  if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
+  else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
+  else ex_delr = exp(tmp);
+
+  if(params[iparam].powermint == 3)
+    ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
+  else ex_delr_d = params[iparam].lam3 * ex_delr;
+
+  const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
+  vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj);
+  vec3_scale(F_F(1.0) / rij, drj, drj);
+
+  const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
+                       (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
+  const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
+  const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
+                              (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
+  const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
+
+  const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
+
+  vec3_scale(fc * gijk_d * ex_delr, drj, drj);
+  vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj);
+  vec3_scale(prefactor, drj, drj);
+}
+
+__device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor,
+                                   F_FLOAT3 &rij_hat, F_FLOAT &rij,
+                                   F_FLOAT3 &rik_hat, F_FLOAT &rik,
+                                   F_FLOAT3 &drk, int &iparam)
+{
+  F_FLOAT ex_delr, ex_delr_d, tmp;
+
+  if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
+  else tmp = params[iparam].lam3 * (rij - rik);
+
+  if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
+  else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
+  else ex_delr = exp(tmp);
+
+  if(params[iparam].powermint == 3)
+    ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
+  else ex_delr_d = params[iparam].lam3 * ex_delr;
+
+  const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
+  vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk);
+  vec3_scale(F_F(1.0) / rik, drk, drk);
+
+  const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
+                       (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
+  const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
+  const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
+                              (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
+  const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
+
+  const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
+  const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
+
+  vec3_scale(fc * gijk_d * ex_delr, drk, drk);
+  vec3_scaleadd(dfc * gijk * ex_delr, rik_hat, drk, drk);
+  vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk);
+  vec3_scale(prefactor, drk, drk);
+}
+
+__device__ void attractive(int iparam, F_FLOAT prefactor,
+                           F_FLOAT4 &delij,
+                           F_FLOAT4 &delik,
+                           F_FLOAT3 &fi, F_FLOAT3 &fj, F_FLOAT3 &fk)
+{
+  F_FLOAT3 rij_hat, rik_hat;
+  F_FLOAT rij, rijinv, rik, rikinv;
+
+  rij = sqrt(delij.w);
+  rijinv = F_F(1.0) / rij;
+  vec3_scale(rijinv, delij, rij_hat);
+
+  rik = sqrt(delik.w);
+  rikinv = F_F(1.0) / rik;
+  vec3_scale(rikinv, delik, rik_hat);
+
+  ters_zetaterm_d(prefactor, rij_hat, rij, rik_hat, rik, fi, fj, fk, iparam);
+}
+
+__device__ void attractive_fi(int &iparam, F_FLOAT &prefactor,
+                              F_FLOAT4 &delij,
+                              F_FLOAT4 &delik,
+                              F_FLOAT3 &f)
+{
+  F_FLOAT3 rij_hat, rik_hat;
+  F_FLOAT rij, rijinv, rik, rikinv;
+
+  rij = sqrt(delij.w);
+  rijinv = F_F(1.0) / rij;
+  vec3_scale(rijinv, delij, rij_hat);
+
+  rik = sqrt(delik.w);
+  rikinv = F_F(1.0) / rik;
+  vec3_scale(rikinv, delik, rik_hat);
+
+  ters_zetaterm_d_fi(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
+}
+
+__device__ void attractive_fj(int iparam, F_FLOAT prefactor,
+                              F_FLOAT4 &delij,
+                              F_FLOAT4 &delik,
+                              F_FLOAT3 &f)
+{
+  F_FLOAT3 rij_hat, rik_hat;
+  F_FLOAT rij, rijinv, rik, rikinv;
+
+  rij = sqrt(delij.w);
+  rijinv = F_F(1.0) / rij;
+  vec3_scale(rijinv, delij, rij_hat);
+
+  rik = sqrt(delik.w);
+  rikinv = F_F(1.0) / rik;
+  vec3_scale(rikinv, delik, rik_hat);
+
+  ters_zetaterm_d_fj(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
+}
+
+__device__ void attractive_fk(int iparam, F_FLOAT prefactor,
+                              F_FLOAT4 &delij,
+                              F_FLOAT4 &delik,
+                              F_FLOAT3 &f)
+{
+  F_FLOAT3 rij_hat, rik_hat;
+  F_FLOAT rij, rijinv, rik, rikinv;
+
+  rij = sqrt(delij.w);
+  rijinv = F_F(1.0) / rij;
+  vec3_scale(rijinv, delij, rij_hat);
+
+  rik = sqrt(delik.w);
+  rikinv = F_F(1.0) / rik;
+  vec3_scale(rikinv, delik, rik_hat);
+
+  ters_zetaterm_d_fk(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
+}
+
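+// pre-pass 1: build the reduced neighbor list (pairs inside the force cutoff)
+// and cache their r_ij vectors, analogous to Pair_SW_Kernel_TpA_RIJ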
+__global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
+{
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(ii >= _nall) return;
+
+  X_FLOAT4 myxtype;
+  F_FLOAT4 delij;
+  F_FLOAT xtmp, ytmp, ztmp;
+  int itype, jnum, i, j;
+  int* jlist;
+  int neigh_red = 0;
+  i = ii;//_ilist[ii];
+  myxtype = fetchXType(i);
+
+  xtmp = myxtype.x;
+  ytmp = myxtype.y;
+  ztmp = myxtype.z;
+  itype = map[(static_cast <int>(myxtype.w))];
+
+  jnum = _numneigh[i];
+  jlist = &_neighbors[i];
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    j = jlist[jj * _nall];
+    j &= NEIGHMASK;
+    myxtype = fetchXType(j);
+    delij.x = xtmp - myxtype.x;
+    delij.y = ytmp - myxtype.y;
+    delij.z = ztmp - myxtype.z;
+    int jtype = map[(static_cast <int>(myxtype.w))];
+    int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
+    delij.w = vec3_dot(delij, delij);
+
+    if(delij.w < params[iparam_ij].cutsq) {
+      _glob_neighbors_red[i + neigh_red * _nall] = j;
+      _glob_neightype_red[i + neigh_red * _nall] = jtype;
+      _glob_r_ij[i + neigh_red * _nall] = delij;
+      neigh_red++;
+    }
+  }
+
+  _glob_numneigh_red[i] = neigh_red;
+}
+
+
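+// pre-pass 2: accumulate the bond-order argument zeta_ij over all k != j for
+// every reduced neighbor pair (i,j); read back by the force kernel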
+__global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
+{
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if(ii >= _nall) return;
+
+
+  F_FLOAT4 delij;
+  F_FLOAT4 delik;
+
+  int itype, jnum, i, j;
+  int* jlist;
+  i = ii;
+  itype = map[(static_cast <int>(_type[i]))];
+
+  jnum = _glob_numneigh_red[i];
+  jlist = &_glob_neighbors_red[i];
+
+  __syncthreads();
+
+  for(int jj = 0; jj < jnum; jj++) {
+    j = jlist[jj * _nall];
+    j &= NEIGHMASK;
+    int jtype = _glob_neightype_red[i + jj * _nall];
+    delij = _glob_r_ij[i + jj * _nall];
+
+    int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
+
+    if(delij.w < params[iparam_ij].cutsq) {
+      F_FLOAT zeta_ij = 0.0;
+      F_FLOAT3 delij3 = {delij.x, delij.y, delij.z};
+
+      for(int kk = 0; kk < jnum; kk++) {
+        if(jj == kk) continue;
+
+        int k = jlist[kk * _nall];
+        k &= NEIGHMASK;
+
+        int ktype = _glob_neightype_red[i + kk * _nall];
+        delik = _glob_r_ij[i + kk * _nall];
+        F_FLOAT3 delik3 = {delik.x, delik.y, delik.z};
+        int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype];
+        const F_FLOAT rsqki = delik.w;
+
+        if(rsqki <= params[iparam_ijk].cutsq)
+          zeta_ij += zeta(iparam_ijk, delij.w, rsqki, delij3, delik3);
+      }
+
+      _glob_zeta_ij[i + jj * _nall] = zeta_ij;
+    }
+  }
+}
+
+//back3: num 12 steps 10: ZetaIJ/TPA 0.255/0.106
+//back5: num 12 steps 10: ZetaIJ/TPA 0.257/0.098
+//back6: num 12 steps 10: ZetaIJ/TPA 0.027/0.097 // r_ij computed separately
+//back12: num 12 steps 10: ZetaIJ/TPA 0.026/0.070
+//back15: num 12 steps 10: ZetaIJ/TPA 0.0137/0.0287 // pow() eliminated
+//        num 12 steps 10: ZetaIJ/TPA 0.0137/0.027
+template <int eflag, int vflagm>
+__global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
+{
+  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
+
+  ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x];
+  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
+
+  F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem;
+
+  if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x];
+  else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x];
+  else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x];
+
+  shared_F_F += threadIdx.x;
+
+  if(eflag_atom || eflag) {
+    sharedE[0] = ENERGY_F(0.0);
+    sharedV += blockDim.x;
+  }
+
+  if(vflagm || vflag_atom) {
+    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
+    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
+  }
+
+  int jnum_red = 0;
+#define fxtmp shared_F_F[0]
+#define fytmp shared_F_F[blockDim.x]
+#define fztmp shared_F_F[2*blockDim.x]
+  //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x]))
+
+  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  X_FLOAT4 myxtype_i, myxtype_j, myxtype_k;
+  F_FLOAT4 delij, delik, deljk;
+  F_FLOAT fpair;
+  F_FLOAT prefactor_ij, prefactor_ji;
+
+  int itype, i, j;
+  int* jlist_red;
+
+  if(ii < _inum) {
+    i = _ilist[ii];
+
+    if(vflagm)
+      myxtype_i = fetchXType(i);
+
+    //itype=map[(static_cast <int> (myxtype_i.w))];
+    itype = map[_type[i]];
+
+
+    fxtmp = F_F(0.0);
+    fytmp = F_F(0.0);
+    fztmp = F_F(0.0);
+
+
+    //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i];
+    jnum_red = _glob_numneigh_red[i];
+    jlist_red = &_glob_neighbors_red[i];
+  }
+
+  __syncthreads();
+
+#pragma unroll 1
+
+  for(int jj = 0; jj < jnum_red; jj++) {
+    if(i < _nlocal) {
+      fpair = F_F(0.0);
+      j = jlist_red[jj * _nall];
+      j &= NEIGHMASK;
+
+      if(vflagm)
+        myxtype_j = fetchXType(j);
+
+      int jtype = _glob_neightype_red[i + jj * _nall];
+      delij = _glob_r_ij[i + jj * _nall];
+
+      volatile int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
+      volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype];
+
+      if(delij.w < params[iparam_ij].cutsq) {
+        F_FLOAT dxfp, dyfp, dzfp;
+        repulsive(iparam_ij, delij.w, fpair, eflag, evdwl);
+        fxtmp += dxfp = delij.x * fpair;
+        fytmp += dyfp = delij.y * fpair;
+        fztmp += dzfp = delij.z * fpair;
+
+        if(vflagm) {
+          sharedV[0 * blockDim.x] += delij.x * dxfp;
+          sharedV[1 * blockDim.x] += delij.y * dyfp;
+          sharedV[2 * blockDim.x] += delij.z * dzfp;
+          sharedV[3 * blockDim.x] += delij.x * dyfp;
+          sharedV[4 * blockDim.x] += delij.x * dzfp;
+          sharedV[5 * blockDim.x] += delij.y * dzfp;
+        }
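+
+        // attractive (i,j) bond-order term: force_zeta returns the pair force
+        // from b_ij and the prefactor for the three-body derivative loop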
+        force_zeta(iparam_ij, delij.w, _glob_zeta_ij[i + jj * _nall], fpair, prefactor_ij, eflag, evdwl);
+        fxtmp -= dxfp = delij.x * fpair;
+        fytmp -= dyfp = delij.y * fpair;
+        fztmp -= dzfp = delij.z * fpair;
+
+        if(vflagm) {
+          sharedV[0 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dxfp;
+          sharedV[1 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dyfp;
+          sharedV[2 * blockDim.x] -= ENERGY_F(2.0) * delij.z * dzfp;
+          sharedV[3 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dyfp;
+          sharedV[4 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dzfp;
+          sharedV[5 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dzfp;
+        }
+
+        int j_jj = 0;
+
+        //#pragma unroll 1
+        for(int kk = 0; kk < _glob_numneigh_red[j]; kk++) {
+          if(_glob_neighbors_red[j + kk * _nall] == i) j_jj = kk;
+        }
+
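+        // symmetric (j,i) term: reuse zeta_ji stored in j's neighbor slot for
+        // atom i (index j_jj found above)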
+        force_zeta_prefactor_force(iparam_ji, delij.w, _glob_zeta_ij[j + j_jj * _nall], fpair, prefactor_ji);
+
+        fxtmp -= dxfp = delij.x * fpair;
+        fytmp -= dyfp = delij.y * fpair;
+        fztmp -= dzfp = delij.z * fpair;
+
+        vec3_scale(F_F(-1.0), delij, delij);
+
+#pragma unroll 1
+
+        for(int kk = 0; kk < jnum_red; kk++) {
+          if(jj == kk) continue;
+
+          int k = jlist_red[kk * _nall];
+          k &= NEIGHMASK;
+
+          if(vflagm)
+            myxtype_k = fetchXType(k);
+
+          delik = _glob_r_ij[i + kk * _nall];
+
+          int ktype = _glob_neightype_red[i + kk * _nall];
+          int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype];
+          vec3_scale(F_F(-1.0), delik, delik);
+
+          if(delik.w <= params[iparam_ijk].cutsq) {
+            if(vflagm) {
+              F_FLOAT3 fi, fj, fk;
+              attractive(iparam_ijk, prefactor_ij,
+                         delij, delik, fi, fj, fk);
+              fxtmp += fi.x;
+              fytmp += fi.y;
+              fztmp += fi.z;
+
+              sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.x;
+              sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.y;
+              sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.z * fi.z;
+              sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.y;
+              sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.z;
+              sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.z;
+
+              sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.x;
+              sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.y;
+              sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.z * fj.z;
+              sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.y;
+              sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.z;
+              sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.z;
+
+              sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.x;
+              sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.y;
+              sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.z * fk.z;
+              sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.y;
+              sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.z;
+              sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z;
+            } else {
+              F_FLOAT3 fi; //local variable
+              attractive_fi(iparam_ijk, prefactor_ij,
+                            delij, delik, fi);
+              fxtmp += fi.x;
+              fytmp += fi.y;
+              fztmp += fi.z;
+
+            }
+          }
+        }
+
+        int j_jnum_red = _glob_numneigh_red[j];
+        int* j_jlist_red = &_glob_neighbors_red[j];
+
+        int j_ii = 0;
+
+        //#pragma unroll 1
+        for(int j_kk = 0; j_kk < j_jnum_red; j_kk++) {
+          if(j_jlist_red[j_kk * _nall] == i) j_ii = j_kk;
+        }
+
+#pragma unroll 1
+
+        for(int kk = 0; kk < j_jnum_red; kk++) {
+          if(j_ii == kk) continue;
+
+          int k = j_jlist_red[kk * _nall];
+          k &= NEIGHMASK;
+          deljk = _glob_r_ij[j + kk * _nall];
+          vec3_scale(F_F(-1.0), deljk, deljk);
+          int ktype = _glob_neightype_red[j + kk * _nall];
+
+          int iparam_jik = elem2param[(jtype * nelements + itype) * nelements + ktype];
+          int iparam_jki = elem2param[(jtype * nelements + ktype) * nelements + itype];
+
+
+          vec3_scale(F_F(-1.0), delij, delij);
+
+          if(deljk.w <= params[iparam_jik].cutsq) {
+            F_FLOAT3 ftmp; //local variable
+
+            attractive_fj(iparam_jik, prefactor_ji,
+                          delij, deljk, ftmp);
+            fxtmp += ftmp.x;
+            fytmp += ftmp.y;
+            fztmp += ftmp.z;
+            int iparam_jk = elem2param[(jtype * nelements + ktype) * nelements + ktype];
+            F_FLOAT prefactor_jk;
+            force_zeta_prefactor(iparam_jk, deljk.w, _glob_zeta_ij[j + kk * _nall], prefactor_jk);
+
+            attractive_fk(iparam_jki, prefactor_jk,
+                          deljk, delij, ftmp);
+            fxtmp += ftmp.x;
+            fytmp += ftmp.y;
+            fztmp += ftmp.z;
+
+          }
+
+          vec3_scale(F_F(-1.0), delij, delij);
+        }
+      }
+    }
+
+  }
+
+  __syncthreads();
+
+  if(ii < _inum) {
+    F_FLOAT* my_f;
+
+    if(_collect_forces_later) {
+      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
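+      // layout of the staging buffer: per-block energy partials first (one
+      // ENERGY_FLOAT per block, if eflag), then virial partials (six per
+      // block, if vflagm), then the force slots written below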
+
+      if(eflag) {
+        buffer = &buffer[1 * gridDim.x * gridDim.y];
+      }
+
+      if(vflagm) {
+        buffer = &buffer[6 * gridDim.x * gridDim.y];
+      }
+
+      my_f = (F_FLOAT*) buffer;
+      my_f += i;
+      *my_f = fxtmp;
+      my_f += _nmax;
+      *my_f = fytmp;
+      my_f += _nmax;
+      *my_f = fztmp;
+    } else {
+      my_f = _f + i;
+      *my_f += fxtmp;
+      my_f += _nmax;
+      *my_f += fytmp;
+      my_f += _nmax;
+      *my_f += fztmp;
+    }
+  }
+
+  __syncthreads();
+
+  if(eflag) {
+    sharedE[0] = evdwl;
+  }
+
+  if(eflag_atom && i < _nlocal) {
+    _eatom[i] = ENERGY_F(0.5) * evdwl;
+  }
+
+  if(vflag_atom && i < _nlocal) {
+    _vatom[i]             = ENERGY_F(0.5) * sharedV[0 * blockDim.x];
+    _vatom[i + _nmax]     = ENERGY_F(0.5) * sharedV[1 * blockDim.x];
+    _vatom[i + 2 * _nmax] = ENERGY_F(0.5) * sharedV[2 * blockDim.x];
+    _vatom[i + 3 * _nmax] = ENERGY_F(0.5) * sharedV[3 * blockDim.x];
+    _vatom[i + 4 * _nmax] = ENERGY_F(0.5) * sharedV[4 * blockDim.x];
+    _vatom[i + 5 * _nmax] = ENERGY_F(0.5) * sharedV[5 * blockDim.x];
+  }
+
+  if(vflagm && eflag) PairVirialCompute_A_Kernel_Template<1, 1>();
+  else if(eflag) PairVirialCompute_A_Kernel_Template<1, 0>();
+  else if(vflagm) PairVirialCompute_A_Kernel_Template<0, 1>();
+
+#undef fxtmp
+#undef fytmp
+#undef fztmp
+  //#undef jnum_red
+}
diff --git a/lib/cuda/pair_virial_compute_cu.h b/lib/cuda/pair_virial_compute_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..872471537e573ccf6700bf51d60fbd5f4daaba1a
--- /dev/null
+++ b/lib/cuda/pair_virial_compute_cu.h
@@ -0,0 +1,26 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_shared.h"
+
+extern "C" void Cuda_PairVirialCompute(cuda_shared_data* sdata, int offset, int end);
diff --git a/lib/cuda/pppm_cuda.cu b/lib/cuda/pppm_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..19c2a23a68e8e9805f5bd0684496775291f0bef1
--- /dev/null
+++ b/lib/cuda/pppm_cuda.cu
@@ -0,0 +1,588 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include "cuda_precision.h"
+//#define FFT_CUFFT
+#define MY_PREFIX pppm
+#include "cuda_shared.h"
+#include "cuda_common.h"
+#include "pppm_cuda_cu.h"
+#include "cuda_runtime.h"
+#include <stdio.h>
+
+//#include "crm_cuda_utils.cu"
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
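+// module-local simulation state; the host fills these __constant__ symbols
+// through cudaMemcpyToSymbol() in the init/update functions further down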
+__device__ __constant__ FFT_FLOAT* work1;
+__device__ __constant__ FFT_FLOAT* work2;
+__device__ __constant__ FFT_FLOAT* work3;
+__device__ __constant__ PPPM_FLOAT* greensfn;
+__device__ __constant__ PPPM_FLOAT* gf_b;
+__device__ __constant__ PPPM_FLOAT* fkx;
+__device__ __constant__ PPPM_FLOAT* fky;
+__device__ __constant__ PPPM_FLOAT* fkz;
+__device__ __constant__ PPPM_FLOAT* vg;
+__device__ __constant__ int* part2grid;
+__device__ __constant__ PPPM_FLOAT* density_brick;
+__device__ __constant__ int* density_brick_int;
+__device__ __constant__ PPPM_FLOAT density_intScale;
+__device__ __constant__ PPPM_FLOAT* vdx_brick;
+__device__ __constant__ PPPM_FLOAT* vdy_brick;
+__device__ __constant__ PPPM_FLOAT* vdz_brick;
+__device__ __constant__ PPPM_FLOAT* density_fft;
+__device__ __constant__ ENERGY_FLOAT* energy;
+__device__ __constant__ ENERGY_FLOAT* virial;
+__device__ __constant__ int nxlo_in;
+__device__ __constant__ int nxhi_in;
+__device__ __constant__ int nxlo_out;
+__device__ __constant__ int nxhi_out;
+__device__ __constant__ int nylo_in;
+__device__ __constant__ int nyhi_in;
+__device__ __constant__ int nylo_out;
+__device__ __constant__ int nyhi_out;
+__device__ __constant__ int nzlo_in;
+__device__ __constant__ int nzhi_in;
+__device__ __constant__ int nzlo_out;
+__device__ __constant__ int nzhi_out;
+__device__ __constant__ int nxlo_fft;
+__device__ __constant__ int nxhi_fft;
+__device__ __constant__ int nylo_fft;
+__device__ __constant__ int nyhi_fft;
+__device__ __constant__ int nzlo_fft;
+__device__ __constant__ int nzhi_fft;
+__device__ __constant__ int nx_pppm;
+__device__ __constant__ int ny_pppm;
+__device__ __constant__ int nz_pppm;
+__device__ __constant__ int slabflag;
+__device__ __constant__ PPPM_FLOAT qqrd2e;
+__device__ __constant__ int order;
+//__device__ __constant__ float3 sublo;
+__device__ __constant__ PPPM_FLOAT* rho_coeff;
+__device__ __constant__ int nmax;
+__device__ __constant__ int nlocal;
+__device__ __constant__ PPPM_FLOAT* debugdata;
+__device__ __constant__ PPPM_FLOAT delxinv;
+__device__ __constant__ PPPM_FLOAT delyinv;
+__device__ __constant__ PPPM_FLOAT delzinv;
+__device__ __constant__ int nlower;
+__device__ __constant__ int nupper;
+__device__ __constant__ PPPM_FLOAT shiftone;
+
+
+#include "pppm_cuda_kernel.cu"
+#include "stdio.h"
+void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial
+                      , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
+                      , int cu_nxlo_in, int cu_nxhi_in, int cu_nylo_in, int cu_nyhi_in, int cu_nzlo_in, int cu_nzhi_in, int cu_nxlo_out, int cu_nxhi_out, int cu_nylo_out, int cu_nyhi_out, int cu_nzlo_out, int cu_nzhi_out, int cu_nx_pppm, int cu_ny_pppm, int cu_nz_pppm
+                      , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b
+                      , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int cu_slabflag
+                     )
+{
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_init Start");
+  cudaMemcpyToSymbol(density_brick, &cu_density_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(density_brick_int, &cu_density_brick_int, sizeof(int*));
+  cudaMemcpyToSymbol(vdx_brick, &cu_vdx_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vdy_brick, &cu_vdy_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vdz_brick, &cu_vdz_brick, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(density_fft, &cu_density_fft, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(energy, &cu_energy, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(virial, &cu_virial, sizeof(ENERGY_FLOAT*));
+  cudaMemcpyToSymbol(nxlo_in, &cu_nxlo_in, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_in, &cu_nxhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nxlo_out, &cu_nxlo_out, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_out, &cu_nxhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nylo_in, &cu_nylo_in, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_in, &cu_nyhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nylo_out, &cu_nylo_out, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_out, &cu_nyhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_in, &cu_nzlo_in, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_in, &cu_nzhi_in, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_out, &cu_nzlo_out, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_out, &cu_nzhi_out, sizeof(int));
+  cudaMemcpyToSymbol(nxlo_fft, &cu_nxlo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nxhi_fft, &cu_nxhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(nylo_fft, &cu_nylo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nyhi_fft, &cu_nyhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(nzlo_fft, &cu_nzlo_fft, sizeof(int));
+  cudaMemcpyToSymbol(nzhi_fft, &cu_nzhi_fft, sizeof(int));
+  cudaMemcpyToSymbol(slabflag, &cu_slabflag, sizeof(int));
+  cudaMemcpyToSymbol(nx_pppm, &cu_nx_pppm, sizeof(int));
+  cudaMemcpyToSymbol(ny_pppm, &cu_ny_pppm, sizeof(int));
+  cudaMemcpyToSymbol(nz_pppm, &cu_nz_pppm, sizeof(int));
+  cudaMemcpyToSymbol(work1, &cu_work1, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(work2, &cu_work2, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(work3, &cu_work3, sizeof(FFT_FLOAT*));
+  cudaMemcpyToSymbol(greensfn, &cu_greensfn, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(gf_b, &cu_gf_b, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fkx, &cu_fkx, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fky, &cu_fky, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(fkz, &cu_fkz, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(vg, &cu_vg, sizeof(PPPM_FLOAT*));
+
+  PPPM_FLOAT cu_qqrd2e_a = cu_qqrd2e;
+  cudaMemcpyToSymbol(qqrd2e, &cu_qqrd2e_a, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(order, &cu_order, sizeof(int));
+  cudaMemcpyToSymbol(rho_coeff, &cu_rho_coeff, sizeof(PPPM_FLOAT*));
+  cudaMemcpyToSymbol(debugdata, &cu_debugdata, sizeof(PPPM_FLOAT*));
+
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_init");
+
+  /*if(sizeof(CUDA_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision\n");
+
+  #ifdef PPPM_PRECISION
+  if(sizeof(PPPM_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for pppm core\n");
+  if(sizeof(PPPM_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for pppm core\n");
+  #endif
+  #ifdef ENERGY_PRECISION
+  if(sizeof(ENERGY_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for energy\n");
+  if(sizeof(ENERGY_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for energy\n");
+  #endif
+  #ifdef FFT_PRECISION
+  if(sizeof(FFT_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for fft\n");
+  if(sizeof(FFT_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for fft\n");
+  #endif
+  #ifdef X_PRECISION
+  if(sizeof(X_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for positions\n");
+  if(sizeof(X_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for positions\n");
+  #endif
+  #ifdef F_PRECISION
+  if(sizeof(F_FLOAT)==sizeof(float)) printf("PPPMCuda Kernel: Using single precision for forces\n");
+  if(sizeof(F_FLOAT)==sizeof(double)) printf("PPPMCuda Kernel: Using double precision for forces\n");
+  #endif*/
+}
+
+void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT cu_shiftone, PPPM_FLOAT cu_delxinv, PPPM_FLOAT cu_delyinv, PPPM_FLOAT cu_delzinv, int cu_nlower, int cu_nupper)
+{
+  cudaMemcpyToSymbol(delxinv, &cu_delxinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(delyinv, &cu_delyinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(delzinv, &cu_delzinv, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(shiftone, &cu_shiftone, sizeof(PPPM_FLOAT));
+  cudaMemcpyToSymbol(nlower, &cu_nlower, sizeof(int));
+  cudaMemcpyToSymbol(nupper, &cu_nupper, sizeof(int));
+  cudaMemcpyToSymbol(MY_AP(sublo)   , sdata->domain.sublo, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(subhi)   , sdata->domain.subhi, 3 * sizeof(X_FLOAT));
+  cudaMemcpyToSymbol(MY_AP(boxlo)   , sdata->domain.boxlo, 3 * sizeof(X_FLOAT));
+  CUT_CHECK_ERROR("ERROR-CUDA pppm_init_setup");
+}
+
+void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa)
+{
+  cudaMemcpyToSymbol(part2grid, &cu_part2grid, sizeof(int*));
+  cudaMemcpyToSymbol(MY_AP(x)   , & sdata->atom.x   .dev_data, sizeof(X_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(f)   , & sdata->atom.f   .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(q)   , & sdata->atom.q   .dev_data, sizeof(F_FLOAT*));
+  cudaMemcpyToSymbol(MY_AP(tag)   , & sdata->atom.tag   .dev_data, sizeof(int*));
+  //cudaMemcpyToSymbol(MY_AP(nlocal)   , & sdata->atom.nlocal   .dev_data, sizeof(int));
+  cudaMemcpyToSymbol(nlocal   , &nlocala, sizeof(int));
+  cudaMemcpyToSymbol(nmax   , &nmaxa, sizeof(int));
+  CUT_CHECK_ERROR("ERROR-CUDA pppm_device_update");
+
+}
+
+void pppm_update_nlocal(int nlocala)
+{
+  cudaMemcpyToSymbol(nlocal   , &nlocala, sizeof(int));
+  CUT_CHECK_ERROR("ERROR-CUDA update_nlocal b");
+}
+
+
+void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  setup_fkxyz_vg <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald);
+  cudaThreadSynchronize();
+
+  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_fkxyz_vg ");
+}
+
+void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
+                              int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  setup_greensfn <<< grid, threads, 0>>>(unitkx, unitky, unitkz, g_ewald, nbx, nby, nbz, xprd, yprd, zprd_slab);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA Cuda_PPPM_Setup_greensfn ");
+}
+
+void poisson_scale(int nx_pppma, int ny_pppma, int nz_pppma)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_scale_kernel <<< grid, threads, 0>>>();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_scale ");
+
+}
+
+void poisson_xgrad(int nx_pppma, int ny_pppma, int nz_pppma)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_xgrad_kernel <<< grid, threads, 0>>>();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_xgrad ");
+}
+
+void poisson_ygrad(int nx_pppma, int ny_pppma, int nz_pppma)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_ygrad_kernel <<< grid, threads, 0>>>();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_ygrad ");
+}
+
+void poisson_zgrad(int nx_pppma, int ny_pppma, int nz_pppma)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = nz_pppma;
+  grid.y = ny_pppma;
+  grid.z = 1;
+  threads.x = nx_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_zgrad_kernel <<< grid, threads, 0>>>();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_zgrad ");
+}
+
+void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppma, int ny_pppma, int nz_pppma)
+{
+
+  dim3 grid;
+  dim3 threads;
+  grid.x = khi - klo + 1;
+  grid.y = jhi - jlo + 1;
+  grid.z = 1;
+  threads.x = ihi - ilo + 1;
+  threads.y = 1;
+  threads.z = 1;
+  //printf("VDX_BRICK CUDA: %i %i %i\n",grid.x,grid.y,threads.x);
+  poisson_vdx_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdxbrick ");
+  cudaThreadSynchronize();
+}
+
+void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = khi - klo + 1;
+  grid.y = jhi - jlo + 1;
+  grid.z = 1;
+  threads.x = ihi - ilo + 1;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_vdy_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdybrick ");
+  cudaThreadSynchronize();
+}
+
+void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm)
+{
+  dim3 grid;
+  dim3 threads;
+  grid.x = khi - klo + 1;
+  grid.y = jhi - jlo + 1;
+  grid.z = 1;
+  threads.x = ihi - ilo + 1;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_vdz_brick_kernel <<< grid, threads, 0>>>(ilo, jlo, klo);
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_vdzbrick ");
+  cudaThreadSynchronize();
+}
+
+
+void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag)
+{
+  //printf("VFLAG_GPU: %i\n",vflag);
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy start ");
+  dim3 grid;
+  dim3 threads;
+  grid.x = nzhi_fft - nzlo_fft + 1;
+  grid.y = nyhi_fft - nylo_fft + 1;
+  grid.z = 1;
+  threads.x = nxhi_fft - nxlo_fft + 1;
+  threads.y = 1;
+  threads.z = 1;
+  poisson_energy_kernel <<< grid, threads, threads.x* sizeof(ENERGY_FLOAT)>>>(nxlo_fft, nylo_fft, nzlo_fft, vflag);
+
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA poisson_energy end ");
+}
+
+ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial)
+{
+  ENERGY_FLOAT host_energy = 0;
+  dim3 grid;
+  dim3 threads;
+
+  grid.x = nz_pppma;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = ny_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  sum_energy_kernel1 <<< grid, threads, ny_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel1 ");
+
+  grid.x = 1;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = nz_pppma;
+  threads.y = 1;
+  threads.z = 1;
+  sum_energy_kernel2 <<< grid, threads, nz_pppma* sizeof(ENERGY_FLOAT)>>>(vflag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_kernel2 ");
+
+  cudaMemcpy((void*)(&host_energy), cu_energy, sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+
+  if(vflag)
+    cudaMemcpy((void*) cpu_virial, (void*) cu_virial, 6 * sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+  CUT_CHECK_ERROR("ERROR-CUDA sumenergy_memcopy");
+
+  return host_energy;
+}
+
+void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int)
+{
+  CUT_CHECK_ERROR("cuda_make_rho begin");
+  dim3 grid, threads;
+  int cpu_flag[3];
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  int sharedmemsize = (32 + 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+
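+  // fixed-point scale search: charges are accumulated into density_brick_int
+  // as integers, roughly
+  //   int a = int(value * density_intScale);   // encode in make_rho_kernel
+  //   density_brick = density_brick_int / density_intScale;  // decode in scale_rho_kernel
+  // flag[0] reports values close to integer overflow (halve the scale and
+  // retry); a pass that never touches the upper magnitude bits doubles the
+  // scale and retries to regain precision.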
+  do {
+    cpu_flag[0] = 0;
+    cpu_flag[1] = 0;
+    cpu_flag[2] = 0;
+    cudaMemcpyToSymbol(density_intScale, cu_density_intScale, sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre Z");
+    cudaMemset(flag, 0, 3 * sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre A");
+    cudaMemset(cu_density_brick, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(PPPM_FLOAT));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre B");
+    cudaMemset(cu_density_brick_int, 0, (khi - klo + 1) * (jhi - jlo + 1) * (ihi - ilo + 1)*sizeof(int));
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho pre C");
+    make_rho_kernel <<< grid, threads, sharedmemsize>>>((int*) flag, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1));
+    cudaThreadSynchronize();
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho A");
+    cudaMemcpy((void*) &cpu_flag, flag, 3 * sizeof(int), cudaMemcpyDeviceToHost);
+
+    if(cpu_flag[0] != 0) {
+      (*cu_density_intScale) /= 2;
+      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Decrease cu_density_intScale to: %e\n", *cu_density_intScale);)
+    }
+    if((cpu_flag[0] == 0) && (cpu_flag[1] == 0)) {
+      (*cu_density_intScale) *= 2;
+      MYDBG(printf("PPPM_Cuda::cuda_make_rho: Increase cu_density_intScale to: %e\n", *cu_density_intScale);)
+    }
+    /* if((*cu_density_intScale)>0xe0000000)
+     {
+     	printf("Error Scaling\n");
+         cpu_flag[0]=0;
+         cpu_flag[1]=1;
+     }*/
+    CUT_CHECK_ERROR("ERROR-CUDA make_rho B");
+  } while((cpu_flag[0] != 0) || (cpu_flag[1] == 0));
+
+
+  grid.x = khi - klo + 1;
+  grid.y = jhi - jlo + 1;
+  threads.x = ihi - ilo + 1;
+  scale_rho_kernel <<< grid, threads, 0>>>();
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA make_rho_scale");
+}
+
+
+int cuda_particle_map(cuda_shared_data* sdata, void* flag)
+{
+  dim3 grid, threads;
+  int cpu_flag;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map ..pre");
+  particle_map_kernel <<< grid, threads, 0>>>((int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map a");
+  cudaMemcpy((void*) &cpu_flag, flag, sizeof(int), cudaMemcpyDeviceToHost);
+  CUT_CHECK_ERROR("ERROR-CUDA particla_map b");
+  return cpu_flag;
+}
+
+
+void cuda_fieldforce(cuda_shared_data* sdata, void* flag)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  int sharedmemsize = (32 + 3 * 32 * (sdata->pppm.nupper - sdata->pppm.nlower + 1) + sdata->pppm.order * (sdata->pppm.order / 2 - (1 - sdata->pppm.order) / 2 + 1)) * sizeof(PPPM_FLOAT);
+  fieldforce_kernel <<< grid, threads, sharedmemsize>>>
+  (sdata->pppm.nupper - sdata->pppm.nlower + 1, 32 / (sdata->pppm.nupper - sdata->pppm.nlower + 1), (int*) flag);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA fieldforce");
+}
+
+double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  slabcorr_energy_kernel <<< grid, threads, 32* sizeof(ENERGY_FLOAT)>>>(dev_buf);
+  cudaThreadSynchronize();
+  cudaMemcpy((void*) buf, dev_buf, grid.x* sizeof(ENERGY_FLOAT), cudaMemcpyDeviceToHost);
+
+  double dipole_all = 0.0;
+
+  for(int i = 0; i < grid.x; i++)
+    dipole_all += buf[i];
+
+  return dipole_all;
+}
+
+void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact)
+{
+  dim3 grid, threads;
+  grid.x = (sdata->atom.nlocal + 31) / 32;
+  grid.y = 1;
+  grid.z = 1;
+  threads.x = 32;
+  threads.y = 1;
+  threads.z = 1;
+  slabcorr_force_kernel <<< grid, threads>>>(ffact);
+  cudaThreadSynchronize();
+}
+
+void sum_virial(double* host_virial)
+{
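+  // intentionally empty: the virial has already been reduced on the device
+  // and copied back to the host by sum_energy()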
+}
+
+void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out)
+{
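+  // copy the inner density brick into the complex FFT work array, then fold
+  // the ghost rim of the brick onto the periodic mesh: one kernel each for
+  // the z, y and x faces, the yz/xz/xy edges, and the xyz corner region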
+  int nslow = sdata->pppm.nzhi_in - sdata->pppm.nzlo_in;
+  int nmid = sdata->pppm.nyhi_in - sdata->pppm.nylo_in;
+  int nfast = sdata->pppm.nxhi_in - sdata->pppm.nxlo_in;
+  int nrimz = MAX(sdata->pppm.nzlo_in - sdata->pppm.nzlo_out, sdata->pppm.nzhi_out - sdata->pppm.nzhi_in);
+  int nrimy = MAX(sdata->pppm.nylo_in - sdata->pppm.nylo_out, sdata->pppm.nyhi_out - sdata->pppm.nyhi_in);
+  int nrimx = MAX(sdata->pppm.nxlo_in - sdata->pppm.nxlo_out, sdata->pppm.nxhi_out - sdata->pppm.nxhi_in);
+  dim3 grid;
+  grid.x = nslow + 1;
+  grid.y = nmid + 1;
+  grid.z = 1;
+  dim3 threads;
+  threads.x = nfast + 1;
+  threads.y = 1;
+  threads.z = 1;
+  cudaThreadSynchronize();
+  initfftdata_core_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nmid + 1;
+  threads.x = nfast + 1;
+  initfftdata_z_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nrimy;
+  threads.x = nfast + 1;
+  initfftdata_y_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nmid + 1;
+  threads.x = nrimx;
+  initfftdata_x_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nrimy;
+  threads.x = nfast + 1;
+  initfftdata_yz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nmid + 1;
+  threads.x = nrimx;
+  initfftdata_xz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nslow + 1;
+  grid.y = nrimy;
+  threads.x = nrimx;
+  initfftdata_xy_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  grid.x = nrimz;
+  grid.y = nrimy;
+  threads.x = nrimx;
+  initfftdata_xyz_kernel <<< grid, threads, 0>>>(in, out);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("ERROR-CUDA initfftdata_kernel");
+}
+
+
diff --git a/lib/cuda/pppm_cuda_cu.h b/lib/cuda/pppm_cuda_cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..a22e811c3830c266b08010baf339dc381e0faae9
--- /dev/null
+++ b/lib/cuda/pppm_cuda_cu.h
@@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifndef PPPM_CUDA_CU_H_
+#define PPPM_CUDA_CU_H_
+
+extern "C" void pppm_device_init(void* cu_density_brick, void* cu_vdx_brick, void* cu_vdy_brick, void* cu_vdz_brick, void* cu_density_fft, void* cu_energy, void* cu_virial
+                                 , void* cu_work1, void* cu_work2, void* cu_work3, void* cu_greensfn, void* cu_fkx, void* cu_fky, void* cu_fkz, void* cu_vg
+                                 , int nxlo_in, int nxhi_in, int nylo_in, int nyhi_in, int nzlo_in, int nzhi_in, int nxlo_out, int nxhi_out, int nylo_out, int nyhi_out, int nzlo_out, int nzhi_out, int nx_pppm, int ny_pppm, int nz_pppm
+                                 , int cu_nxlo_fft, int cu_nxhi_fft, int cu_nylo_fft, int cu_nyhi_fft, int cu_nzlo_fft, int cu_nzhi_fft, void* cu_gf_b
+                                 , double cu_qqrd2e, int cu_order, void* cu_rho_coeff, void* cu_debugdata, void* cu_density_brick_int, int slabflag
+                                );
+extern "C" void pppm_device_init_setup(cuda_shared_data* sdata, PPPM_FLOAT shiftone, PPPM_FLOAT delxinv, PPPM_FLOAT delyinv, PPPM_FLOAT delzinv, int nlower, int nupper);
+extern "C" void Cuda_PPPM_Setup_fkxyz_vg(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald);
+extern "C" void Cuda_PPPM_setup_greensfn(int nx_pppma, int ny_pppma, int nz_pppma, PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
+    int nbx, int nby, int nbz, PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab);
+
+extern "C" void pppm_device_update(cuda_shared_data* sdata, void* cu_part2grid, int nlocala, int nmaxa);
+extern "C" void pppm_update_nlocal(int nlocala);
+extern "C" void poisson_scale(int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_xgrad(int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_ygrad(int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_zgrad(int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_vdx_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_vdy_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_vdz_brick(int ihi, int ilo, int jhi, int jlo, int khi, int klo, int nx_pppm, int ny_pppm, int nz_pppm);
+extern "C" void poisson_energy(int nxlo_fft, int nxhi_fft, int nylo_fft, int nyhi_fft, int nzlo_fft, int nzhi_fft, int vflag);
+extern "C" ENERGY_FLOAT sum_energy(void* cu_virial, void* cu_energy, int nx_pppma, int ny_pppma, int nz_pppma, int vflag, ENERGY_FLOAT* cpu_virial);
+extern "C" int cuda_particle_map(cuda_shared_data* sdata, void* flag);
+extern "C" void cuda_make_rho(cuda_shared_data* sdata, void* flag, PPPM_FLOAT* cu_density_intScale, int ihi, int ilo, int jhi, int jlo, int khi, int klo, void* cu_density_brick, void* cu_density_brick_int);
+extern "C" void cuda_fieldforce(cuda_shared_data* sdata, void* flag);
+extern "C" double cuda_slabcorr_energy(cuda_shared_data* sdata, ENERGY_FLOAT* buf, ENERGY_FLOAT* dev_buf);
+extern "C" void cuda_slabcorr_force(cuda_shared_data* sdata, F_FLOAT ffact);
+extern "C" void pppm_initfftdata(cuda_shared_data* sdata, PPPM_FLOAT* in, FFT_FLOAT* out);
+#endif /*PPPM_CUDA_CU_H_*/
diff --git a/lib/cuda/pppm_cuda_kernel.cu b/lib/cuda/pppm_cuda_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..25a81866f05305f17357a5f398f89715cd0be426
--- /dev/null
+++ b/lib/cuda/pppm_cuda_kernel.cu
@@ -0,0 +1,858 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+
+   -----------------------------------------------------------------------
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany
+
+   See the README file in the USER-CUDA directory.
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#define OFFSET 4096
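+// returns 1 if f is negative and 0 otherwise, by extracting the IEEE-754 sign bit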
+__device__ int negativCUDA(float f)
+{
+  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
+}
+
+__device__ void reduceBlock(float* data)
+{
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
+
+__device__ void reduceBlock(double* data)
+{
+  int p2 = 1;
+
+  while(p2 * 2 < blockDim.x) p2 *= 2;
+
+  if(threadIdx.x < blockDim.x - p2)
+    data[threadIdx.x] += data[threadIdx.x + p2];
+
+  __syncthreads();
+
+  for(int i = 2; i <= p2; i *= 2) {
+    if(threadIdx.x < p2 / i)
+      data[threadIdx.x] += data[threadIdx.x + p2 / i];
+
+    __syncthreads();
+  }
+}
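+// The two reduceBlock() overloads above compute a block-wide tree sum over
+// shared memory for an arbitrary blockDim.x: the tail beyond the largest
+// power of two p2 is folded in first, then a standard halving reduction runs.
+// Usage sketch (this mirrors poisson_energy_kernel below):
+//   extern __shared__ float sdata[];
+//   sdata[threadIdx.x] = myval;
+//   __syncthreads();
+//   reduceBlock(sdata);
+//   if(threadIdx.x == 0) out[blockIdx.x] = sdata[0];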
+
+extern __shared__ PPPM_FLOAT sharedmem[];
+
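+// the index fold idx - N * (2 * idx / N) maps a mesh index idx in [0, N) to
+// the signed wave number: idx for idx < N/2, and idx - N otherwise
+// (e.g. N = 8 maps indices 0..7 to 0, 1, 2, 3, -4, -3, -2, -1)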
+__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
+{
+  PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
+  PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
+  PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));
+
+  if((blockIdx.x == 0) && (blockIdx.y == 0)) fkx[threadIdx.x] = my_fkx;
+
+  if((blockIdx.x == 0) && (threadIdx.x == 0)) fky[blockIdx.y] = my_fky;
+
+  if((threadIdx.x == 0) && (blockIdx.y == 0)) fkz[blockIdx.x] = my_fkz;
+
+  __syncthreads();
+
+  if((blockIdx.x >= nzlo_fft) && (blockIdx.x <= nzhi_fft) &&
+      (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) &&
+      (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) {
+    int n = ((int(blockIdx.x) - nzlo_fft) * (nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft;
+    PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
+    PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
+    vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx;
+    vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky;
+    vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz;
+    vg[6 * n + 3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fky;
+    vg[6 * n + 4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fkz;
+    vg[6 * n + 5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky * my_fkz;
+
+  }
+}
+
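+// denominator of the optimal influence function: the gf_b polynomial is
+// evaluated at x, y and z by Horner's rule and the triple product is squared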
+__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
+{
+  PPPM_FLOAT sx, sy, sz;
+  sz = sy = sx = PPPM_F(0.0);
+
+  for(int l = order - 1; l >= 0; l--) {
+    sx = gf_b[l] + sx * x;
+    sy = gf_b[l] + sy * y;
+    sz = gf_b[l] + sz * z;
+  }
+
+  PPPM_FLOAT s = sx * sy * sz;
+  return s * s;
+}
+
+__global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
+                               int nbx, int nby, int nbz,
+                               PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
+{
+  PPPM_FLOAT sqk;
+  int nx, ny, nz, kper, lper, mper, k, l, m;
+  PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2;
+  PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
+  PPPM_FLOAT sum1, dot1, dot2;
+  PPPM_FLOAT numerator, denominator;
+
+  PPPM_FLOAT form = PPPM_F(1.0);
+  int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  m = blockIdx.x;
+  l = blockIdx.y;
+  k = threadIdx.x;
+
+  mper = m - nz_pppm * (2 * m / nz_pppm);
+  snz = sin(PPPM_F(0.5) * unitkz * mper * zprd_slab / nz_pppm);
+  snz2 = snz * snz;
+
+
+  lper = l - ny_pppm * (2 * l / ny_pppm);
+  sny = sin(PPPM_F(0.5) * unitky * lper * yprd / ny_pppm);
+  sny2 = sny * sny;
+
+  kper = k - nx_pppm * (2 * k / nx_pppm);
+  snx = sin(PPPM_F(0.5) * unitkx * kper * xprd / nx_pppm);
+  snx2 = snx * snx;
+
+  sqk = pow(unitkx * kper, PPPM_F(2.0)) + pow(unitky * lper, PPPM_F(2.0)) +
+        pow(unitkz * mper, PPPM_F(2.0));
+
+  if(sqk != PPPM_F(0.0)) {
+    numerator = form * PPPM_F(12.5663706) / sqk;
+    denominator = gf_denom(snx2, sny2, snz2);
+    sum1 = PPPM_F(0.0);
+
+    for(nx = -nbx; nx <= nbx; nx++) {
+      qx = unitkx * (kper + nx_pppm * nx);
+      sx = exp(PPPM_F(-.25) * pow(qx / g_ewald, PPPM_F(2.0)));
+      wx = PPPM_F(1.0);
+      argx = PPPM_F(0.5) * qx * xprd / nx_pppm;
+
+      if(argx != PPPM_F(0.0)) wx = pow(sin(argx) / argx, order);
+
+      for(ny = -nby; ny <= nby; ny++) {
+        qy = unitky * (lper + ny_pppm * ny);
+        sy = exp(PPPM_F(-.25) * pow(qy / g_ewald, PPPM_F(2.0)));
+        wy = PPPM_F(1.0);
+        argy = PPPM_F(0.5) * qy * yprd / ny_pppm;
+
+        if(argy != PPPM_F(0.0)) wy = pow(sin(argy) / argy, order);
+
+        for(nz = -nbz; nz <= nbz; nz++) {
+          qz = unitkz * (mper + nz_pppm * nz);
+          sz = exp(PPPM_F(-.25) * pow(qz / g_ewald, PPPM_F(2.0)));
+          wz = PPPM_F(1.0);
+          argz = PPPM_F(0.5) * qz * zprd_slab / nz_pppm;
+
+          if(argz != PPPM_F(0.0)) wz = pow(sin(argz) / argz, order);
+
+          dot1 = unitkx * kper * qx + unitky * lper * qy + unitkz * mper * qz;
+          dot2 = qx * qx + qy * qy + qz * qz;
+          sum1 += (dot1 / dot2) * sx * sy * sz * pow(wx * wy * wz, PPPM_F(2.0));
+        }
+      }
+    }
+
+    greensfn[n] = numerator * sum1 / denominator;
+  } else greensfn[n] = PPPM_F(0.0);
+}
+
+__global__ void poisson_scale_kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
+  work1[2 * i] *= scaleinv * greensfn[i];
+  work1[2 * i + 1] *= scaleinv * greensfn[i];
+}
+
+__global__ void poisson_xgrad_kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  work2[2 * i] = fkx[threadIdx.x] * work1[2 * i + 1];
+  work2[2 * i + 1] = -fkx[threadIdx.x] * work1[2 * i];
+}
+
+__global__ void poisson_ygrad_kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  work2[2 * i] = fky[blockIdx.y] * work1[2 * i + 1];
+  work2[2 * i + 1] = -fky[blockIdx.y] * work1[2 * i];
+}
+
+__global__ void poisson_zgrad_kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  work2[2 * i] = fkz[blockIdx.x] * work1[2 * i + 1];
+  work2[2 * i + 1] = -fkz[blockIdx.x] * work1[2 * i];
+}
+
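+// the next three kernels gather the real part of work3 into the ghost-padded
+// field bricks; negativCUDA() wraps indices that fall outside
+// [0, nx/ny/nz_pppm) back onto the periodic mesh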
+__global__ void poisson_vdx_brick_kernel(int ilo, int jlo, int klo)
+{
+  int k = blockIdx.x + klo;
+  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
+  int j = blockIdx.y + jlo;
+  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
+  int i = threadIdx.x + ilo;
+  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
+  vdx_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
+}
+
+__global__ void poisson_vdy_brick_kernel(int ilo, int jlo, int klo)
+{
+  int k = blockIdx.x + klo;
+  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
+  int j = blockIdx.y + jlo;
+  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
+  int i = threadIdx.x + ilo;
+  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
+  vdy_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
+}
+
+__global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo)
+{
+  int k = blockIdx.x + klo;
+  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
+  int j = blockIdx.y + jlo;
+  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
+  int i = threadIdx.x + ilo;
+  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
+  vdz_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
+}
+
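+// the energy/virial reduction runs in three stages: poisson_energy_kernel
+// sums each x pencil of the FFT mesh in shared memory, sum_energy_kernel1
+// then collapses the y dimension, and sum_energy_kernel2 collapses z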
+__global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag)
+{
+  ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
+  int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft;
+  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
+  ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
+  s_energy[threadIdx.x] = myenergy;
+
+  __syncthreads();
+  reduceBlock(s_energy);
+
+  if(threadIdx.x == 0)
+    energy[blockIdx.x * ny_pppm + blockIdx.y] = s_energy[0];
+
+  if(vflag) {
+    __syncthreads();
+
+    for(int j = 0; j < 6; j++) {
+      s_energy[threadIdx.x] = myenergy * vg[((blockIdx.x * gridDim.y + blockIdx.y) * (blockDim.x) + threadIdx.x) * 6 + j];
+      __syncthreads();
+      reduceBlock(s_energy);
+
+      if(threadIdx.x == 0)
+        virial[blockIdx.x * ny_pppm + blockIdx.y + j * nz_pppm * ny_pppm] = s_energy[0];
+    }
+  }
+}
+
+
+__global__ void sum_energy_kernel1(int vflag)
+{
+  ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
+  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
+  s_energy[threadIdx.x] = myenergy;
+  __syncthreads();
+  reduceBlock(s_energy);
+
+  if(threadIdx.x == 0)
+    energy[blockIdx.x * ny_pppm] = s_energy[0];
+
+  if(vflag) {
+    __syncthreads();
+
+    for(int j = 0; j < 6; j++) {
+      myenergy = virial[blockIdx.x * ny_pppm + threadIdx.x + j * ny_pppm * nz_pppm];
+      s_energy[threadIdx.x] = myenergy;
+      __syncthreads();
+      reduceBlock(s_energy);
+
+      if(threadIdx.x == 0)
+        virial[blockIdx.x * ny_pppm + j * ny_pppm * nz_pppm] = s_energy[0];
+    }
+  }
+
+}
+
+__global__ void sum_energy_kernel2(int vflag)
+{
+  ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm];
+  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
+  s_energy[threadIdx.x] = myenergy;
+  __syncthreads();
+  reduceBlock(s_energy);
+
+  if(threadIdx.x == 0)
+    energy[0] = s_energy[0];
+
+  if(vflag) {
+    __syncthreads();
+
+    for(int j = 0; j < 6; j++) {
+      myenergy = virial[threadIdx.x * ny_pppm + j * ny_pppm * nz_pppm];
+      s_energy[threadIdx.x] = myenergy;
+      __syncthreads();
+      reduceBlock(s_energy);
+
+      if(threadIdx.x == 0)
+        virial[j] = s_energy[0];
+    }
+  }
+}
+
+__device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff)
+{
+  PPPM_FLOAT rho1d_tmp = PPPM_F(0.0);
+
+  for(int l = order - 1; l >= 0; l--)
+    rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;
+
+  return rho1d_tmp;
+}
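+// rho1d evaluates the charge-assignment (spline) weight at fractional offset
+// d for stencil point k, again using Horner's rule over the rho_coeff table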
+
+__global__ void particle_map_kernel(int* flag)
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal) {
+    int nx, ny, nz;
+    PPPM_FLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET;
+    nx = (int)((_x[i] - _boxlo[0]) * delxinv + shift); // - OFFSET;
+    ny = (int)((_x[i + nmax] - _boxlo[1]) * delyinv + shift); // - OFFSET;
+    nz = (int)((_x[i + 2 * nmax] - _boxlo[2]) * delzinv + shift); // - OFFSET;
+
+    part2grid[i] = nx;
+    part2grid[i + nmax] = ny;
+    part2grid[i + 2 * nmax] = nz;
+
+    // check that entire stencil around nx,ny,nz will fit in my 3d brick
+    if(nx + nlower < nxlo_out || nx + nupper > nxhi_out ||
+        ny + nlower < nylo_out || ny + nupper > nyhi_out ||
+        nz + nlower < nzlo_out || nz + nupper > nzhi_out) {
+      flag[0]++;
+      debugdata[0] = i;
+      debugdata[1] = _boxlo[0];
+      debugdata[2] = _boxlo[1];
+      debugdata[3] = _boxlo[2];
+      debugdata[4] = nx;
+      debugdata[5] = ny;
+      debugdata[6] = nz;
+      debugdata[7] = _x[i];
+      debugdata[8] = _x[i + _nmax];
+      debugdata[9] = _x[i + 2 * _nmax];
+      debugdata[10] = nlocal;
+
+    }
+  }
+}
+
+__global__ void make_rho_kernelA()
+{
+  int i, l, m, n, nx, ny, nz, mx, my, mz;
+
+  // the 3d density array is expected to be cleared on the host (cudaMemset)
+  // before this kernel launches
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal) {
+
+    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
+    nx = part2grid[i];
+    ny = part2grid[i + nmax];
+    nz = part2grid[i + 2 * nmax];
+    dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
+    dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
+    dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
+
+    z0 = delxinv * delyinv * delzinv * _q[i];
+
+    for(n = nlower; n <= nupper; n++) {
+      mz = n + nz;
+      y0 = z0 * rho1d(n, dz, rho_coeff);
+
+      for(m = nlower; m <= nupper; m++) {
+        my = m + ny;
+        x0 = y0 * rho1d(m, dy, rho_coeff);
+
+        for(l = nlower; l <= nupper; l++) {
+          mx = l + nx;
+          int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;
+
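+          // density_brick_int[mzyx] doubles as a spin lock: acquire by
+          // incrementing from zero, release with the decrement after the fence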
+          while(atomicAdd(&density_brick_int[mzyx], 1) != 0) atomicAdd(&density_brick_int[mzyx], -1);
+
+          density_brick[mzyx] += x0 * rho1d(l, dx, rho_coeff);
+          __threadfence();
+          atomicAdd(&density_brick_int[mzyx], -1);
+          __syncthreads();
+
+        }
+      }
+    }
+  }
+}
+
+__global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
+{
+  int i, l, m, n, nx, ny, nz, mx, my, mz, a, b;
+
+  // the 3d density array is cleared on the host (cudaMemset in cuda_make_rho)
+  // before this kernel launches
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // int nzxy=blockIdx.x*gridDim.y+blockIdx.y;
+
+  int nelements = nupper - nlower + 1;
+  int* idx = (int*) sharedmem;
+  int* sdensity_brick_int = &idx[blockDim.x];
+  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x];
+
+  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
+    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
+
+  __syncthreads();
+
+  i = blockIdx.x * blockDim.x + threadIdx.x;
+
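+  // disabled reference path: per-value atomicAdd of every stencil element,
+  // kept for comparison with the staged shared-memory path below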
+  if(false) {
+    if(i < nlocal) {
+
+      PPPM_FLOAT dx, dy, dz, x0, y0, z0;
+      nx = part2grid[i];
+      ny = part2grid[i + nmax];
+      nz = part2grid[i + 2 * nmax];
+      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
+      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
+      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
+
+      z0 = delxinv * delyinv * delzinv * _q[i];
+
+      for(n = nlower; n <= nupper; n++) {
+        mz = n + nz;
+        y0 = z0 * rho1d(n, dz, srho_coeff);
+
+        for(m = nlower; m <= nupper; m++) {
+          my = m + ny;
+          x0 = y0 * rho1d(m, dy, srho_coeff);
+
+          for(l = nlower; l <= nupper; l++) {
+            mx = l + nx;
+            int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;
+
+            a = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
+            b = (atomicAdd(&density_brick_int[mzyx], a) | a);
+
+            if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
+              flag[1]++;
+
+              if((b) & (0x60000000)) flag[0]++;
+            }
+
+            __syncthreads();
+          }
+        }
+      }
+    }
+
+    return;
+  }
+
+  i = blockIdx.x * blockDim.x + threadIdx.x;
+  {
+
+    PPPM_FLOAT dx, dy, dz, x0, y0, z0, qtmp;
+
+    if(i < nlocal) {
+      qtmp = _q[i];
+      nx = part2grid[i];
+      ny = part2grid[i + nmax];
+      nz = part2grid[i + 2 * nmax];
+      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
+      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
+      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
+      z0 = delxinv * delyinv * delzinv * qtmp;
+    } else {
+      nx = ny = nz = 1;
+      dx = dy = dz = PPPM_F(0.1);
+      z0 = PPPM_F(0.0); // padding threads: keep z0 defined
+    }
+
+    __syncthreads();
+
+    for(n = nlower; n <= nupper; n++) {
+      mz = n + nz;
+      y0 = z0 * rho1d(n, dz, srho_coeff);
+
+      for(m = nlower; m <= nupper; m++) {
+        my = m + ny;
+        x0 = y0 * rho1d(m, dy, srho_coeff);
+
+        if(i < nlocal) {
+          idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
+
+          for(l = nlower; l <= nupper; l++) {
+            sdensity_brick_int[threadIdx.x * nelements + l - nlower] = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
+          }
+        } else idx[threadIdx.x] = -1;
+
+        __syncthreads();
+
+        for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
+          int kk = threadIdx.x / nelements;
+
+          if((threadIdx.x < nelements * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
+            a = sdensity_brick_int[ii * nelements + threadIdx.x];
+            //if(a*a>1e-100)
+            b = (atomicAdd(&density_brick_int[idx[ii + kk] + threadIdx.x - kk * nelements], a) | a);
+
+            //else
+            //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a);
+            if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
+              flag[1]++;
+
+              if((b) & (0x60000000)) flag[0]++;
+            }
+          }
+        }
+
+        __syncthreads();	   //*/
+      }
+    }
+
+  }
+}
+
+__global__ void scale_rho_kernel()
+{
+  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  density_brick[i] = (PPPM_F(1.0) / density_intScale) * density_brick_int[i];
+}
+
+__global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_same_time, int* flag) //20*x64 0.36
+{
+  int i;
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+  i = blockIdx.x * blockDim.x + threadIdx.x;
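+  // dynamic shared memory layout: blockDim.x brick indices, then staged rows
+  // of the vdx/vdy/vdz bricks (3 * blockDim.x * elements_per_thread values),
+  // followed by the rho_coeff table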
+  int* idx = (int*) sharedmem;
+  PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x];
+  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];
+
+  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
+    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
+
+  __syncthreads();
+  {
+    int l, m, n, nx, ny, nz, my, mz;
+    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
+    PPPM_FLOAT ek[3];
+
+    if(i < nlocal) {
+      nx = part2grid[i];
+      ny = part2grid[i + nmax];
+      nz = part2grid[i + 2 * nmax];
+      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
+      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
+      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
+
+      ek[0] = ek[1] = ek[2] = PPPM_F(0.0);
+    } else {
+      nx = ny = nz = 1;
+      dx = dy = dz = PPPM_F(0.1);
+    }
+
+    __syncthreads();
+
+    for(n = nlower; n <= nupper; n++) {
+      mz = n + nz;
+      z0 = rho1d(n, dz, srho_coeff);
+
+      for(m = nlower; m <= nupper; m++) {
+        my = m + ny;
+        y0 = z0 * rho1d(m, dy, srho_coeff);
+
+
+        if(i < nlocal)
+          idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
+        else idx[threadIdx.x] = -1;
+
+        __syncthreads();
+
+        for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
+          int kk = threadIdx.x / elements_per_thread;
+
+          if((threadIdx.x < elements_per_thread * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
+            tmp_brick[ii * elements_per_thread + threadIdx.x] = vdx_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
+            tmp_brick[(ii + blockDim.x)*elements_per_thread + threadIdx.x] = vdy_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
+            tmp_brick[(ii + 2 * blockDim.x)*elements_per_thread + threadIdx.x] = vdz_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
+          }
+        }
+
+        __syncthreads();
+
+        if(i < nlocal)
+          for(l = nlower; l <= nupper; l++) {
+            x0 = y0 * rho1d(l, dx, srho_coeff);
+
+            ek[0] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower];
+            ek[1] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + blockDim.x * elements_per_thread];
+            ek[2] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + 2 * blockDim.x * elements_per_thread];
+          }
+
+        __syncthreads();
+      }
+    }
+
+    // convert E-field to force; guard against the padding threads with
+    // i >= nlocal, whose ek[] was never initialized
+    if(i < nlocal) {
+      _f[i] += qqrd2e * _q[i] * ek[0];
+      _f[i + nmax] += qqrd2e * _q[i] * ek[1];
+      _f[i + 2 * nmax] += qqrd2e * _q[i] * ek[2];
+    }
+  }
+}
+
+__global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
+{
+  ENERGY_FLOAT* dipole = (ENERGY_FLOAT*) sharedmem;
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal)
+    dipole[threadIdx.x] = _q[i] * _x[i + 2 * nmax];
+  else
+    dipole[threadIdx.x] = ENERGY_F(0.0);
+
+  __syncthreads();
+  reduceBlock(dipole);
+
+  if(threadIdx.x == 0) buf[blockIdx.x] = dipole[0];
+}
+
+__global__ void slabcorr_force_kernel(F_FLOAT ffact)
+{
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if(i < nlocal)
+    _f[i + 2 * nmax] += qqrd2e * _q[i] * ffact;
+}
+
+
+__global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] = in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x) + 1] = 0;
+}
+
+__global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(slabflag) {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      out[2 * (((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  } else {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      out[2 * (((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  }
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    out[2 * ((((blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + (nzhi_out - nzlo_in)) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+}
+
+__global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(blockIdx.y < nylo_in - nylo_out)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + (2 * (nyhi_in + 1) - nylo_in - nyhi_out) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+
+  if(blockIdx.y < nyhi_out - nyhi_in)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + (nyhi_out - nylo_in)) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+}
+
+__global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(threadIdx.x < nxlo_in - nxlo_out)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(threadIdx.x < nxhi_out - nxhi_in)
+    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+}
+
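+// fold the edge regions that are ghost in both y and z at once; the four
+// branches cover the (low/high z) x (low/high y) combinations, using the
+// slab-corrected z destination when slabflag is set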
+__global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(slabflag) {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  } else {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+  }
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nyhi_out - nyhi_in)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nylo_in - nylo_out)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
+}
+
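+// fold the edge regions that are ghost in both x and z, analogous to the
+// yz kernel above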
+__global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(threadIdx.x < nxlo_in - nxlo_out)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(threadIdx.x < nxhi_out - nxhi_in)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+  if(slabflag) {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(threadIdx.x < nxlo_in - nxlo_out)
+        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(threadIdx.x < nxhi_out - nxhi_in)
+        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+  } else {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(threadIdx.x < nxlo_in - nxlo_out)
+        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(threadIdx.x < nxhi_out - nxhi_in)
+        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+  }
+}
+
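+// fold the edge regions that are ghost in both x and y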
+__global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(blockIdx.y < nyhi_out - nyhi_in)
+    if(threadIdx.x < nxlo_in - nxlo_out)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(blockIdx.y < nyhi_out - nyhi_in)
+    if(threadIdx.x < nxhi_out - nxhi_in)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+  if(blockIdx.y < nylo_in - nylo_out)
+    if(threadIdx.x < nxlo_in - nxlo_out)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(blockIdx.y < nylo_in - nylo_out)
+    if(threadIdx.x < nxhi_out - nxhi_in)
+      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+}
+
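+// fold the corner regions that are ghost in all three directions; together
+// with the core, face, and edge kernels above this should account for every
+// ghost region of the brick exactly once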
+__global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
+{
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nyhi_out - nyhi_in)
+      if(threadIdx.x < nxlo_in - nxlo_out)
+        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nyhi_out - nyhi_in)
+      if(threadIdx.x < nxhi_out - nxhi_in)
+        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nylo_in - nylo_out)
+      if(threadIdx.x < nxlo_in - nxlo_out)
+        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+  if(blockIdx.x < nzhi_out - nzhi_in)
+    if(blockIdx.y < nylo_in - nylo_out)
+      if(threadIdx.x < nxhi_out - nxhi_in)
+        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+  if(slabflag) {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        if(threadIdx.x < nxlo_in - nxlo_out)
+          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        if(threadIdx.x < nxhi_out - nxhi_in)
+          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        if(threadIdx.x < nxlo_in - nxlo_out)
+          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        if(threadIdx.x < nxhi_out - nxhi_in)
+          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+  } else {
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        if(threadIdx.x < nxlo_in - nxlo_out)
+          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nyhi_out - nyhi_in)
+        if(threadIdx.x < nxhi_out - nxhi_in)
+          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        if(threadIdx.x < nxlo_in - nxlo_out)
+          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
+
+    if(blockIdx.x < nzlo_in - nzlo_out)
+      if(blockIdx.y < nylo_in - nylo_out)
+        if(threadIdx.x < nxhi_out - nxhi_in)
+          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
+  }
+}